diff --git a/.circleci/config.yml b/.circleci/config.yml index dc4162a0674fdf..6b516b21722acb 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -34,5 +34,5 @@ jobs: command: | export PATH="$MINICONDA_DIR/bin:$PATH" source activate pandas-dev - echo "pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml --skip-slow --skip-network pandas" - pytest --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml --skip-slow --skip-network pandas + echo "pytest -m "not slow and not network" --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas" + pytest -m "not slow and not network" --strict --durations=10 --color=no --junitxml=$CIRCLE_TEST_REPORTS/reports/junit.xml pandas diff --git a/.travis.yml b/.travis.yml index 3217fc5aa1ed6b..03026647d6bb8c 100644 --- a/.travis.yml +++ b/.travis.yml @@ -34,28 +34,28 @@ matrix: include: - dist: trusty env: - - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" TEST_ARGS="--skip-slow --skip-network" + - JOB="3.7" ENV_FILE="ci/deps/travis-37.yaml" PATTERN="not slow and not network" - dist: trusty env: - - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/deps/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" SLOW=true + - JOB="2.7, locale, slow, old NumPy" ENV_FILE="ci/deps/travis-27-locale.yaml" LOCALE_OVERRIDE="zh_CN.UTF-8" PATTERN="slow" addons: apt: packages: - language-pack-zh-hans - dist: trusty env: - - JOB="2.7" ENV_FILE="ci/deps/travis-27.yaml" TEST_ARGS="--skip-slow" + - JOB="2.7" ENV_FILE="ci/deps/travis-27.yaml" PATTERN="not slow" addons: apt: packages: - python-gtk2 - dist: trusty env: - - JOB="3.6, lint, coverage" ENV_FILE="ci/deps/travis-36.yaml" TEST_ARGS="--skip-slow --skip-network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true LINT=true + - JOB="3.6, coverage" ENV_FILE="ci/deps/travis-36.yaml" PATTERN="not slow and not network" PANDAS_TESTING_MODE="deprecate" COVERAGE=true - dist: trusty env: - - JOB="3.7, NumPy dev" ENV_FILE="ci/deps/travis-37-numpydev.yaml" TEST_ARGS="--skip-slow --skip-network -W error" PANDAS_TESTING_MODE="deprecate" + - JOB="3.7, NumPy dev" ENV_FILE="ci/deps/travis-37-numpydev.yaml" PATTERN="not slow and not network" TEST_ARGS="-W error" PANDAS_TESTING_MODE="deprecate" addons: apt: packages: @@ -64,7 +64,7 @@ matrix: # In allow_failures - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" # In allow_failures - dist: trusty @@ -73,7 +73,7 @@ matrix: allow_failures: - dist: trusty env: - - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" SLOW=true + - JOB="3.6, slow" ENV_FILE="ci/deps/travis-36-slow.yaml" PATTERN="slow" - dist: trusty env: - JOB="3.6, doc" ENV_FILE="ci/deps/travis-36-doc.yaml" DOC=true @@ -107,20 +107,15 @@ script: - echo "script start" - source activate pandas-dev - ci/run_build_docs.sh - - ci/script_single.sh - - ci/script_multi.sh - - ci/code_checks.sh - -after_success: - - ci/upload_coverage.sh + - ci/run_tests.sh after_script: - echo "after_script start" - source activate pandas-dev && pushd /tmp && python -c "import pandas; pandas.show_versions();" && popd - if [ -e test-data-single.xml ]; then - ci/print_skipped.py test-data-single.xml; + ci/print_skipped.py test-data-single.xml; fi - if [ -e test-data-multiple.xml ]; then - ci/print_skipped.py test-data-multiple.xml; + ci/print_skipped.py test-data-multiple.xml; fi - echo "after_script done" diff --git a/README.md b/README.md 
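The CI changes above drop the custom ``--skip-slow``/``--skip-network`` pytest flags in favour of standard marker expressions passed through the new ``PATTERN`` variable. A rough sketch of how that selection works (not taken from the patch; the decorator names simply mirror the expressions used in the config, and the real pandas test fixtures may differ): tests opt in to a marker, and ``pytest -m "not slow and not network"`` then deselects them:

    import pytest

    @pytest.mark.slow
    def test_large_groupby():
        # long-running case, deselected by -m "not slow"
        assert sum(range(10 ** 6)) > 0

    @pytest.mark.network
    def test_remote_csv():
        # needs outbound network access, deselected by -m "not network"
        pass

    def test_fast_local():
        # unmarked test, always collected
        assert 1 + 1 == 2

Because the commands above also pass ``--strict``, any marker used this way has to be registered in the project's pytest configuration.
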
index b4dedecb4c6971..1993b1ecb9dc1b 100644 --- a/README.md +++ b/README.md @@ -171,7 +171,7 @@ pip install pandas ``` ## Dependencies -- [NumPy](https://www.numpy.org): 1.9.0 or higher +- [NumPy](https://www.numpy.org): 1.12.0 or higher - [python-dateutil](https://labix.org/python-dateutil): 2.5.0 or higher - [pytz](https://pythonhosted.org/pytz): 2011k or higher diff --git a/asv_bench/benchmarks/binary_ops.py b/asv_bench/benchmarks/binary_ops.py index dfdebec86d67c3..22b8ed80f3d077 100644 --- a/asv_bench/benchmarks/binary_ops.py +++ b/asv_bench/benchmarks/binary_ops.py @@ -52,6 +52,8 @@ def setup(self): np.iinfo(np.int16).max, size=(N, N))) + self.s = Series(np.random.randn(N)) + # Division def time_frame_float_div(self): @@ -74,6 +76,17 @@ def time_frame_int_mod(self): def time_frame_float_mod(self): self.df % self.df2 + # Dot product + + def time_frame_dot(self): + self.df.dot(self.df2) + + def time_series_dot(self): + self.s.dot(self.s) + + def time_frame_series_dot(self): + self.df.dot(self.s) + class Timeseries(object): diff --git a/asv_bench/benchmarks/categoricals.py b/asv_bench/benchmarks/categoricals.py index 8a0fbc48755b57..7318b40efc8fb8 100644 --- a/asv_bench/benchmarks/categoricals.py +++ b/asv_bench/benchmarks/categoricals.py @@ -46,6 +46,8 @@ def setup(self): self.values_some_nan = list(np.tile(self.categories + [np.nan], N)) self.values_all_nan = [np.nan] * len(self.values) self.values_all_int8 = np.ones(N, 'int8') + self.categorical = pd.Categorical(self.values, self.categories) + self.series = pd.Series(self.categorical) def time_regular(self): pd.Categorical(self.values, self.categories) @@ -68,6 +70,12 @@ def time_all_nan(self): def time_from_codes_all_int8(self): pd.Categorical.from_codes(self.values_all_int8, self.categories) + def time_existing_categorical(self): + pd.Categorical(self.categorical) + + def time_existing_series(self): + pd.Categorical(self.series) + class ValueCounts(object): diff --git a/asv_bench/benchmarks/frame_methods.py b/asv_bench/benchmarks/frame_methods.py index b60b45cc29f7d5..527a2f129cf37a 100644 --- a/asv_bench/benchmarks/frame_methods.py +++ b/asv_bench/benchmarks/frame_methods.py @@ -69,6 +69,36 @@ def time_reindex_upcast(self): self.df2.reindex(np.random.permutation(range(1200))) +class Rename(object): + + def setup(self): + N = 10**3 + self.df = DataFrame(np.random.randn(N * 10, N)) + self.idx = np.arange(4 * N, 7 * N) + self.dict_idx = {k: k for k in self.idx} + self.df2 = DataFrame( + {c: {0: np.random.randint(0, 2, N).astype(np.bool_), + 1: np.random.randint(0, N, N).astype(np.int16), + 2: np.random.randint(0, N, N).astype(np.int32), + 3: np.random.randint(0, N, N).astype(np.int64)} + [np.random.randint(0, 4)] for c in range(N)}) + + def time_rename_single(self): + self.df.rename({0: 0}) + + def time_rename_axis0(self): + self.df.rename(self.dict_idx) + + def time_rename_axis1(self): + self.df.rename(columns=self.dict_idx) + + def time_rename_both_axes(self): + self.df.rename(index=self.dict_idx, columns=self.dict_idx) + + def time_dict_rename_both_axes(self): + self.df.rename(index=self.dict_idx, columns=self.dict_idx) + + class Iteration(object): def setup(self): diff --git a/asv_bench/benchmarks/plotting.py b/asv_bench/benchmarks/plotting.py index 1373d5f0b42589..4f0bbb1690d4b9 100644 --- a/asv_bench/benchmarks/plotting.py +++ b/asv_bench/benchmarks/plotting.py @@ -8,17 +8,48 @@ matplotlib.use('Agg') -class Plotting(object): - - def setup(self): - self.s = Series(np.random.randn(1000000)) - self.df = DataFrame({'col': self.s}) 
- - def time_series_plot(self): - self.s.plot() - - def time_frame_plot(self): - self.df.plot() +class SeriesPlotting(object): + params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie']] + param_names = ['kind'] + + def setup(self, kind): + if kind in ['bar', 'barh', 'pie']: + n = 100 + elif kind in ['kde']: + n = 10000 + else: + n = 1000000 + + self.s = Series(np.random.randn(n)) + if kind in ['area', 'pie']: + self.s = self.s.abs() + + def time_series_plot(self, kind): + self.s.plot(kind=kind) + + +class FramePlotting(object): + params = [['line', 'bar', 'area', 'barh', 'hist', 'kde', 'pie', 'scatter', + 'hexbin']] + param_names = ['kind'] + + def setup(self, kind): + if kind in ['bar', 'barh', 'pie']: + n = 100 + elif kind in ['kde', 'scatter', 'hexbin']: + n = 10000 + else: + n = 1000000 + + self.x = Series(np.random.randn(n)) + self.y = Series(np.random.randn(n)) + if kind in ['area', 'pie']: + self.x = self.x.abs() + self.y = self.y.abs() + self.df = DataFrame({'x': self.x, 'y': self.y}) + + def time_frame_plot(self, kind): + self.df.plot(x='x', y='y', kind=kind) class TimeseriesPlotting(object): diff --git a/asv_bench/benchmarks/reshape.py b/asv_bench/benchmarks/reshape.py index 67fdfb82e72c01..e5c2f54263a3cb 100644 --- a/asv_bench/benchmarks/reshape.py +++ b/asv_bench/benchmarks/reshape.py @@ -146,4 +146,42 @@ def time_get_dummies_1d_sparse(self): pd.get_dummies(self.s, sparse=True) +class Cut(object): + params = [[4, 10, 1000]] + param_names = ['bins'] + + def setup(self, bins): + N = 10**5 + self.int_series = pd.Series(np.arange(N).repeat(5)) + self.float_series = pd.Series(np.random.randn(N).repeat(5)) + self.timedelta_series = pd.Series(np.random.randint(N, size=N), + dtype='timedelta64[ns]') + self.datetime_series = pd.Series(np.random.randint(N, size=N), + dtype='datetime64[ns]') + + def time_cut_int(self, bins): + pd.cut(self.int_series, bins) + + def time_cut_float(self, bins): + pd.cut(self.float_series, bins) + + def time_cut_timedelta(self, bins): + pd.cut(self.timedelta_series, bins) + + def time_cut_datetime(self, bins): + pd.cut(self.datetime_series, bins) + + def time_qcut_int(self, bins): + pd.qcut(self.int_series, bins) + + def time_qcut_float(self, bins): + pd.qcut(self.float_series, bins) + + def time_qcut_timedelta(self, bins): + pd.qcut(self.timedelta_series, bins) + + def time_qcut_datetime(self, bins): + pd.qcut(self.datetime_series, bins) + + from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/rolling.py b/asv_bench/benchmarks/rolling.py index 86294e33e1e067..659b6591fbd4b1 100644 --- a/asv_bench/benchmarks/rolling.py +++ b/asv_bench/benchmarks/rolling.py @@ -21,6 +21,42 @@ def time_rolling(self, constructor, window, dtype, method): getattr(self.roll, method)() +class ExpandingMethods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + ['int', 'float'], + ['median', 'mean', 'max', 'min', 'std', 'count', 'skew', 'kurt', + 'sum']) + param_names = ['contructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, dtype, method): + N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.expanding = getattr(pd, constructor)(arr).expanding() + + def time_expanding(self, constructor, dtype, method): + getattr(self.expanding, method)() + + +class EWMMethods(object): + + sample_time = 0.2 + params = (['DataFrame', 'Series'], + [10, 1000], + ['int', 'float'], + ['mean', 'std']) + param_names = ['contructor', 'window', 'dtype', 'method'] + + def setup(self, constructor, window, dtype, method): 
+ N = 10**5 + arr = (100 * np.random.random(N)).astype(dtype) + self.ewm = getattr(pd, constructor)(arr).ewm(halflife=window) + + def time_ewm(self, constructor, window, dtype, method): + getattr(self.ewm, method)() + + class VariableWindowMethods(Methods): sample_time = 0.2 params = (['DataFrame', 'Series'], diff --git a/asv_bench/benchmarks/stat_ops.py b/asv_bench/benchmarks/stat_ops.py index 5c777c00261e1a..66ded52ca35b2a 100644 --- a/asv_bench/benchmarks/stat_ops.py +++ b/asv_bench/benchmarks/stat_ops.py @@ -96,14 +96,42 @@ def time_average_old(self, constructor, pct): class Correlation(object): - params = ['spearman', 'kendall', 'pearson'] - param_names = ['method'] + params = [['spearman', 'kendall', 'pearson'], [True, False]] + param_names = ['method', 'use_bottleneck'] - def setup(self, method): + def setup(self, method, use_bottleneck): + try: + pd.options.compute.use_bottleneck = use_bottleneck + except TypeError: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck self.df = pd.DataFrame(np.random.randn(1000, 30)) + self.s = pd.Series(np.random.randn(1000)) + self.s2 = pd.Series(np.random.randn(1000)) - def time_corr(self, method): + def time_corr(self, method, use_bottleneck): self.df.corr(method=method) + def time_corr_series(self, method, use_bottleneck): + self.s.corr(self.s2, method=method) + + +class Covariance(object): + + params = [[True, False]] + param_names = ['use_bottleneck'] + + def setup(self, use_bottleneck): + try: + pd.options.compute.use_bottleneck = use_bottleneck + except TypeError: + from pandas.core import nanops + nanops._USE_BOTTLENECK = use_bottleneck + self.s = pd.Series(np.random.randn(100000)) + self.s2 = pd.Series(np.random.randn(100000)) + + def time_cov_series(self, use_bottleneck): + self.s.cov(self.s2) + from .pandas_vb_common import setup # noqa: F401 diff --git a/asv_bench/benchmarks/strings.py b/asv_bench/benchmarks/strings.py index d880fb258560d1..e9f2727f64e15c 100644 --- a/asv_bench/benchmarks/strings.py +++ b/asv_bench/benchmarks/strings.py @@ -26,21 +26,42 @@ def time_extract(self): def time_findall(self): self.s.str.findall('[A-Z]+') + def time_find(self): + self.s.str.find('[A-Z]+') + + def time_rfind(self): + self.s.str.rfind('[A-Z]+') + def time_get(self): self.s.str.get(0) def time_len(self): self.s.str.len() + def time_join(self): + self.s.str.join(' ') + def time_match(self): self.s.str.match('A') + def time_normalize(self): + self.s.str.normalize('NFC') + def time_pad(self): self.s.str.pad(100, side='both') + def time_partition(self): + self.s.str.partition('A') + + def time_rpartition(self): + self.s.str.rpartition('A') + def time_replace(self): self.s.str.replace('A', '\x01\x01') + def time_translate(self): + self.s.str.translate({'A': '\x01\x01'}) + def time_slice(self): self.s.str.slice(5, 15, 2) @@ -65,6 +86,12 @@ def time_upper(self): def time_lower(self): self.s.str.lower() + def time_wrap(self): + self.s.str.wrap(10) + + def time_zfill(self): + self.s.str.zfill(10) + class Repeat(object): @@ -129,6 +156,9 @@ def setup(self, expand): def time_split(self, expand): self.s.str.split('--', expand=expand) + def time_rsplit(self, expand): + self.s.str.rsplit('--', expand=expand) + class Dummies(object): diff --git a/asv_bench/benchmarks/timedelta.py b/asv_bench/benchmarks/timedelta.py index 01d53fb9cbbd99..7ee73fb7ac7b65 100644 --- a/asv_bench/benchmarks/timedelta.py +++ b/asv_bench/benchmarks/timedelta.py @@ -1,7 +1,8 @@ import datetime import numpy as np -from pandas import Series, timedelta_range, 
to_timedelta, Timestamp, Timedelta +from pandas import Series, timedelta_range, to_timedelta, Timestamp, \ + Timedelta, TimedeltaIndex, DataFrame class TimedeltaConstructor(object): @@ -116,3 +117,36 @@ def time_timedelta_microseconds(self, series): def time_timedelta_nanoseconds(self, series): series.dt.nanoseconds + + +class TimedeltaIndexing(object): + + def setup(self): + self.index = TimedeltaIndex(start='1985', periods=1000, freq='D') + self.index2 = TimedeltaIndex(start='1986', periods=1000, freq='D') + self.series = Series(range(1000), index=self.index) + self.timedelta = self.index[500] + + def time_get_loc(self): + self.index.get_loc(self.timedelta) + + def time_shape(self): + self.index.shape + + def time_shallow_copy(self): + self.index._shallow_copy() + + def time_series_loc(self): + self.series.loc[self.timedelta] + + def time_align(self): + DataFrame({'a': self.series, 'b': self.series[:500]}) + + def time_intersection(self): + self.index.intersection(self.index2) + + def time_union(self): + self.index.union(self.index2) + + def time_unique(self): + self.index.unique() diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 373c22fdf8e629..a58f82ec6de49f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -23,3 +23,104 @@ jobs: parameters: name: WindowsPy27 vmImage: vs2017-win2016 + +- job: 'Checks_and_doc' + pool: + vmImage: ubuntu-16.04 + timeoutInMinutes: 90 + steps: + - script: | + # XXX next command should avoid redefining the path in every step, but + # made the process crash as it couldn't find deactivate + #echo '##vso[task.prependpath]$HOME/miniconda3/bin' + echo '##vso[task.setvariable variable=CONDA_ENV]pandas-dev' + echo '##vso[task.setvariable variable=ENV_FILE]environment.yml' + echo '##vso[task.setvariable variable=AZURE]true' + displayName: 'Setting environment variables' + + # Do not require a conda environment + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + ci/code_checks.sh patterns + displayName: 'Looking for unwanted patterns' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + sudo apt-get install -y libc6-dev-i386 + ci/incremental/install_miniconda.sh + ci/incremental/setup_conda_environment.sh + displayName: 'Set up environment' + + # Do not require pandas + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh lint + displayName: 'Linting' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh dependencies + displayName: 'Dependencies consistency' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/incremental/build.sh + displayName: 'Build' + condition: true + + # Require pandas + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh code + displayName: 'Checks on imported code' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh doctests + displayName: 'Running doctests' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + ci/code_checks.sh docstrings + displayName: 'Docstring validation' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + source activate pandas-dev + pytest --capture=no --strict scripts + displayName: 'Testing docstring validaton script' + condition: true + + - script: | + export PATH=$HOME/miniconda3/bin:$PATH + 
source activate pandas-dev + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + if git diff upstream/master --name-only | grep -q "^asv_bench/"; then + cd asv_bench + asv machine --yes + ASV_OUTPUT="$(asv dev)" + if [[ $(echo "$ASV_OUTPUT" | grep "failed") ]]; then + echo "##vso[task.logissue type=error]Benchmarks run with errors" + echo $ASV_OUTPUT + exit 1 + else + echo "Benchmarks run without errors" + fi + else + echo "Benchmarks did not run, no changes detected" + fi + displayName: 'Running benchmarks' + condition: true diff --git a/ci/README.txt b/ci/README.txt deleted file mode 100644 index bb71dc25d60932..00000000000000 --- a/ci/README.txt +++ /dev/null @@ -1,17 +0,0 @@ -Travis is a ci service that's well-integrated with GitHub. -The following types of breakage should be detected -by Travis builds: - -1) Failing tests on any supported version of Python. -2) Pandas should install and the tests should run if no optional deps are installed. -That also means tests which rely on optional deps need to raise SkipTest() -if the dep is missing. -3) unicode related fails when running under exotic locales. - -We tried running the vbench suite for a while, but with varying load -on Travis machines, that wasn't useful. - -Travis currently (4/2013) has a 5-job concurrency limit. Exceeding it -basically doubles the total runtime for a commit through travis, and -since dep+pandas installation is already quite long, this should become -a hard limit on concurrent travis runs. diff --git a/ci/azure/linux.yml b/ci/azure/linux.yml index a773a06c193d45..fe64307e9d08f9 100644 --- a/ci/azure/linux.yml +++ b/ci/azure/linux.yml @@ -9,21 +9,21 @@ jobs: strategy: maxParallel: 11 matrix: - py27_np_19: + py27_np_120: ENV_FILE: ci/deps/azure-27-compat.yaml CONDA_PY: "27" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" py37_locale: ENV_FILE: ci/deps/azure-37-locale.yaml CONDA_PY: "37" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" LOCALE_OVERRIDE: "zh_CN.UTF-8" py36_locale_slow: ENV_FILE: ci/deps/azure-36-locale_slow.yaml CONDA_PY: "36" - TEST_ARGS: "--only-slow --skip-network" + PATTERN: "not slow and not network" LOCALE_OVERRIDE: "it_IT.UTF-8" steps: @@ -43,9 +43,7 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev - ci/script_single.sh - ci/script_multi.sh - echo "[Test done]" + ci/run_tests.sh displayName: 'Test' - script: | export PATH=$HOME/miniconda3/bin:$PATH diff --git a/ci/azure/macos.yml b/ci/azure/macos.yml index d537f0c70cbec8..98409576a5a875 100644 --- a/ci/azure/macos.yml +++ b/ci/azure/macos.yml @@ -12,7 +12,7 @@ jobs: py35_np_120: ENV_FILE: ci/deps/azure-macos-35.yaml CONDA_PY: "35" - TEST_ARGS: "--skip-slow --skip-network" + PATTERN: "not slow and not network" steps: - script: | @@ -31,9 +31,7 @@ jobs: - script: | export PATH=$HOME/miniconda3/bin:$PATH source activate pandas-dev - ci/script_single.sh - ci/script_multi.sh - echo "[Test done]" + ci/run_tests.sh displayName: 'Test' - script: | export PATH=$HOME/miniconda3/bin:$PATH diff --git a/ci/azure/windows-py27.yml b/ci/azure/windows-py27.yml index ac918f3becd2eb..0d9aea816c4adc 100644 --- a/ci/azure/windows-py27.yml +++ b/ci/azure/windows-py27.yml @@ -37,7 +37,7 @@ jobs: displayName: 'Build' - script: | call activate pandas-dev - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %* + pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 
-r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: diff --git a/ci/azure/windows.yml b/ci/azure/windows.yml index f0ebba509e4419..b69c210ca27ba4 100644 --- a/ci/azure/windows.yml +++ b/ci/azure/windows.yml @@ -28,7 +28,7 @@ jobs: displayName: 'Build' - script: | call activate pandas-dev - pytest --junitxml=test-data.xml --skip-slow --skip-network pandas -n 2 -r sxX --strict --durations=10 %* + pytest -m "not slow and not network" --junitxml=test-data.xml pandas -n 2 -r sxX --strict --durations=10 %* displayName: 'Test' - task: PublishTestResults@2 inputs: diff --git a/ci/code_checks.sh b/ci/code_checks.sh index 45cb1708258d73..a8a86eedb05490 100755 --- a/ci/code_checks.sh +++ b/ci/code_checks.sh @@ -5,25 +5,48 @@ # This script is intended for both the CI and to check locally that code standards are # respected. We are currently linting (PEP-8 and similar), looking for patterns of # common mistakes (sphinx directives with missing blank lines, old style classes, -# unwanted imports...), and we also run doctests here (currently some files only). -# In the future we may want to add the validation of docstrings and other checks here. +# unwanted imports...), we run doctests here (currently some files only), and we +# validate formatting error in docstrings. # # Usage: # $ ./ci/code_checks.sh # run all checks # $ ./ci/code_checks.sh lint # run linting only # $ ./ci/code_checks.sh patterns # check for patterns that should not exist +# $ ./ci/code_checks.sh code # checks on imported code # $ ./ci/code_checks.sh doctests # run doctests +# $ ./ci/code_checks.sh docstrings # validate docstring errors # $ ./ci/code_checks.sh dependencies # check that dependencies are consistent -echo "inside $0" -[[ $LINT ]] || { echo "NOT Linting. To lint use: LINT=true $0 $1"; exit 0; } -[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "doctests" || "$1" == "dependencies" ]] \ - || { echo "Unknown command $1. Usage: $0 [lint|patterns|doctests|dependencies]"; exit 9999; } +[[ -z "$1" || "$1" == "lint" || "$1" == "patterns" || "$1" == "code" || "$1" == "doctests" || "$1" == "docstrings" || "$1" == "dependencies" ]] || \ + { echo "Unknown command $1. Usage: $0 [lint|patterns|code|doctests|docstrings|dependencies]"; exit 9999; } BASE_DIR="$(dirname $0)/.." RET=0 CHECK=$1 +function invgrep { + # grep with inverse exist status and formatting for azure-pipelines + # + # This function works exactly as grep, but with opposite exit status: + # - 0 (success) when no patterns are found + # - 1 (fail) when the patterns are found + # + # This is useful for the CI, as we want to fail if one of the patterns + # that we want to avoid is found by grep. + if [[ "$AZURE" == "true" ]]; then + set -o pipefail + grep -n "$@" | awk -F ":" '{print "##vso[task.logissue type=error;sourcepath=" $1 ";linenumber=" $2 ";] Found unwanted pattern: " $3}' + else + grep "$@" + fi + return $((! $?)) +} + +if [[ "$AZURE" == "true" ]]; then + FLAKE8_FORMAT="##vso[task.logissue type=error;sourcepath=%(path)s;linenumber=%(row)s;columnnumber=%(col)s;code=%(code)s;]%(text)s" +else + FLAKE8_FORMAT="default" +fi ### LINTING ### if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then @@ -35,22 +58,22 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # pandas/_libs/src is C code, so no need to search there. MSG='Linting .py code' ; echo $MSG - flake8 . + flake8 --format="$FLAKE8_FORMAT" . 
RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Linting .pyx code' ; echo $MSG - flake8 pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403,C400,C401,C402,C403,C404,C405,C406,C407,C408,C409,C410,C411 + flake8 --format="$FLAKE8_FORMAT" pandas --filename=*.pyx --select=E501,E302,E203,E111,E114,E221,E303,E128,E231,E126,E265,E305,E301,E127,E261,E271,E129,W291,E222,E241,E123,F403,C400,C401,C402,C403,C404,C405,C406,C407,C408,C409,C410,C411 RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Linting .pxd and .pxi.in' ; echo $MSG - flake8 pandas/_libs --filename=*.pxi.in,*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 + flake8 --format="$FLAKE8_FORMAT" pandas/_libs --filename=*.pxi.in,*.pxd --select=E501,E302,E203,E111,E114,E221,E303,E231,E126,F403 RET=$(($RET + $?)) ; echo $MSG "DONE" echo "flake8-rst --version" flake8-rst --version MSG='Linting code-blocks in .rst documentation' ; echo $MSG - flake8-rst doc/source --filename=*.rst + flake8-rst doc/source --filename=*.rst --format="$FLAKE8_FORMAT" RET=$(($RET + $?)) ; echo $MSG "DONE" # Check that cython casting is of the form `obj` as opposed to ` obj`; @@ -58,7 +81,7 @@ if [[ -z "$CHECK" || "$CHECK" == "lint" ]]; then # Note: this grep pattern is (intended to be) equivalent to the python # regex r'(?])> ' MSG='Linting .pyx code for spacing conventions in casting' ; echo $MSG - ! grep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs + invgrep -r -E --include '*.pyx' --include '*.pxi.in' '[a-zA-Z0-9*]> ' pandas/_libs RET=$(($RET + $?)) ; echo $MSG "DONE" # readability/casting: Warnings about C casting instead of C++ casting @@ -88,43 +111,48 @@ if [[ -z "$CHECK" || "$CHECK" == "patterns" ]]; then # Check for imports from pandas.core.common instead of `import pandas.core.common as com` MSG='Check for non-standard imports' ; echo $MSG - ! grep -R --include="*.py*" -E "from pandas.core.common import " pandas + invgrep -R --include="*.py*" -E "from pandas.core.common import " pandas RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for pytest warns' ; echo $MSG - ! grep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ + invgrep -r -E --include '*.py' 'pytest\.warns' pandas/tests/ RET=$(($RET + $?)) ; echo $MSG "DONE" # Check for the following code in testing: `np.testing` and `np.array_equal` MSG='Check for invalid testing' ; echo $MSG - ! grep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ + invgrep -r -E --include '*.py' --exclude testing.py '(numpy|np)(\.testing|\.array_equal)' pandas/tests/ RET=$(($RET + $?)) ; echo $MSG "DONE" # Check for the following code in the extension array base tests: `tm.assert_frame_equal` and `tm.assert_series_equal` MSG='Check for invalid EA testing' ; echo $MSG - ! grep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base + invgrep -r -E --include '*.py' --exclude base.py 'tm.assert_(series|frame)_equal' pandas/tests/extension/base RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for deprecated messages without sphinx directive' ; echo $MSG - ! grep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas + invgrep -R --include="*.py" --include="*.pyx" -E "(DEPRECATED|DEPRECATE|Deprecated)(:|,|\.)" pandas RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for old-style classes' ; echo $MSG - ! 
grep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts + invgrep -R --include="*.py" -E "class\s\S*[^)]:" pandas scripts RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for backticks incorrectly rendering because of missing spaces' ; echo $MSG - ! grep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ + invgrep -R --include="*.rst" -E "[a-zA-Z0-9]\`\`?[a-zA-Z0-9]" doc/source/ RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check for incorrect sphinx directives' ; echo $MSG - ! grep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. (autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source + invgrep -R --include="*.py" --include="*.pyx" --include="*.rst" -E "\.\. (autosummary|contents|currentmodule|deprecated|function|image|important|include|ipython|literalinclude|math|module|note|raw|seealso|toctree|versionadded|versionchanged|warning):[^:]" ./pandas ./doc/source RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Check that the deprecated `assert_raises_regex` is not used (`pytest.raises(match=pattern)` should be used instead)' ; echo $MSG - ! grep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas + invgrep -R --exclude=*.pyc --exclude=testing.py --exclude=test_testing.py assert_raises_regex pandas RET=$(($RET + $?)) ; echo $MSG "DONE" +fi + +### CODE ### +if [[ -z "$CHECK" || "$CHECK" == "code" ]]; then + MSG='Check for modules that pandas should not import' ; echo $MSG python -c " import sys @@ -135,7 +163,7 @@ blacklist = {'bs4', 'gcsfs', 'html5lib', 'ipython', 'jinja2' 'hypothesis', 'tables', 'xlrd', 'xlsxwriter', 'xlwt'} mods = blacklist & set(m.split('.')[0] for m in sys.modules) if mods: - sys.stderr.write('pandas should not import: {}\n'.format(', '.join(mods))) + sys.stderr.write('err: pandas should not import: {}\n'.format(', '.join(mods))) sys.exit(len(mods)) " RET=$(($RET + $?)) ; echo $MSG "DONE" @@ -147,7 +175,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests frame.py' ; echo $MSG pytest -q --doctest-modules pandas/core/frame.py \ - -k"-axes -combine -itertuples -join -pivot_table -quantile -query -reindex -reindex_axis -replace -round -set_index -stack" + -k"-axes -combine -itertuples -join -pivot_table -quantile -query -reindex -reindex_axis -round" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests series.py' ; echo $MSG @@ -157,7 +185,7 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then MSG='Doctests generic.py' ; echo $MSG pytest -q --doctest-modules pandas/core/generic.py \ - -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs" + -k"-_set_axis_name -_xs -describe -droplevel -groupby -interpolate -pct_change -pipe -reindex -reindex_axis -to_json -transpose -values -xs -to_clipboard" RET=$(($RET + $?)) ; echo $MSG "DONE" MSG='Doctests top-level reshaping functions' ; echo $MSG @@ -178,11 +206,22 @@ if [[ -z "$CHECK" || "$CHECK" == "doctests" ]]; then fi +### DOCSTRINGS ### +if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then + + MSG='Validate docstrings (GL06, SS04, PR03, PR05, EX04)' ; echo $MSG + $BASE_DIR/scripts/validate_docstrings.py --format=azure --errors=GL06,SS04,PR03,PR05,EX04 + RET=$(($RET + $?)) ; echo $MSG "DONE" + +fi + ### DEPENDENCIES ### if [[ -z "$CHECK" || "$CHECK" == "dependencies" ]]; then + MSG='Check that 
requirements-dev.txt has been generated from environment.yml' ; echo $MSG - $BASE_DIR/scripts/generate_pip_deps_from_conda.py --compare + $BASE_DIR/scripts/generate_pip_deps_from_conda.py --compare --azure RET=$(($RET + $?)) ; echo $MSG "DONE" + fi exit $RET diff --git a/ci/deps/travis-36.yaml b/ci/deps/travis-36.yaml index 1781f67041f44a..bfd69652730ed2 100644 --- a/ci/deps/travis-36.yaml +++ b/ci/deps/travis-36.yaml @@ -7,16 +7,9 @@ dependencies: - cython>=0.28.2 - dask - fastparquet - - flake8>=3.5 - - flake8-comprehensions - - flake8-rst=0.4.2 - gcsfs - geopandas - html5lib - - ipython - - isort - - jinja2 - - lxml - matplotlib - nomkl - numexpr @@ -32,7 +25,6 @@ dependencies: - s3fs - scikit-learn - scipy - - seaborn - sqlalchemy - statsmodels - xarray @@ -48,6 +40,5 @@ dependencies: - pip: - brotlipy - coverage - - cpplint - pandas-datareader - python-dateutil diff --git a/ci/print_versions.py b/ci/print_versions.py deleted file mode 100755 index a2c93748b03881..00000000000000 --- a/ci/print_versions.py +++ /dev/null @@ -1,29 +0,0 @@ -#!/usr/bin/env python - - -def show_versions(as_json=False): - import imp - import os - fn = __file__ - this_dir = os.path.dirname(fn) - pandas_dir = os.path.abspath(os.path.join(this_dir, "..")) - sv_path = os.path.join(pandas_dir, 'pandas', 'util') - mod = imp.load_module( - 'pvmod', *imp.find_module('print_versions', [sv_path])) - return mod.show_versions(as_json) - - -if __name__ == '__main__': - # optparse is 2.6-safe - from optparse import OptionParser - parser = OptionParser() - parser.add_option("-j", "--json", metavar="FILE", nargs=1, - help="Save output as JSON into file, " - "pass in '-' to output to stdout") - - (options, args) = parser.parse_args() - - if options.json == "-": - options.json = True - - show_versions(as_json=options.json) diff --git a/ci/run_tests.sh b/ci/run_tests.sh new file mode 100755 index 00000000000000..77efc60a8cf972 --- /dev/null +++ b/ci/run_tests.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +if [ "$DOC" ]; then + echo "We are not running pytest as this is a doc-build" + exit 0 +fi + +# Workaround for pytest-xdist flaky collection order +# https://github.com/pytest-dev/pytest/issues/920 +# https://github.com/pytest-dev/pytest/issues/1075 +export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') + +if [ -n "$LOCALE_OVERRIDE" ]; then + export LC_ALL="$LOCALE_OVERRIDE" + export LANG="$LOCALE_OVERRIDE" + PANDAS_LOCALE=`python -c 'import pandas; pandas.get_option("display.encoding")'` + if [[ "$LOCALE_OVERIDE" != "$PANDAS_LOCALE" ]]; then + echo "pandas could not detect the locale. System locale: $LOCALE_OVERRIDE, pandas detected: $PANDAS_LOCALE" + # TODO Not really aborting the tests until https://github.com/pandas-dev/pandas/issues/23923 is fixed + # exit 1 + fi +fi +if [[ "not network" == *"$PATTERN"* ]]; then + export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; +fi + + +if [ -n "$PATTERN" ]; then + PATTERN=" and $PATTERN" +fi + +for TYPE in single multiple +do + if [ "$COVERAGE" ]; then + COVERAGE_FNAME="/tmp/coc-$TYPE.xml" + COVERAGE="-s --cov=pandas --cov-report=xml:$COVERAGE_FNAME" + fi + + TYPE_PATTERN=$TYPE + NUM_JOBS=1 + if [[ "$TYPE_PATTERN" == "multiple" ]]; then + TYPE_PATTERN="not single" + NUM_JOBS=2 + fi + + pytest -m "$TYPE_PATTERN$PATTERN" -n $NUM_JOBS -s --strict --durations=10 --junitxml=test-data-$TYPE.xml $TEST_ARGS $COVERAGE pandas + + if [[ "$COVERAGE" && $? 
== 0 ]]; then + echo "uploading coverage for $TYPE tests" + bash <(curl -s https://codecov.io/bash) -Z -c -F $TYPE -f $COVERAGE_FNAME + fi +done diff --git a/ci/script_multi.sh b/ci/script_multi.sh deleted file mode 100755 index fba0c7ba19dd4b..00000000000000 --- a/ci/script_multi.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -e - -echo "[script multi]" - -if [ -n "$LOCALE_OVERRIDE" ]; then - export LC_ALL="$LOCALE_OVERRIDE"; - export LANG="$LOCALE_OVERRIDE"; - echo "Setting LC_ALL to $LOCALE_OVERRIDE" - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -# Workaround for pytest-xdist flaky collection order -# https://github.com/pytest-dev/pytest/issues/920 -# https://github.com/pytest-dev/pytest/issues/1075 -export PYTHONHASHSEED=$(python -c 'import random; print(random.randint(1, 4294967295))') -echo PYTHONHASHSEED=$PYTHONHASHSEED - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -s -n 2 -m "not single" --durations=10 --cov=pandas --cov-report xml:/tmp/cov-multiple.xml --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - -elif [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" - # The `-m " and slow"` is redundant here, as `--only-slow` is already used (via $TEST_ARGS). But is needed, because with - # `--only-slow` fast tests are skipped, but each of them is printed in the log (which can be avoided with `-q`), - # and also added to `test-data-multiple.xml`, and then printed in the log in the call to `ci/print_skipped.py`. - # Printing them to the log makes the log exceed the maximum size allowed by Travis and makes the build fail. - echo pytest -n 2 -m "not single and slow" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single and slow" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - -else - echo pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas - pytest -n 2 -m "not single" --durations=10 --junitxml=test-data-multiple.xml --strict $TEST_ARGS pandas # TODO: doctest - -fi - -RET="$?" 
- -exit "$RET" diff --git a/ci/script_single.sh b/ci/script_single.sh deleted file mode 100755 index cbbb7a49541c21..00000000000000 --- a/ci/script_single.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/bin/bash - -echo "[script_single]" - -if [ -n "$LOCALE_OVERRIDE" ]; then - echo "Setting LC_ALL and LANG to $LOCALE_OVERRIDE" - export LC_ALL="$LOCALE_OVERRIDE"; - export LANG="$LOCALE_OVERRIDE"; - - pycmd='import pandas; print("pandas detected console encoding: %s" % pandas.get_option("display.encoding"))' - python -c "$pycmd" -fi - -if [ "$SLOW" ]; then - TEST_ARGS="--only-slow --skip-network" -fi - -# Enforce absent network during testing by faking a proxy -if echo "$TEST_ARGS" | grep -e --skip-network -q; then - export http_proxy=http://1.2.3.4 https_proxy=http://1.2.3.4; -fi - -if [ "$DOC" ]; then - echo "We are not running pytest as this is a doc-build" - -elif [ "$COVERAGE" ]; then - echo pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas - pytest -s -m "single" --durations=10 --strict --cov=pandas --cov-report xml:/tmp/cov-single.xml --junitxml=test-data-single.xml $TEST_ARGS pandas - echo pytest -s --strict scripts - pytest -s --strict scripts -else - echo pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas - pytest -m "single" --durations=10 --junitxml=test-data-single.xml --strict $TEST_ARGS pandas - -fi - -RET="$?" - -exit "$RET" diff --git a/ci/upload_coverage.sh b/ci/upload_coverage.sh deleted file mode 100755 index 88aca20590505b..00000000000000 --- a/ci/upload_coverage.sh +++ /dev/null @@ -1,11 +0,0 @@ -#!/bin/bash - -if [ -z "$COVERAGE" ]; then - echo "coverage is not selected for this build" - exit 0 -fi - - -echo "uploading coverage" -bash <(curl -s https://codecov.io/bash) -Z -c -F single -f /tmp/cov-single.xml -bash <(curl -s https://codecov.io/bash) -Z -c -F multiple -f /tmp/cov-multiple.xml diff --git a/doc/README.rst b/doc/README.rst index 12950d323f5d34..a11ed8d9d03e30 100644 --- a/doc/README.rst +++ b/doc/README.rst @@ -1,173 +1 @@ -.. _contributing.docs: - -Contributing to the documentation -================================= - -Whether you are someone who loves writing, teaching, or development, -contributing to the documentation is a huge value. If you don't see yourself -as a developer type, please don't stress and know that we want you to -contribute. You don't even have to be an expert on *pandas* to do so! -Something as simple as rewriting small passages for clarity -as you reference the docs is a simple but effective way to contribute. The -next person to read that passage will be in your debt! - -Actually, there are sections of the docs that are worse off by being written -by experts. If something in the docs doesn't make sense to you, updating the -relevant section after you figure it out is a simple way to ensure it will -help the next person. - -.. contents:: Table of contents: - :local: - - -About the pandas documentation ------------------------------- - -The documentation is written in **reStructuredText**, which is almost like writing -in plain English, and built using `Sphinx `__. The -Sphinx Documentation has an excellent `introduction to reST -`__. Review the Sphinx docs to perform more -complex changes to the documentation as well. - -Some other important things to know about the docs: - -- The pandas documentation consists of two parts: the docstrings in the code - itself and the docs in this folder ``pandas/doc/``. 
- - The docstrings provide a clear explanation of the usage of the individual - functions, while the documentation in this folder consists of tutorial-like - overviews per topic together with some other information (what's new, - installation, etc). - -- The docstrings follow the **Numpy Docstring Standard** which is used widely - in the Scientific Python community. This standard specifies the format of - the different sections of the docstring. See `this document - `_ - for a detailed explanation, or look at some of the existing functions to - extend it in a similar manner. - -- The tutorials make heavy use of the `ipython directive - `_ sphinx extension. - This directive lets you put code in the documentation which will be run - during the doc build. For example: - - :: - - .. ipython:: python - - x = 2 - x**3 - - will be rendered as - - :: - - In [1]: x = 2 - - In [2]: x**3 - Out[2]: 8 - - This means that almost all code examples in the docs are always run (and the - output saved) during the doc build. This way, they will always be up to date, - but it makes the doc building a bit more complex. - - -How to build the pandas documentation -------------------------------------- - -Requirements -^^^^^^^^^^^^ - -To build the pandas docs there are some extra requirements: you will need to -have ``sphinx`` and ``ipython`` installed. `numpydoc -`_ is used to parse the docstrings that -follow the Numpy Docstring Standard (see above), but you don't need to install -this because a local copy of ``numpydoc`` is included in the pandas source -code. `nbsphinx `_ is used to convert -Jupyter notebooks. You will need to install it if you intend to modify any of -the notebooks included in the documentation. - -Furthermore, it is recommended to have all `optional dependencies -`_ -installed. This is not needed, but be aware that you will see some error -messages. Because all the code in the documentation is executed during the doc -build, the examples using this optional dependencies will generate errors. -Run ``pd.show_versions()`` to get an overview of the installed version of all -dependencies. - -.. warning:: - - Sphinx version >= 1.2.2 or the older 1.1.3 is required. - -Building pandas -^^^^^^^^^^^^^^^ - -For a step-by-step overview on how to set up your environment, to work with -the pandas code and git, see `the developer pages -`_. -When you start to work on some docs, be sure to update your code to the latest -development version ('master'):: - - git fetch upstream - git rebase upstream/master - -Often it will be necessary to rebuild the C extension after updating:: - - python setup.py build_ext --inplace - -Building the documentation -^^^^^^^^^^^^^^^^^^^^^^^^^^ - -So how do you build the docs? Navigate to your local folder -``pandas/doc/`` directory in the console and run:: - - python make.py html - -And then you can find the html output in the folder ``pandas/doc/build/html/``. - -The first time it will take quite a while, because it has to run all the code -examples in the documentation and build all generated docstring pages. -In subsequent evocations, sphinx will try to only build the pages that have -been modified. - -If you want to do a full clean build, do:: - - python make.py clean - python make.py build - - -Starting with 0.13.1 you can tell ``make.py`` to compile only a single section -of the docs, greatly reducing the turn-around time for checking your changes. 
-You will be prompted to delete `.rst` files that aren't required, since the -last committed version can always be restored from git. - -:: - - #omit autosummary and API section - python make.py clean - python make.py --no-api - - # compile the docs with only a single - # section, that which is in indexing.rst - python make.py clean - python make.py --single indexing - -For comparison, a full doc build may take 10 minutes. a ``-no-api`` build -may take 3 minutes and a single section may take 15 seconds. - -Where to start? ---------------- - -There are a number of issues listed under `Docs -`_ -and `good first issue -`_ -where you could start out. - -Or maybe you have an idea of your own, by using pandas, looking for something -in the documentation and thinking 'this can be improved', let's do something -about that! - -Feel free to ask questions on `mailing list -`_ or submit an -issue on Github. +See `contributing.rst `_ in this repo. diff --git a/doc/source/10min.rst b/doc/source/10min.rst index b5938a24ce6c50..e04a8253e0bef1 100644 --- a/doc/source/10min.rst +++ b/doc/source/10min.rst @@ -5,19 +5,19 @@ .. ipython:: python :suppress: + import os import numpy as np + import pandas as pd - import os + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import matplotlib - # matplotlib.style.use('default') pd.options.display.max_rows = 15 - #### portions of this were borrowed from the - #### Pandas cheatsheet - #### created during the PyData Workshop-Sprint 2012 - #### Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello + # portions of this were borrowed from the + # Pandas cheatsheet + # created during the PyData Workshop-Sprint 2012 + # Hannah Chen, Henry Chow, Eric Cox, Robert Mauriello ******************** @@ -31,9 +31,8 @@ Customarily, we import as follows: .. ipython:: python - import pandas as pd import numpy as np - import matplotlib.pyplot as plt + import pandas as pd Object Creation --------------- @@ -55,7 +54,7 @@ and labeled columns: dates = pd.date_range('20130101', periods=6) dates - df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list('ABCD')) + df = pd.DataFrame(np.random.randn(6, 4), index=dates, columns=list('ABCD')) df Creating a ``DataFrame`` by passing a dict of objects that can be converted to series-like. @@ -64,7 +63,7 @@ Creating a ``DataFrame`` by passing a dict of objects that can be converted to s df2 = pd.DataFrame({'A': 1., 'B': pd.Timestamp('20130102'), - 'C': pd.Series(1, index=list(range(4)),dtype='float32'), + 'C': pd.Series(1, index=list(range(4)), dtype='float32'), 'D': np.array([3] * 4, dtype='int32'), 'E': pd.Categorical(["test", "train", "test", "train"]), 'F': 'foo'}) @@ -114,13 +113,40 @@ Here is how to view the top and bottom rows of the frame: df.head() df.tail(3) -Display the index, columns, and the underlying NumPy data: +Display the index, columns: .. ipython:: python df.index df.columns - df.values + +:meth:`DataFrame.to_numpy` gives a NumPy representation of the underlying data. +Note that his can be an expensive operation when your :class:`DataFrame` has +columns with different data types, which comes down to a fundamental difference +between pandas and NumPy: **NumPy arrays have one dtype for the entire array, +while pandas DataFrames have one dtype per column**. When you call +:meth:`DataFrame.to_numpy`, pandas will find the NumPy dtype that can hold *all* +of the dtypes in the DataFrame. This may end up being ``object``, which requires +casting every value to a Python object. 
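The upcasting behaviour described in this new passage is easy to verify locally; a minimal sketch, assuming pandas 0.24 or later where ``to_numpy`` is available:

    import numpy as np
    import pandas as pd

    # All-float frame: to_numpy() keeps float64 and is cheap.
    df = pd.DataFrame(np.random.randn(3, 4), columns=list("ABCD"))
    print(df.to_numpy().dtype)   # float64

    # Mixed dtypes: the common dtype is object, so every value gets boxed.
    df2 = pd.DataFrame({"A": 1.,
                        "B": pd.Timestamp("20130102"),
                        "F": "foo"}, index=range(4))
    print(df2.to_numpy().dtype)  # object
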
+ +For ``df``, our :class:`DataFrame` of all floating-point values, +:meth:`DataFrame.to_numpy` is fast and doesn't require copying data. + +.. ipython:: python + + df.to_numpy() + +For ``df2``, the :class:`DataFrame` with multiple dtypes, +:meth:`DataFrame.to_numpy` is relatively expensive. + +.. ipython:: python + + df2.to_numpy() + +.. note:: + + :meth:`DataFrame.to_numpy` does *not* include the index or column + labels in the output. :func:`~DataFrame.describe` shows a quick statistic summary of your data: @@ -190,31 +216,31 @@ Selecting on a multi-axis by label: .. ipython:: python - df.loc[:,['A','B']] + df.loc[:, ['A', 'B']] Showing label slicing, both endpoints are *included*: .. ipython:: python - df.loc['20130102':'20130104',['A','B']] + df.loc['20130102':'20130104', ['A', 'B']] Reduction in the dimensions of the returned object: .. ipython:: python - df.loc['20130102',['A','B']] + df.loc['20130102', ['A', 'B']] For getting a scalar value: .. ipython:: python - df.loc[dates[0],'A'] + df.loc[dates[0], 'A'] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.at[dates[0],'A'] + df.at[dates[0], 'A'] Selection by Position ~~~~~~~~~~~~~~~~~~~~~ @@ -231,37 +257,37 @@ By integer slices, acting similar to numpy/python: .. ipython:: python - df.iloc[3:5,0:2] + df.iloc[3:5, 0:2] By lists of integer position locations, similar to the numpy/python style: .. ipython:: python - df.iloc[[1,2,4],[0,2]] + df.iloc[[1, 2, 4], [0, 2]] For slicing rows explicitly: .. ipython:: python - df.iloc[1:3,:] + df.iloc[1:3, :] For slicing columns explicitly: .. ipython:: python - df.iloc[:,1:3] + df.iloc[:, 1:3] For getting a value explicitly: .. ipython:: python - df.iloc[1,1] + df.iloc[1, 1] For getting fast access to a scalar (equivalent to the prior method): .. ipython:: python - df.iat[1,1] + df.iat[1, 1] Boolean Indexing ~~~~~~~~~~~~~~~~ @@ -303,19 +329,19 @@ Setting values by label: .. ipython:: python - df.at[dates[0],'A'] = 0 + df.at[dates[0], 'A'] = 0 Setting values by position: .. ipython:: python - df.iat[0,1] = 0 + df.iat[0, 1] = 0 Setting by assigning with a NumPy array: .. ipython:: python - df.loc[:,'D'] = np.array([5] * len(df)) + df.loc[:, 'D'] = np.array([5] * len(df)) The result of the prior setting operations. @@ -345,7 +371,7 @@ returns a copy of the data. .. ipython:: python df1 = df.reindex(index=dates[0:4], columns=list(df.columns) + ['E']) - df1.loc[dates[0]:dates[1],'E'] = 1 + df1.loc[dates[0]:dates[1], 'E'] = 1 df1 To drop any rows that have missing data. @@ -653,7 +679,8 @@ pandas can include categorical data in a ``DataFrame``. For full docs, see the .. ipython:: python - df = pd.DataFrame({"id":[1, 2, 3, 4, 5, 6], "raw_grade":['a', 'b', 'b', 'a', 'a', 'e']}) + df = pd.DataFrame({"id": [1, 2, 3, 4, 5, 6], + "raw_grade": ['a', 'b', 'b', 'a', 'a', 'e']}) Convert the raw grades to a categorical data type. @@ -674,7 +701,8 @@ Reorder the categories and simultaneously add the missing categories (methods un .. ipython:: python - df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", "good", "very good"]) + df["grade"] = df["grade"].cat.set_categories(["very bad", "bad", "medium", + "good", "very good"]) df["grade"] Sorting is per order in the categories, not lexical order. @@ -703,7 +731,8 @@ See the :ref:`Plotting ` docs. .. 
ipython:: python - ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + ts = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) ts = ts.cumsum() @savefig series_plot_basic.png @@ -718,8 +747,10 @@ of the columns with labels: columns=['A', 'B', 'C', 'D']) df = df.cumsum() + plt.figure() + df.plot() @savefig frame_plot_basic.png - plt.figure(); df.plot(); plt.legend(loc='best') + plt.legend(loc='best') Getting Data In/Out ------------------- diff --git a/doc/source/advanced.rst b/doc/source/advanced.rst index 17214ab62b2782..24a1ac7be7d1db 100644 --- a/doc/source/advanced.rst +++ b/doc/source/advanced.rst @@ -188,7 +188,7 @@ highly performant. If you want to see only the used levels, you can use the .. ipython:: python - df[['foo', 'qux']].columns.values + df[['foo', 'qux']].columns.to_numpy() # for a specific level df[['foo', 'qux']].columns.get_level_values(0) diff --git a/doc/source/api.rst b/doc/source/api.rst index 81bb420c47a999..82ae58acc4974a 100644 --- a/doc/source/api.rst +++ b/doc/source/api.rst @@ -1671,6 +1671,7 @@ IntervalIndex Components IntervalIndex.length IntervalIndex.values IntervalIndex.is_non_overlapping_monotonic + IntervalIndex.is_overlapping IntervalIndex.get_loc IntervalIndex.get_indexer IntervalIndex.set_closed @@ -2482,6 +2483,7 @@ Style Application Styler.set_properties Styler.set_uuid Styler.clear + Styler.pipe Builtin Styles ~~~~~~~~~~~~~~ diff --git a/doc/source/basics.rst b/doc/source/basics.rst index 69d1e105f62ab4..25e2c8cd1ff9a9 100644 --- a/doc/source/basics.rst +++ b/doc/source/basics.rst @@ -46,8 +46,8 @@ of elements to display is five, but you may pass a custom number. .. _basics.attrs: -Attributes and the raw ndarray(s) ---------------------------------- +Attributes and Underlying Data +------------------------------ pandas objects have a number of attributes enabling you to access the metadata @@ -65,14 +65,43 @@ Note, **these attributes can be safely assigned to**! df.columns = [x.lower() for x in df.columns] df -To get the actual data inside a data structure, one need only access the -**values** property: +Pandas objects (:class:`Index`, :class:`Series`, :class:`DataFrame`) can be +thought of as containers for arrays, which hold the actual data and do the +actual computation. For many types, the underlying array is a +:class:`numpy.ndarray`. However, pandas and 3rd party libraries may *extend* +NumPy's type system to add support for custom arrays +(see :ref:`basics.dtypes`). + +To get the actual data inside a :class:`Index` or :class:`Series`, use +the **array** property + +.. ipython:: python + + s.array + s.index.array + +Depending on the data type (see :ref:`basics.dtypes`), :attr:`~Series.array` +be either a NumPy array or an :ref:`ExtensionArray `. +If you know you need a NumPy array, use :meth:`~Series.to_numpy` +or :meth:`numpy.asarray`. .. ipython:: python - s.values - df.values - wp.values + s.to_numpy() + np.asarray(s) + +For Series and Indexes backed by NumPy arrays (like we have here), this will +be the same as :attr:`~Series.array`. When the Series or Index is backed by +a :class:`~pandas.api.extension.ExtensionArray`, :meth:`~Series.to_numpy` +may involve copying data and coercing values. + +Getting the "raw data" inside a :class:`DataFrame` is possibly a bit more +complex. When your ``DataFrame`` only has a single data type for all the +columns, :atr:`DataFrame.to_numpy` will return the underlying data: + +.. 
ipython:: python + + df.to_numpy() If a DataFrame or Panel contains homogeneously-typed data, the ndarray can actually be modified in-place, and the changes will be reflected in the data @@ -87,6 +116,21 @@ unlike the axis labels, cannot be assigned to. strings are involved, the result will be of object dtype. If there are only floats and integers, the resulting array will be of float dtype. +In the past, pandas recommended :attr:`Series.values` or :attr:`DataFrame.values` +for extracting the data from a Series or DataFrame. You'll still find references +to these in old code bases and online. Going forward, we recommend avoiding +``.values`` and using ``.array`` or ``.to_numpy()``. ``.values`` has the following +drawbacks: + +1. When your Series contains an :ref:`extension type `, it's + unclear whether :attr:`Series.values` returns a NumPy array or the extension array. + :attr:`Series.array` will always return the actual array backing the Series, + while :meth:`Series.to_numpy` will always return a NumPy array. +2. When your DataFrame contains a mixture of data types, :attr:`DataFrame.values` may + involve copying data and coercing values to a common dtype, a relatively expensive + operation. :meth:`DataFrame.to_numpy`, being a method, makes it clearer that the + returned NumPy array may not be a view on the same data in the DataFrame. + .. _basics.accelerate: Accelerated operations @@ -541,7 +585,7 @@ will exclude NAs on Series input by default: .. ipython:: python np.mean(df['one']) - np.mean(df['one'].values) + np.mean(df['one'].to_numpy()) :meth:`Series.nunique` will return the number of unique non-NA values in a Series: @@ -839,7 +883,7 @@ Series operation on each column or row: tsdf = pd.DataFrame(np.random.randn(10, 3), columns=['A', 'B', 'C'], index=pd.date_range('1/1/2000', periods=10)) - tsdf.values[3:7] = np.nan + tsdf.iloc[3:7] = np.nan .. ipython:: python @@ -1875,17 +1919,29 @@ dtypes ------ For the most part, pandas uses NumPy arrays and dtypes for Series or individual -columns of a DataFrame. The main types allowed in pandas objects are ``float``, -``int``, ``bool``, and ``datetime64[ns]`` (note that NumPy does not support -timezone-aware datetimes). - -In addition to NumPy's types, pandas :ref:`extends ` -NumPy's type-system for a few cases. - -* :ref:`Categorical ` -* :ref:`Datetime with Timezone ` -* :ref:`Period ` -* :ref:`Interval ` +columns of a DataFrame. NumPy provides support for ``float``, +``int``, ``bool``, ``timedelta64[ns]`` and ``datetime64[ns]`` (note that NumPy +does not support timezone-aware datetimes). + +Pandas and third-party libraries *extend* NumPy's type system in a few places. +This section describes the extensions pandas has made internally. +See :ref:`extending.extension-types` for how to write your own extension that +works with pandas. See :ref:`ecosystem.extensions` for a list of third-party +libraries that have implemented an extension. + +The following table lists all of pandas extension types. See the respective +documentation sections for more on each type. 
+ +=================== ========================= ================== ============================= ============================= +Kind of Data Data Type Scalar Array Documentation +=================== ========================= ================== ============================= ============================= +tz-aware datetime :class:`DatetimeTZDtype` :class:`Timestamp` :class:`arrays.DatetimeArray` :ref:`timeseries.timezone` +Categorical :class:`CategoricalDtype` (none) :class:`Categorical` :ref:`categorical` +period (time spans) :class:`PeriodDtype` :class:`Period` :class:`arrays.PeriodArray` :ref:`timeseries.periods` +sparse :class:`SparseDtype` (none) :class:`arrays.SparseArray` :ref:`sparse` +intervals :class:`IntervalDtype` :class:`Interval` :class:`arrays.IntervalArray` :ref:`advanced.intervalindex` +nullable integer :class:`Int64Dtype`, ... (none) :class:`arrays.IntegerArray` :ref:`integer_na` +=================== ========================= ================== ============================= ============================= Pandas uses the ``object`` dtype for storing strings. @@ -1983,13 +2039,13 @@ from the current type (e.g. ``int`` to ``float``). df3 df3.dtypes -The ``values`` attribute on a DataFrame return the *lower-common-denominator* of the dtypes, meaning +:meth:`DataFrame.to_numpy` will return the *lower-common-denominator* of the dtypes, meaning the dtype that can accommodate **ALL** of the types in the resulting homogeneous dtyped NumPy array. This can force some *upcasting*. .. ipython:: python - df3.values.dtype + df3.to_numpy().dtype astype ~~~~~~ @@ -2211,11 +2267,11 @@ dtypes: 'float64': np.arange(4.0, 7.0), 'bool1': [True, False, True], 'bool2': [False, True, False], - 'dates': pd.date_range('now', periods=3).values, + 'dates': pd.date_range('now', periods=3), 'category': pd.Series(list("ABC")).astype('category')}) df['tdeltas'] = df.dates.diff() df['uint64'] = np.arange(3, 6).astype('u8') - df['other_dates'] = pd.date_range('20130101', periods=3).values + df['other_dates'] = pd.date_range('20130101', periods=3) df['tz_aware_dates'] = pd.date_range('20130101', periods=3, tz='US/Eastern') df diff --git a/doc/source/categorical.rst b/doc/source/categorical.rst index 2f2430f02f89df..31f2430e4be885 100644 --- a/doc/source/categorical.rst +++ b/doc/source/categorical.rst @@ -178,7 +178,7 @@ are consistent among all columns. To perform table-wise conversion, where all labels in the entire ``DataFrame`` are used as categories for each column, the ``categories`` parameter can be determined programmatically by - ``categories = pd.unique(df.values.ravel())``. + ``categories = pd.unique(df.to_numpy().ravel())``. If you already have ``codes`` and ``categories``, you can use the :func:`~pandas.Categorical.from_codes` constructor to save the factorize step @@ -955,7 +955,7 @@ Use ``.astype`` or ``union_categoricals`` to get ``category`` result. pd.concat([s1, s3]) pd.concat([s1, s3]).astype('category') - union_categoricals([s1.values, s3.values]) + union_categoricals([s1.array, s3.array]) Following table summarizes the results of ``Categoricals`` related concatenations. diff --git a/doc/source/comparison_with_r.rst b/doc/source/comparison_with_r.rst index eecacde8ad14e7..704b0c4d80537a 100644 --- a/doc/source/comparison_with_r.rst +++ b/doc/source/comparison_with_r.rst @@ -6,7 +6,7 @@ import pandas as pd import numpy as np - pd.options.display.max_rows=15 + pd.options.display.max_rows = 15 Comparison with R / R libraries ******************************* @@ -165,16 +165,15 @@ function. .. 
ipython:: python - df = pd.DataFrame({ - 'v1': [1,3,5,7,8,3,5,np.nan,4,5,7,9], - 'v2': [11,33,55,77,88,33,55,np.nan,44,55,77,99], - 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], - 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, - np.nan] - }) + df = pd.DataFrame( + {'v1': [1, 3, 5, 7, 8, 3, 5, np.nan, 4, 5, 7, 9], + 'v2': [11, 33, 55, 77, 88, 33, 55, np.nan, 44, 55, 77, 99], + 'by1': ["red", "blue", 1, 2, np.nan, "big", 1, 2, "red", 1, np.nan, 12], + 'by2': ["wet", "dry", 99, 95, np.nan, "damp", 95, 99, "red", 99, np.nan, + np.nan]}) - g = df.groupby(['by1','by2']) - g[['v1','v2']].mean() + g = df.groupby(['by1', 'by2']) + g[['v1', 'v2']].mean() For more details and examples see :ref:`the groupby documentation `. @@ -195,7 +194,7 @@ The :meth:`~pandas.DataFrame.isin` method is similar to R ``%in%`` operator: .. ipython:: python - s = pd.Series(np.arange(5),dtype=np.float32) + s = pd.Series(np.arange(5), dtype=np.float32) s.isin([2, 4]) The ``match`` function returns a vector of the positions of matches @@ -234,11 +233,11 @@ In ``pandas`` we may use :meth:`~pandas.pivot_table` method to handle this: import random import string - baseball = pd.DataFrame({ - 'team': ["team %d" % (x+1) for x in range(5)]*5, - 'player': random.sample(list(string.ascii_lowercase),25), - 'batting avg': np.random.uniform(.200, .400, 25) - }) + baseball = pd.DataFrame( + {'team': ["team %d" % (x + 1) for x in range(5)] * 5, + 'player': random.sample(list(string.ascii_lowercase), 25), + 'batting avg': np.random.uniform(.200, .400, 25)}) + baseball.pivot_table(values='batting avg', columns='team', aggfunc=np.max) For more details and examples see :ref:`the reshaping documentation @@ -341,15 +340,13 @@ In ``pandas`` the equivalent expression, using the .. ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 120), - 'y': np.random.uniform(7., 334., 120), - 'z': np.random.uniform(1.7, 20.7, 120), - 'month': [5,6,7,8]*30, - 'week': np.random.randint(1,4, 120) - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 120), + 'y': np.random.uniform(7., 334., 120), + 'z': np.random.uniform(1.7, 20.7, 120), + 'month': [5, 6, 7, 8] * 30, + 'week': np.random.randint(1, 4, 120)}) - grouped = df.groupby(['month','week']) + grouped = df.groupby(['month', 'week']) grouped['x'].agg([np.mean, np.std]) @@ -374,8 +371,8 @@ In Python, since ``a`` is a list, you can simply use list comprehension. .. ipython:: python - a = np.array(list(range(1,24))+[np.NAN]).reshape(2,3,4) - pd.DataFrame([tuple(list(x)+[val]) for x, val in np.ndenumerate(a)]) + a = np.array(list(range(1, 24)) + [np.NAN]).reshape(2, 3, 4) + pd.DataFrame([tuple(list(x) + [val]) for x, val in np.ndenumerate(a)]) |meltlist|_ ~~~~~~~~~~~~ @@ -393,7 +390,7 @@ In Python, this list would be a list of tuples, so .. ipython:: python - a = list(enumerate(list(range(1,5))+[np.NAN])) + a = list(enumerate(list(range(1, 5)) + [np.NAN])) pd.DataFrame(a) For more details and examples see :ref:`the Into to Data Structures @@ -419,12 +416,13 @@ In Python, the :meth:`~pandas.melt` method is the R equivalent: .. 
ipython:: python - cheese = pd.DataFrame({'first' : ['John', 'Mary'], - 'last' : ['Doe', 'Bo'], - 'height' : [5.5, 6.0], - 'weight' : [130, 150]}) + cheese = pd.DataFrame({'first': ['John', 'Mary'], + 'last': ['Doe', 'Bo'], + 'height': [5.5, 6.0], + 'weight': [130, 150]}) + pd.melt(cheese, id_vars=['first', 'last']) - cheese.set_index(['first', 'last']).stack() # alternative way + cheese.set_index(['first', 'last']).stack() # alternative way For more details and examples see :ref:`the reshaping documentation `. @@ -452,16 +450,15 @@ In Python the best way is to make use of :meth:`~pandas.pivot_table`: .. ipython:: python - df = pd.DataFrame({ - 'x': np.random.uniform(1., 168., 12), - 'y': np.random.uniform(7., 334., 12), - 'z': np.random.uniform(1.7, 20.7, 12), - 'month': [5,6,7]*4, - 'week': [1,2]*6 - }) + df = pd.DataFrame({'x': np.random.uniform(1., 168., 12), + 'y': np.random.uniform(7., 334., 12), + 'z': np.random.uniform(1.7, 20.7, 12), + 'month': [5, 6, 7] * 4, + 'week': [1, 2] * 6}) + mdf = pd.melt(df, id_vars=['month', 'week']) - pd.pivot_table(mdf, values='value', index=['variable','week'], - columns=['month'], aggfunc=np.mean) + pd.pivot_table(mdf, values='value', index=['variable', 'week'], + columns=['month'], aggfunc=np.mean) Similarly for ``dcast`` which uses a data.frame called ``df`` in R to aggregate information based on ``Animal`` and ``FeedType``: @@ -491,13 +488,14 @@ using :meth:`~pandas.pivot_table`: 'Amount': [10, 7, 4, 2, 5, 6, 2], }) - df.pivot_table(values='Amount', index='Animal', columns='FeedType', aggfunc='sum') + df.pivot_table(values='Amount', index='Animal', columns='FeedType', + aggfunc='sum') The second approach is to use the :meth:`~pandas.DataFrame.groupby` method: .. ipython:: python - df.groupby(['Animal','FeedType'])['Amount'].sum() + df.groupby(['Animal', 'FeedType'])['Amount'].sum() For more details and examples see :ref:`the reshaping documentation ` or :ref:`the groupby documentation`. @@ -516,8 +514,8 @@ In pandas this is accomplished with ``pd.cut`` and ``astype("category")``: .. ipython:: python - pd.cut(pd.Series([1,2,3,4,5,6]), 3) - pd.Series([1,2,3,2,2,3]).astype("category") + pd.cut(pd.Series([1, 2, 3, 4, 5, 6]), 3) + pd.Series([1, 2, 3, 2, 2, 3]).astype("category") For more details and examples see :ref:`categorical introduction ` and the :ref:`API documentation `. There is also a documentation regarding the diff --git a/doc/source/comparison_with_sql.rst b/doc/source/comparison_with_sql.rst index db143cd5864410..021f37eb5c66fc 100644 --- a/doc/source/comparison_with_sql.rst +++ b/doc/source/comparison_with_sql.rst @@ -23,7 +23,8 @@ structure. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -387,7 +388,7 @@ Top N rows with offset .. ipython:: python - tips.nlargest(10+5, columns='tip').tail(10) + tips.nlargest(10 + 5, columns='tip').tail(10) Top N rows per group ~~~~~~~~~~~~~~~~~~~~ @@ -411,8 +412,7 @@ Top N rows per group .groupby(['day']) .cumcount() + 1) .query('rn < 3') - .sort_values(['day','rn']) - ) + .sort_values(['day', 'rn'])) the same using `rank(method='first')` function @@ -421,8 +421,7 @@ the same using `rank(method='first')` function (tips.assign(rnk=tips.groupby(['day'])['total_bill'] .rank(method='first', ascending=False)) .query('rnk < 3') - .sort_values(['day','rnk']) - ) + .sort_values(['day', 'rnk'])) .. 
code-block:: sql @@ -445,11 +444,10 @@ Notice that when using ``rank(method='min')`` function .. ipython:: python (tips[tips['tip'] < 2] - .assign(rnk_min=tips.groupby(['sex'])['tip'] - .rank(method='min')) - .query('rnk_min < 3') - .sort_values(['sex','rnk_min']) - ) + .assign(rnk_min=tips.groupby(['sex'])['tip'] + .rank(method='min')) + .query('rnk_min < 3') + .sort_values(['sex', 'rnk_min'])) UPDATE diff --git a/doc/source/comparison_with_stata.rst b/doc/source/comparison_with_stata.rst index 6c518983d5904a..e039843b220656 100644 --- a/doc/source/comparison_with_stata.rst +++ b/doc/source/comparison_with_stata.rst @@ -102,9 +102,7 @@ and the values are the data. .. ipython:: python - df = pd.DataFrame({ - 'x': [1, 3, 5], - 'y': [2, 4, 6]}) + df = pd.DataFrame({'x': [1, 3, 5], 'y': [2, 4, 6]}) df @@ -128,7 +126,8 @@ the data set if presented with a url. .. ipython:: python - url = 'https://raw.github.com/pandas-dev/pandas/master/pandas/tests/data/tips.csv' + url = ('https://raw.github.com/pandas-dev' + '/pandas/master/pandas/tests/data/tips.csv') tips = pd.read_csv(url) tips.head() @@ -278,17 +277,17 @@ see the :ref:`timeseries documentation` for more details. tips['date1_year'] = tips['date1'].dt.year tips['date2_month'] = tips['date2'].dt.month tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin() - tips['months_between'] = (tips['date2'].dt.to_period('M') - - tips['date1'].dt.to_period('M')) + tips['months_between'] = (tips['date2'].dt.to_period('M') + - tips['date1'].dt.to_period('M')) - tips[['date1','date2','date1_year','date2_month', - 'date1_next','months_between']].head() + tips[['date1', 'date2', 'date1_year', 'date2_month', 'date1_next', + 'months_between']].head() .. ipython:: python :suppress: - tips = tips.drop(['date1','date2','date1_year', - 'date2_month','date1_next','months_between'], axis=1) + tips = tips.drop(['date1', 'date2', 'date1_year', 'date2_month', + 'date1_next', 'months_between'], axis=1) Selection of Columns ~~~~~~~~~~~~~~~~~~~~ @@ -472,7 +471,7 @@ The following tables will be used in the merge examples 'value': np.random.randn(4)}) df1 df2 = pd.DataFrame({'key': ['B', 'D', 'D', 'E'], - 'value': np.random.randn(4)}) + 'value': np.random.randn(4)}) df2 In Stata, to perform a merge, one data set must be in memory @@ -661,7 +660,7 @@ In pandas this would be written as: .. ipython:: python - tips.groupby(['sex','smoker']).first() + tips.groupby(['sex', 'smoker']).first() Other Considerations diff --git a/doc/source/computation.rst b/doc/source/computation.rst index 0d2021de8f88e0..251dce5141ea59 100644 --- a/doc/source/computation.rst +++ b/doc/source/computation.rst @@ -4,14 +4,15 @@ :suppress: import numpy as np + import matplotlib.pyplot as plt + + import pandas as pd + np.random.seed(123456) np.set_printoptions(precision=4, suppress=True) - import pandas as pd - import matplotlib - # matplotlib.style.use('default') - import matplotlib.pyplot as plt + pd.options.display.max_rows = 15 + plt.close('all') - pd.options.display.max_rows=15 .. _computation: @@ -75,7 +76,8 @@ series in the DataFrame, also excluding NA/null values. .. ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.cov() ``DataFrame.cov`` also supports an optional ``min_periods`` keyword that @@ -127,7 +129,8 @@ Wikipedia has articles covering the above correlation coefficients: .. 
ipython:: python - frame = pd.DataFrame(np.random.randn(1000, 5), columns=['a', 'b', 'c', 'd', 'e']) + frame = pd.DataFrame(np.random.randn(1000, 5), + columns=['a', 'b', 'c', 'd', 'e']) frame.iloc[::2] = np.nan # Series with Series @@ -163,9 +166,10 @@ compute the correlation based on histogram intersection: .. ipython:: python # histogram intersection - histogram_intersection = lambda a, b: np.minimum( - np.true_divide(a, a.sum()), np.true_divide(b, b.sum()) - ).sum() + def histogram_intersection(a, b): + return np.minimum(np.true_divide(a, a.sum()), + np.true_divide(b, b.sum())).sum() + frame.corr(method=histogram_intersection) A related method :meth:`~DataFrame.corrwith` is implemented on DataFrame to @@ -192,7 +196,7 @@ assigned the mean of the ranks (by default) for the group: .. ipython:: python s = pd.Series(np.random.np.random.randn(5), index=list('abcde')) - s['d'] = s['b'] # so there's a tie + s['d'] = s['b'] # so there's a tie s.rank() :meth:`~DataFrame.rank` is also a DataFrame method and can rank either the rows @@ -202,7 +206,7 @@ ranking. .. ipython:: python df = pd.DataFrame(np.random.np.random.randn(10, 6)) - df[4] = df[2][:5] # some ties + df[4] = df[2][:5] # some ties df df.rank(1) @@ -243,7 +247,8 @@ objects, :class:`~pandas.core.window.Rolling`, :class:`~pandas.core.window.Expan .. ipython:: python - s = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000)) + s = pd.Series(np.random.randn(1000), + index=pd.date_range('1/1/2000', periods=1000)) s = s.cumsum() s @@ -258,7 +263,7 @@ These object provide tab-completion of the available methods and properties. .. code-block:: ipython - In [14]: r. + In [14]: r. # noqa: E225, E999 r.agg r.apply r.count r.exclusions r.max r.median r.name r.skew r.sum r.aggregate r.corr r.cov r.kurt r.mean r.min r.quantile r.std r.var @@ -336,7 +341,9 @@ compute the mean absolute deviation on a rolling basis: .. ipython:: python - mad = lambda x: np.fabs(x - x.mean()).mean() + def mad(x): + return np.fabs(x - x.mean()).mean() + @savefig rolling_apply_ex.png s.rolling(window=60).apply(mad, raw=True).plot(style='k') @@ -376,7 +383,8 @@ The list of recognized types are the `scipy.signal window functions .. ipython:: python - ser = pd.Series(np.random.randn(10), index=pd.date_range('1/1/2000', periods=10)) + ser = pd.Series(np.random.randn(10), + index=pd.date_range('1/1/2000', periods=10)) ser.rolling(window=5, win_type='triang').mean() @@ -423,7 +431,9 @@ This can be particularly useful for a non-regular time frequency index. .. ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index=pd.date_range('20130101 09:00:00', periods=5, freq='s')) + index=pd.date_range('20130101 09:00:00', + periods=5, + freq='s')) dft This is a regular frequency index. Using an integer window parameter works to roll along the window frequency. @@ -445,12 +455,12 @@ Using a non-regular, but still monotonic index, rolling with an integer window d .. 
ipython:: python dft = pd.DataFrame({'B': [0, 1, 2, np.nan, 4]}, - index = pd.Index([pd.Timestamp('20130101 09:00:00'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:05'), - pd.Timestamp('20130101 09:00:06')], - name='foo')) + index=pd.Index([pd.Timestamp('20130101 09:00:00'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:05'), + pd.Timestamp('20130101 09:00:06')], + name='foo')) dft dft.rolling(2).sum() @@ -496,11 +506,11 @@ from present information back to past information. This allows the rolling windo .. ipython:: python df = pd.DataFrame({'x': 1}, - index = [pd.Timestamp('20130101 09:00:01'), - pd.Timestamp('20130101 09:00:02'), - pd.Timestamp('20130101 09:00:03'), - pd.Timestamp('20130101 09:00:04'), - pd.Timestamp('20130101 09:00:06')]) + index=[pd.Timestamp('20130101 09:00:01'), + pd.Timestamp('20130101 09:00:02'), + pd.Timestamp('20130101 09:00:03'), + pd.Timestamp('20130101 09:00:04'), + pd.Timestamp('20130101 09:00:06')]) df["right"] = df.rolling('2s', closed='right').x.sum() # default df["both"] = df.rolling('2s', closed='both').x.sum() @@ -601,7 +611,8 @@ can even be omitted: .. ipython:: python - covs = df[['B','C','D']].rolling(window=50).cov(df[['A','B','C']], pairwise=True) + covs = (df[['B', 'C', 'D']].rolling(window=50) + .cov(df[['A', 'B', 'C']], pairwise=True)) covs.loc['2002-09-22':] .. ipython:: python @@ -637,7 +648,7 @@ perform multiple computations on the data. These operations are similar to the : dfa = pd.DataFrame(np.random.randn(1000, 3), index=pd.date_range('1/1/2000', periods=1000), columns=['A', 'B', 'C']) - r = dfa.rolling(window=60,min_periods=1) + r = dfa.rolling(window=60, min_periods=1) r We can aggregate by passing a function to the entire DataFrame, or select a @@ -649,7 +660,7 @@ Series (or multiple Series) via standard ``__getitem__``. r['A'].aggregate(np.sum) - r[['A','B']].aggregate(np.sum) + r[['A', 'B']].aggregate(np.sum) As you can see, the result of the aggregation will have the selected columns, or all columns if none are selected. @@ -683,24 +694,21 @@ By passing a dict to ``aggregate`` you can apply a different aggregation to the columns of a ``DataFrame``: .. ipython:: python - :okexcept: - :okwarning: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the windowed object .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore you can pass a nested dict to indicate different aggregations on different columns. .. ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) .. _stats.moments.expanding: diff --git a/doc/source/contributing.rst b/doc/source/contributing.rst index 6fdb5bdbb6b1d7..c55452cf273093 100644 --- a/doc/source/contributing.rst +++ b/doc/source/contributing.rst @@ -670,6 +670,8 @@ Otherwise, you need to do it manually: .. code-block:: python + import warnings + def old_func(): """Summary of the function. @@ -679,6 +681,9 @@ Otherwise, you need to do it manually: warnings.warn('Use new_func instead.', FutureWarning, stacklevel=2) new_func() + def new_func(): + pass + You'll also need to 1. 
write a new test that asserts a warning is issued when calling with the deprecated argument @@ -933,6 +938,8 @@ If your change involves checking that a warning is actually emitted, use .. code-block:: python + df = pd.DataFrame() + with tm.assert_produces_warning(FutureWarning): df.some_operation() @@ -963,7 +970,7 @@ a single test. .. code-block:: python - with warch.catch_warnings(): + with warnings.catch_warnings(): warnings.simplefilter("ignore", FutureWarning) # Or use warnings.filterwarnings(...) diff --git a/doc/source/dsintro.rst b/doc/source/dsintro.rst index ccd530d11b8f93..6195212873e75c 100644 --- a/doc/source/dsintro.rst +++ b/doc/source/dsintro.rst @@ -137,7 +137,43 @@ However, operations such as slicing will also slice the index. s[[4, 3, 1]] np.exp(s) -We will address array-based indexing in a separate :ref:`section `. +.. note:: + + We will address array-based indexing like ``s[[4, 3, 1]]`` + in :ref:`section `. + +Like a NumPy array, a pandas Series has a :attr:`~Series.dtype`. + +.. ipython:: python + + s.dtype + +This is often a NumPy dtype. However, pandas and 3rd-party libraries +extend NumPy's type system in a few places, in which case the dtype would +be a :class:`~pandas.api.extensions.ExtensionDtype`. Some examples within +pandas are :ref:`categorical` and :ref:`integer_na`. See :ref:`basics.dtypes` +for more. + +If you need the actual array backing a ``Series``, use :attr:`Series.array`. + +.. ipython:: python + + s.array + +Again, this is often a NumPy array, but may instead be a +:class:`~pandas.api.extensions.ExtensionArray`. See :ref:`basics.dtypes` for more. +Accessing the array can be useful when you need to do some operation without the +index (to disable :ref:`automatic alignment `, for example). + +While Series is ndarray-like, if you need an *actual* ndarray, then use +:meth:`Series.to_numpy`. + +.. ipython:: python + + s.to_numpy() + +Even if the Series is backed by a :class:`~pandas.api.extensions.ExtensionArray`, +:meth:`Series.to_numpy` will return a NumPy ndarray. Series is dict-like ~~~~~~~~~~~~~~~~~~~ @@ -617,6 +653,8 @@ slicing, see the :ref:`section on indexing `. We will address the fundamentals of reindexing / conforming to new sets of labels in the :ref:`section on reindexing `. +.. _dsintro.alignment: + Data alignment and arithmetic ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/enhancingperf.rst b/doc/source/enhancingperf.rst index 2ca8a2b7ac0f88..1c873d604cfe0b 100644 --- a/doc/source/enhancingperf.rst +++ b/doc/source/enhancingperf.rst @@ -221,7 +221,7 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra You can **not pass** a ``Series`` directly as a ``ndarray`` typed parameter to a Cython function. Instead pass the actual ``ndarray`` using the - ``.values`` attribute of the ``Series``. The reason is that the Cython + :meth:`Series.to_numpy`. The reason is that the Cython definition is specific to an ndarray and not the passed ``Series``. So, do not do this: @@ -230,11 +230,13 @@ the rows, applying our ``integrate_f_typed``, and putting this in the zeros arra apply_integrate_f(df['a'], df['b'], df['N']) - But rather, use ``.values`` to get the underlying ``ndarray``: + But rather, use :meth:`Series.to_numpy` to get the underlying ``ndarray``: .. code-block:: python - apply_integrate_f(df['a'].values, df['b'].values, df['N'].values) + apply_integrate_f(df['a'].to_numpy(), + df['b'].to_numpy(), + df['N'].to_numpy()) .. 
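To make the distinction concrete, here is a quick sanity check (a minimal sketch, not part of the original example; the ``a``/``b``/``N`` frame below is only assumed to mirror the one built earlier in that section). It shows that a ``Series`` is not the plain ``ndarray`` a typed Cython parameter expects, while ``.to_numpy()`` is:

.. code-block:: python

   import numpy as np
   import pandas as pd

   # Hypothetical frame mirroring the a/b/N columns assumed above.
   df = pd.DataFrame({'a': np.random.randn(1000),
                      'b': np.random.randn(1000),
                      'N': np.random.randint(100, 1000, (1000,))})

   # A Series is a pandas container, not an ndarray ...
   isinstance(df['a'], np.ndarray)              # False
   # ... whereas .to_numpy() returns the underlying ndarray that a
   # typed Cython parameter (e.g. np.ndarray[double]) will accept.
   isinstance(df['a'].to_numpy(), np.ndarray)   # True

.. 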
note:: diff --git a/doc/source/extending.rst b/doc/source/extending.rst index 6c47d0ae8bd848..7046981a3a3643 100644 --- a/doc/source/extending.rst +++ b/doc/source/extending.rst @@ -186,7 +186,7 @@ Instead, you should detect these cases and return ``NotImplemented``. When pandas encounters an operation like ``op(Series, ExtensionArray)``, pandas will -1. unbox the array from the ``Series`` (roughly ``Series.values``) +1. unbox the array from the ``Series`` (``Series.array``) 2. call ``result = op(values, ExtensionArray)`` 3. re-box the result in a ``Series`` diff --git a/doc/source/indexing.rst b/doc/source/indexing.rst index 5740ab5fa69217..dc0c6dd027b3c5 100644 --- a/doc/source/indexing.rst +++ b/doc/source/indexing.rst @@ -190,7 +190,7 @@ columns. .. ipython:: python - df.loc[:,['B', 'A']] = df[['A', 'B']].values + df.loc[:,['B', 'A']] = df[['A', 'B']].to_numpy() df[['A', 'B']] diff --git a/doc/source/io.rst b/doc/source/io.rst index 92fc28af0281a9..fbd238586c7764 100644 --- a/doc/source/io.rst +++ b/doc/source/io.rst @@ -5,25 +5,23 @@ .. ipython:: python :suppress: - import os import csv - from pandas.compat import StringIO, BytesIO - import pandas as pd - ExcelWriter = pd.ExcelWriter + import os + import matplotlib.pyplot as plt import numpy as np - np.random.seed(123456) + import pandas as pd + from pandas.compat import StringIO, BytesIO + + randn = np.random.randn np.set_printoptions(precision=4, suppress=True) - - import matplotlib.pyplot as plt plt.close('all') - - import pandas.util.testing as tm pd.options.display.max_rows = 15 clipdf = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': ['p', 'q', 'r']}, index=['x', 'y', 'z']) + =============================== IO Tools (Text, CSV, HDF5, ...) =============================== @@ -146,7 +144,10 @@ usecols : list-like or callable, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), usecols=lambda x: x.upper() in ['COL1', 'COL3']) @@ -192,7 +193,10 @@ skiprows : list-like or integer, default ``None`` .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), skiprows=lambda x: x % 2 != 0) @@ -367,7 +371,10 @@ columns: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) df = pd.read_csv(StringIO(data), dtype=object) @@ -388,7 +395,11 @@ of :func:`~pandas.read_csv`: .. ipython:: python - data = "col_1\n1\n2\n'A'\n4.22" + data = ("col_1\n" + "1\n" + "2\n" + "'A'\n" + "4.22") df = pd.read_csv(StringIO(data), converters={'col_1': str}) df df['col_1'].apply(type).value_counts() @@ -427,7 +438,8 @@ worth trying. .. ipython:: python :okwarning: - df = pd.DataFrame({'col_1': list(range(500000)) + ['a', 'b'] + list(range(500000))}) + col_1 = list(range(500000)) + ['a', 'b'] + list(range(500000)) + df = pd.DataFrame({'col_1': col_1}) df.to_csv('foo.csv') mixed_df = pd.read_csv('foo.csv') mixed_df['col_1'].apply(type).value_counts() @@ -455,7 +467,10 @@ Specifying Categorical dtype .. ipython:: python - data = 'col1,col2,col3\na,b,1\na,b,2\nc,d,3' + data = ('col1,col2,col3\n' + 'a,b,1\n' + 'a,b,2\n' + 'c,d,3') pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data)).dtypes @@ -479,7 +494,6 @@ that column's ``dtype``. .. 
ipython:: python from pandas.api.types import CategoricalDtype - dtype = CategoricalDtype(['d', 'c', 'b', 'a'], ordered=True) pd.read_csv(StringIO(data), dtype={'col1': dtype}).dtypes @@ -525,7 +539,10 @@ used as the column names: .. ipython:: python - data = 'a,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') print(data) pd.read_csv(StringIO(data)) @@ -544,7 +561,11 @@ If the header is in a row other than the first, pass the row number to .. ipython:: python - data = 'skip this skip it\na,b,c\n1,2,3\n4,5,6\n7,8,9' + data = ('skip this skip it\n' + 'a,b,c\n' + '1,2,3\n' + '4,5,6\n' + '7,8,9') pd.read_csv(StringIO(data), header=1) .. note:: @@ -565,7 +586,9 @@ distinguish between them so as to prevent overwriting data: .. ipython :: python - data = 'a,b,a\n0,1,2\n3,4,5' + data = ('a,b,a\n' + '0,1,2\n' + '3,4,5') pd.read_csv(StringIO(data)) There is no more duplicate data because ``mangle_dupe_cols=True`` by default, @@ -633,7 +656,13 @@ be ignored. By default, completely blank lines will be ignored as well. .. ipython:: python - data = '\na,b,c\n \n# commented line\n1,2,3\n\n4,5,6' + data = ('\n' + 'a,b,c\n' + ' \n' + '# commented line\n' + '1,2,3\n' + '\n' + '4,5,6') print(data) pd.read_csv(StringIO(data), comment='#') @@ -641,7 +670,12 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = 'a,b,c\n\n1,2,3\n\n\n4,5,6' + data = ('a,b,c\n' + '\n' + '1,2,3\n' + '\n' + '\n' + '4,5,6') pd.read_csv(StringIO(data), skip_blank_lines=False) .. warning:: @@ -652,20 +686,32 @@ If ``skip_blank_lines=False``, then ``read_csv`` will not ignore blank lines: .. ipython:: python - data = '#comment\na,b,c\nA,B,C\n1,2,3' + data = ('#comment\n' + 'a,b,c\n' + 'A,B,C\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', header=1) - data = 'A,B,C\n#comment\na,b,c\n1,2,3' + data = ('A,B,C\n' + '#comment\n' + 'a,b,c\n' + '1,2,3') pd.read_csv(StringIO(data), comment='#', skiprows=2) If both ``header`` and ``skiprows`` are specified, ``header`` will be relative to the end of ``skiprows``. For example: - .. ipython:: python +.. ipython:: python - data = '# empty\n# second empty line\n# third empty' \ - 'line\nX,Y,Z\n1,2,3\nA,B,C\n1,2.,4.\n5.,NaN,10.0' - print(data) - pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) + data = ('# empty\n' + '# second empty line\n' + '# third emptyline\n' + 'X,Y,Z\n' + '1,2,3\n' + 'A,B,C\n' + '1,2.,4.\n' + '5.,NaN,10.0\n') + print(data) + pd.read_csv(StringIO(data), comment='#', skiprows=4, header=1) .. _io.comments: @@ -677,10 +723,10 @@ Sometimes comments or meta data may be included in a file: .. ipython:: python :suppress: - data = ("ID,level,category\n" - "Patient1,123000,x # really unpleasant\n" - "Patient2,23000,y # wouldn't take his medicine\n" - "Patient3,1234018,z # awesome") + data = ("ID,level,category\n" + "Patient1,123000,x # really unpleasant\n" + "Patient2,23000,y # wouldn't take his medicine\n" + "Patient3,1234018,z # awesome") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -718,7 +764,10 @@ result in byte strings being decoded to unicode in the result: .. ipython:: python - data = b'word,length\nTr\xc3\xa4umen,7\nGr\xc3\xbc\xc3\x9fe,5'.decode('utf8').encode('latin-1') + data = (b'word,length\n' + b'Tr\xc3\xa4umen,7\n' + b'Gr\xc3\xbc\xc3\x9fe,5') + data = data.decode('utf8').encode('latin-1') df = pd.read_csv(BytesIO(data), encoding='latin-1') df df['word'][1] @@ -738,12 +787,16 @@ first column will be used as the ``DataFrame``'s row names: .. 
ipython:: python - data = 'a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data)) .. ipython:: python - data = 'index,a,b,c\n4,apple,bat,5.7\n8,orange,cow,10' + data = ('index,a,b,c\n' + '4,apple,bat,5.7\n' + '8,orange,cow,10') pd.read_csv(StringIO(data), index_col=0) Ordinarily, you can achieve this behavior using the ``index_col`` option. @@ -754,7 +807,9 @@ index column inference and discard the last column, pass ``index_col=False``: .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), index_col=False) @@ -764,7 +819,9 @@ If a subset of data is being parsed using the ``usecols`` option, the .. ipython:: python - data = 'a,b,c\n4,apple,bat,\n8,orange,cow,' + data = ('a,b,c\n' + '4,apple,bat,\n' + '8,orange,cow,') print(data) pd.read_csv(StringIO(data), usecols=['b', 'c']) pd.read_csv(StringIO(data), usecols=['b', 'c'], index_col=0) @@ -812,12 +869,12 @@ column names: .. ipython:: python :suppress: - data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" - "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" - "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" - "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" - "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" - "KORD,19990127, 23:00:00, 22:56:00, -0.5900") + data = ("KORD,19990127, 19:00:00, 18:56:00, 0.8100\n" + "KORD,19990127, 20:00:00, 19:56:00, 0.0100\n" + "KORD,19990127, 21:00:00, 20:56:00, -0.5900\n" + "KORD,19990127, 21:00:00, 21:18:00, -0.9900\n" + "KORD,19990127, 22:00:00, 21:56:00, -0.5900\n" + "KORD,19990127, 23:00:00, 22:56:00, -0.5900") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -895,9 +952,8 @@ take full advantage of the flexibility of the date parsing API: .. ipython:: python - import pandas.io.date_converters as conv df = pd.read_csv('tmp.csv', header=None, parse_dates=date_spec, - date_parser=conv.parse_date_time) + date_parser=pd.io.date_converters.parse_date_time) df Pandas will try to call the ``date_parser`` function in three different ways. If @@ -990,9 +1046,12 @@ DD/MM/YYYY instead. For convenience, a ``dayfirst`` keyword is provided: .. ipython:: python :suppress: - data = "date,value,cat\n1/6/2000,5,a\n2/6/2000,10,b\n3/6/2000,15,c" + data = ("date,value,cat\n" + "1/6/2000,5,a\n" + "2/6/2000,10,b\n" + "3/6/2000,15,c") with open('tmp.csv', 'w') as fh: - fh.write(data) + fh.write(data) .. ipython:: python @@ -1016,9 +1075,12 @@ writing to a file). For example: val = '0.3066101993807095471566981359501369297504425048828125' data = 'a,b,c\n1,2,{0}'.format(val) - abs(pd.read_csv(StringIO(data), engine='c', float_precision=None)['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='high')['c'][0] - float(val)) - abs(pd.read_csv(StringIO(data), engine='c', float_precision='round_trip')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision=None)['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='high')['c'][0] - float(val)) + abs(pd.read_csv(StringIO(data), engine='c', + float_precision='round_trip')['c'][0] - float(val)) .. _io.thousands: @@ -1033,10 +1095,10 @@ correctly: .. 
ipython:: python :suppress: - data = ("ID|level|category\n" - "Patient1|123,000|x\n" - "Patient2|23,000|y\n" - "Patient3|1,234,018|z") + data = ("ID|level|category\n" + "Patient1|123,000|x\n" + "Patient2|23,000|y\n" + "Patient3|1,234,018|z") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1089,7 +1151,7 @@ Let us consider some examples: .. code-block:: python - read_csv(path, na_values=[5]) + pd.read_csv('path_to_file.csv', na_values=[5]) In the example above ``5`` and ``5.0`` will be recognized as ``NaN``, in addition to the defaults. A string will first be interpreted as a numerical @@ -1097,19 +1159,19 @@ addition to the defaults. A string will first be interpreted as a numerical .. code-block:: python - read_csv(path, keep_default_na=False, na_values=[""]) + pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=[""]) Above, only an empty field will be recognized as ``NaN``. .. code-block:: python - read_csv(path, keep_default_na=False, na_values=["NA", "0"]) + pd.read_csv('path_to_file.csv', keep_default_na=False, na_values=["NA", "0"]) Above, both ``NA`` and ``0`` as strings are ``NaN``. .. code-block:: python - read_csv(path, na_values=["Nope"]) + pd.read_csv('path_to_file.csv', na_values=["Nope"]) The default values, in addition to the string ``"Nope"`` are recognized as ``NaN``. @@ -1132,10 +1194,10 @@ as a ``Series``: .. ipython:: python :suppress: - data = ("level\n" - "Patient1,123000\n" - "Patient2,23000\n" - "Patient3,1234018") + data = ("level\n" + "Patient1,123000\n" + "Patient2,23000\n" + "Patient3,1234018") with open('tmp.csv', 'w') as fh: fh.write(data) @@ -1144,7 +1206,7 @@ as a ``Series``: print(open('tmp.csv').read()) - output = pd.read_csv('tmp.csv', squeeze=True) + output = pd.read_csv('tmp.csv', squeeze=True) output type(output) @@ -1166,7 +1228,9 @@ options as follows: .. ipython:: python - data= 'a,b,c\n1,Yes,2\n3,No,4' + data = ('a,b,c\n' + '1,Yes,2\n' + '3,No,4') print(data) pd.read_csv(StringIO(data)) pd.read_csv(StringIO(data), true_values=['Yes'], false_values=['No']) @@ -1181,18 +1245,13 @@ too few fields will have NA values filled in the trailing fields. Lines with too many fields will raise an error by default: .. ipython:: python - :suppress: - - data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' + :okexcept: -.. code-block:: ipython - - In [27]: data = 'a,b,c\n1,2,3\n4,5,6,7\n8,9,10' - - In [28]: pd.read_csv(StringIO(data)) - --------------------------------------------------------------------------- - ParserError Traceback (most recent call last) - ParserError: Error tokenizing data. C error: Expected 3 fields in line 3, saw 4 + data = ('a,b,c\n' + '1,2,3\n' + '4,5,6,7\n' + '8,9,10') + pd.read_csv(StringIO(data)) You can elect to skip bad lines: @@ -1437,7 +1496,7 @@ returned object: .. ipython:: python - df = pd.read_csv("data/mindex_ex.csv", index_col=[0,1]) + df = pd.read_csv("data/mindex_ex.csv", index_col=[0, 1]) df df.loc[1978] @@ -1480,7 +1539,6 @@ with ``df.to_csv(..., index=False)``, then any ``names`` on the columns index wi .. ipython:: python :suppress: - import os os.remove('mi.csv') os.remove('mi2.csv') @@ -1966,9 +2024,8 @@ Preserve string indices: .. ipython:: python - si = pd.DataFrame(np.zeros((4, 4)), - columns=list(range(4)), - index=[str(i) for i in range(4)]) + si = pd.DataFrame(np.zeros((4, 4)), columns=list(range(4)), + index=[str(i) for i in range(4)]) si si.index si.columns @@ -2020,11 +2077,11 @@ data: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. 
ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) The speedup is less noticeable for smaller datasets: @@ -2034,11 +2091,11 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python - timeit pd.read_json(jsonfloats) + %timeit pd.read_json(jsonfloats) .. ipython:: python - timeit pd.read_json(jsonfloats, numpy=True) + %timeit pd.read_json(jsonfloats, numpy=True) .. warning:: @@ -2059,7 +2116,6 @@ The speedup is less noticeable for smaller datasets: .. ipython:: python :suppress: - import os os.remove('test.json') .. _io.json_normalize: @@ -2081,20 +2137,16 @@ into a flat table. .. ipython:: python data = [{'state': 'Florida', - 'shortname': 'FL', - 'info': { - 'governor': 'Rick Scott' - }, - 'counties': [{'name': 'Dade', 'population': 12345}, + 'shortname': 'FL', + 'info': {'governor': 'Rick Scott'}, + 'counties': [{'name': 'Dade', 'population': 12345}, {'name': 'Broward', 'population': 40000}, {'name': 'Palm Beach', 'population': 60000}]}, - {'state': 'Ohio', - 'shortname': 'OH', - 'info': { - 'governor': 'John Kasich' - }, - 'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}]}] + {'state': 'Ohio', + 'shortname': 'OH', + 'info': {'governor': 'John Kasich'}, + 'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}]}] json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']]) @@ -2142,11 +2194,10 @@ a JSON string with two fields, ``schema`` and ``data``. .. ipython:: python - df = pd.DataFrame( - {'A': [1, 2, 3], - 'B': ['a', 'b', 'c'], - 'C': pd.date_range('2016-01-01', freq='d', periods=3), - }, index=pd.Index(range(3), name='idx')) + df = pd.DataFrame({'A': [1, 2, 3], + 'B': ['a', 'b', 'c'], + 'C': pd.date_range('2016-01-01', freq='d', periods=3)}, + index=pd.Index(range(3), name='idx')) df df.to_json(orient='table', date_format="iso") @@ -2322,7 +2373,6 @@ as a string: .. ipython:: python :suppress: - import os file_path = os.path.abspath(os.path.join('source', '_static', 'banklist.html')) .. ipython:: python @@ -2693,7 +2743,7 @@ file, and the ``sheet_name`` indicating which sheet to parse. .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', sheet_name='Sheet1') + pd.read_excel('path_to_file.xls', sheet_name='Sheet1') .. _io.excel.excelfile_class: @@ -2742,14 +2792,14 @@ of sheet names can simply be passed to ``read_excel`` with no loss in performanc # using the ExcelFile class data = {} with pd.ExcelFile('path_to_file.xls') as xls: - data['Sheet1'] = read_excel(xls, 'Sheet1', index_col=None, - na_values=['NA']) - data['Sheet2'] = read_excel(xls, 'Sheet2', index_col=None, - na_values=['NA']) + data['Sheet1'] = pd.read_excel(xls, 'Sheet1', index_col=None, + na_values=['NA']) + data['Sheet2'] = pd.read_excel(xls, 'Sheet2', index_col=None, + na_values=['NA']) # equivalent using the read_excel function - data = read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], - index_col=None, na_values=['NA']) + data = pd.read_excel('path_to_file.xls', ['Sheet1', 'Sheet2'], + index_col=None, na_values=['NA']) .. _io.excel.specifying_sheets: @@ -2771,35 +2821,35 @@ Specifying Sheets .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 'Sheet1', index_col=None, na_values=['NA']) Using the sheet index: .. 
code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) + pd.read_excel('path_to_file.xls', 0, index_col=None, na_values=['NA']) Using all default values: .. code-block:: python # Returns a DataFrame - read_excel('path_to_file.xls') + pd.read_excel('path_to_file.xls') Using None to get all sheets: .. code-block:: python # Returns a dictionary of DataFrames - read_excel('path_to_file.xls', sheet_name=None) + pd.read_excel('path_to_file.xls', sheet_name=None) Using a list to get multiple sheets: .. code-block:: python # Returns the 1st and 4th sheet, as a dictionary of DataFrames. - read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) + pd.read_excel('path_to_file.xls', sheet_name=['Sheet1', 3]) ``read_excel`` can read more than one sheet, by setting ``sheet_name`` to either a list of sheet names, a list of sheet positions, or ``None`` to read all sheets. @@ -2820,8 +2870,8 @@ For example, to read in a ``MultiIndex`` index without names: .. ipython:: python - df = pd.DataFrame({'a':[1, 2, 3, 4], 'b':[5, 6, 7, 8]}, - index=pd.MultiIndex.from_product([['a', 'b'],['c', 'd']])) + df = pd.DataFrame({'a': [1, 2, 3, 4], 'b': [5, 6, 7, 8]}, + index=pd.MultiIndex.from_product([['a', 'b'], ['c', 'd']])) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1]) df @@ -2842,7 +2892,8 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python - df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], names=['c1', 'c2']) + df.columns = pd.MultiIndex.from_product([['a'], ['b', 'd']], + names=['c1', 'c2']) df.to_excel('path_to_file.xlsx') df = pd.read_excel('path_to_file.xlsx', index_col=[0, 1], header=[0, 1]) df @@ -2850,7 +2901,6 @@ should be passed to ``index_col`` and ``header``: .. ipython:: python :suppress: - import os os.remove('path_to_file.xlsx') @@ -2871,20 +2921,20 @@ to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=2) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=2) You can also specify a comma-delimited set of Excel columns and ranges as a string: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') + pd.read_excel('path_to_file.xls', 'Sheet1', usecols='A,C:E') If ``usecols`` is a list of integers, then it is assumed to be the file column indices to be parsed. .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=[0, 2, 3]) Element order is ignored, so ``usecols=[0, 1]`` is the same as ``[1, 0]``. @@ -2896,7 +2946,7 @@ document header row(s). Those strings define which columns will be parsed: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=['foo', 'bar']) Element order is ignored, so ``usecols=['baz', 'joe']`` is the same as ``['joe', 'baz']``. @@ -2907,7 +2957,7 @@ the column names, returning names where the callable function evaluates to ``Tru .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) + pd.read_excel('path_to_file.xls', 'Sheet1', usecols=lambda x: x.isalpha()) Parsing Dates +++++++++++++ @@ -2919,7 +2969,7 @@ use the ``parse_dates`` keyword to parse those strings to datetimes: .. 
code-block:: python - read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) + pd.read_excel('path_to_file.xls', 'Sheet1', parse_dates=['date_strings']) Cell Converters @@ -2930,7 +2980,7 @@ option. For instance, to convert a column to boolean: .. code-block:: python - read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) + pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyBools': bool}) This options handles missing values and treats exceptions in the converters as missing data. Transformations are applied cell by cell rather than to the @@ -2945,7 +2995,7 @@ missing data to recover integer dtype: return int(x) if x else -1 - read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) + pd.read_excel('path_to_file.xls', 'Sheet1', converters={'MyInts': cfun}) dtype Specifications ++++++++++++++++++++ @@ -2959,7 +3009,7 @@ no type inference, use the type ``str`` or ``object``. .. code-block:: python - read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) + pd.read_excel('path_to_file.xls', dtype={'MyInts': 'int64', 'MyText': str}) .. _io.excel_writer: @@ -2997,7 +3047,7 @@ one can pass an :class:`~pandas.io.excel.ExcelWriter`. .. code-block:: python - with ExcelWriter('path_to_file.xlsx') as writer: + with pd.ExcelWriter('path_to_file.xlsx') as writer: df1.to_excel(writer, sheet_name='Sheet1') df2.to_excel(writer, sheet_name='Sheet2') @@ -3029,7 +3079,7 @@ Pandas supports writing Excel files to buffer-like objects such as ``StringIO`` bio = BytesIO() # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter(bio, engine='xlsxwriter') + writer = pd.ExcelWriter(bio, engine='xlsxwriter') df.to_excel(writer, sheet_name='Sheet1') # Save the workbook @@ -3082,7 +3132,7 @@ argument to ``to_excel`` and to ``ExcelWriter``. The built-in engines are: df.to_excel('path_to_file.xlsx', sheet_name='Sheet1', engine='xlsxwriter') # By setting the 'engine' in the ExcelWriter constructor. - writer = ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') + writer = pd.ExcelWriter('path_to_file.xlsx', engine='xlsxwriter') # Or via pandas configuration. from pandas import options # noqa: E402 @@ -3172,7 +3222,6 @@ any pickled pandas object (or any other pickled object) from file: .. ipython:: python :suppress: - import os os.remove('foo.pkl') .. warning:: @@ -3249,7 +3298,6 @@ The default is to 'infer': .. ipython:: python :suppress: - import os os.remove("data.pkl.compress") os.remove("data.pkl.xz") os.remove("data.pkl.gz") @@ -3306,7 +3354,7 @@ pandas objects. .. ipython:: python - pd.to_msgpack('foo2.msg', {'dict': [{ 'df': df }, {'string': 'foo'}, + pd.to_msgpack('foo2.msg', {'dict': [{'df': df}, {'string': 'foo'}, {'scalar': 1.}, {'s': s}]}) pd.read_msgpack('foo2.msg') @@ -3365,7 +3413,6 @@ dict: .. ipython:: python - np.random.seed(1234) index = pd.date_range('1/1/2000', periods=8) s = pd.Series(randn(5), index=['a', 'b', 'c', 'd', 'e']) df = pd.DataFrame(randn(8, 3), index=index, @@ -3421,7 +3468,6 @@ Closing a Store and using a context manager: :suppress: store.close() - import os os.remove('store.h5') @@ -3434,8 +3480,8 @@ similar to how ``read_csv`` and ``to_csv`` work. .. ipython:: python - df_tl = pd.DataFrame(dict(A=list(range(5)), B=list(range(5)))) - df_tl.to_hdf('store_tl.h5','table', append=True) + df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))}) + df_tl.to_hdf('store_tl.h5', 'table', append=True) pd.read_hdf('store_tl.h5', 'table', where=['index>2']) .. 
ipython:: python @@ -3447,10 +3493,6 @@ similar to how ``read_csv`` and ``to_csv`` work. HDFStore will by default not drop rows that are all missing. This behavior can be changed by setting ``dropna=True``. -.. ipython:: python - :suppress: - - import os .. ipython:: python @@ -3459,12 +3501,12 @@ HDFStore will by default not drop rows that are all missing. This behavior can b df_with_missing df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w') + format='table', mode='w') pd.read_hdf('file.h5', 'df_with_missing') df_with_missing.to_hdf('file.h5', 'df_with_missing', - format='table', mode='w', dropna=True) + format='table', mode='w', dropna=True) pd.read_hdf('file.h5', 'df_with_missing') @@ -3478,13 +3520,13 @@ This is also true for the major axis of a ``Panel``: .. ipython:: python matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], - [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], - [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_major_axis_all_missing=pd.Panel(matrix, - items=['Item1', 'Item2', 'Item3'], - major_axis=[1, 2], - minor_axis=['A', 'B', 'C']) + panel_with_major_axis_all_missing = pd.Panel(matrix, + items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], + minor_axis=['A', 'B', 'C']) panel_with_major_axis_all_missing @@ -3585,7 +3627,7 @@ everything in the sub-store and **below**, so be *careful*. store.put('foo/bar/bah', df) store.append('food/orange', df) - store.append('food/apple', df) + store.append('food/apple', df) store # a list of keys are returned @@ -3660,14 +3702,15 @@ defaults to `nan`. df_mixed = pd.DataFrame({'A': randn(8), 'B': randn(8), 'C': np.array(randn(8), dtype='float32'), - 'string':'string', + 'string': 'string', 'int': 1, 'bool': True, 'datetime64': pd.Timestamp('20010102')}, index=list(range(8))) - df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan + df_mixed.loc[df_mixed.index[3:5], + ['A', 'B', 'string', 'datetime64']] = np.nan - store.append('df_mixed', df_mixed, min_itemsize = {'values': 50}) + store.append('df_mixed', df_mixed, min_itemsize={'values': 50}) df_mixed1 = store.select('df_mixed') df_mixed1 df_mixed1.get_dtype_counts() @@ -3820,7 +3863,8 @@ Works with a Panel as well. store.append('wp', wp) store - store.select('wp', "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") + store.select('wp', + "major_axis>pd.Timestamp('20000102') & minor_axis=['A', 'B']") The ``columns`` keyword can be supplied to select a list of columns to be returned, this is equivalent to passing a @@ -3863,7 +3907,10 @@ specified in the format: ``()``, where float may be signed (and fra .. ipython:: python from datetime import timedelta - dftd = pd.DataFrame(dict(A = pd.Timestamp('20130101'), B = [ pd.Timestamp('20130101') + timedelta(days=i, seconds=10) for i in range(10) ])) + dftd = pd.DataFrame({'A': pd.Timestamp('20130101'), + 'B': [pd.Timestamp('20130101') + timedelta(days=i, + seconds=10) + for i in range(10)]}) dftd['C'] = dftd['A'] - dftd['B'] dftd store.append('dftd', dftd, data_columns=True) @@ -3940,14 +3987,14 @@ be ``data_columns``. 
df_dc = df.copy() df_dc['string'] = 'foo' - df_dc.loc[df_dc.index[4: 6], 'string'] = np.nan - df_dc.loc[df_dc.index[7: 9], 'string'] = 'bar' + df_dc.loc[df_dc.index[4:6], 'string'] = np.nan + df_dc.loc[df_dc.index[7:9], 'string'] = 'bar' df_dc['string2'] = 'cool' - df_dc.loc[df_dc.index[1: 3], ['B', 'C']] = 1.0 + df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0 df_dc # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2']) store.select('df_dc', where='B > 0') # getting creative @@ -3976,7 +4023,7 @@ The default is 50,000 rows returned in a chunk. .. ipython:: python for df in store.select('df', chunksize=3): - print(df) + print(df) .. note:: @@ -4003,12 +4050,12 @@ chunks. store.append('dfeq', dfeq, data_columns=['number']) def chunks(l, n): - return [l[i: i+n] for i in range(0, len(l), n)] + return [l[i:i + n] for i in range(0, len(l), n)] evens = [2, 4, 6, 8, 10] coordinates = store.select_as_coordinates('dfeq', 'number=evens') for c in chunks(coordinates, 2): - print(store.select('dfeq', where=c)) + print(store.select('dfeq', where=c)) Advanced Queries ++++++++++++++++ @@ -4105,13 +4152,13 @@ results. .. ipython:: python df_mt = pd.DataFrame(randn(8, 6), index=pd.date_range('1/1/2000', periods=8), - columns=['A', 'B', 'C', 'D', 'E', 'F']) + columns=['A', 'B', 'C', 'D', 'E', 'F']) df_mt['foo'] = 'bar' df_mt.loc[df_mt.index[1], ('A', 'B')] = np.nan # you can also create the tables individually - store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None }, - df_mt, selector='df1_mt') + store.append_to_multiple({'df1_mt': ['A', 'B'], 'df2_mt': None}, + df_mt, selector='df1_mt') store # individual tables were created @@ -4120,7 +4167,7 @@ results. # as a multiple store.select_as_multiple(['df1_mt', 'df2_mt'], where=['A>0', 'B>0'], - selector = 'df1_mt') + selector='df1_mt') Delete from a Table @@ -4159,7 +4206,7 @@ the table using a ``where`` that selects all but the missing data. .. ipython:: python # returns the number of rows deleted - store.remove('wp', 'major_axis > 20000102' ) + store.remove('wp', 'major_axis > 20000102') store.select('wp') .. warning:: @@ -4332,7 +4379,7 @@ stored in a more efficient manner. .. ipython:: python dfcat = pd.DataFrame({'A': pd.Series(list('aabbcdba')).astype('category'), - 'B': np.random.randn(8) }) + 'B': np.random.randn(8)}) dfcat dfcat.dtypes cstore = pd.HDFStore('cats.h5', mode='w') @@ -4346,7 +4393,6 @@ stored in a more efficient manner. :okexcept: cstore.close() - import os os.remove('cats.h5') @@ -4374,7 +4420,7 @@ Passing a ``min_itemsize`` dict will cause all passed columns to be created as * .. ipython:: python - dfs = pd.DataFrame(dict(A='foo', B='bar'), index=list(range(5))) + dfs = pd.DataFrame({'A': 'foo', 'B': 'bar'}, index=list(range(5))) dfs # A and B have a size of 30 @@ -4393,7 +4439,7 @@ You could inadvertently turn an actual ``nan`` value into a missing value. .. ipython:: python - dfss = pd.DataFrame(dict(A=['foo', 'bar', 'nan'])) + dfss = pd.DataFrame({'A': ['foo', 'bar', 'nan']}) dfss store.append('dfss', dfss) @@ -4420,11 +4466,10 @@ It is possible to write an ``HDFStore`` object that can easily be imported into .. 
ipython:: python - np.random.seed(1) df_for_r = pd.DataFrame({"first": np.random.rand(100), "second": np.random.rand(100), "class": np.random.randint(0, 2, (100, ))}, - index=range(100)) + index=range(100)) df_for_r.head() store_export = pd.HDFStore('export.h5') @@ -4435,7 +4480,6 @@ It is possible to write an ``HDFStore`` object that can easily be imported into :suppress: store_export.close() - import os os.remove('export.h5') In R this file can be read into a ``data.frame`` object using the ``rhdf5`` @@ -4523,7 +4567,6 @@ Performance :suppress: store.close() - import os os.remove('store.h5') @@ -4589,7 +4632,6 @@ Read from a feather file. .. ipython:: python :suppress: - import os os.remove('example.feather') @@ -4673,7 +4715,6 @@ Read only certain columns of a parquet file. .. ipython:: python :suppress: - import os os.remove('example_pa.parquet') os.remove('example_fp.parquet') @@ -4722,7 +4763,8 @@ Parquet supports partitioning of data based on the values of one or more columns .. ipython:: python df = pd.DataFrame({'a': [0, 0, 1, 1], 'b': [0, 1, 0, 1]}) - df.to_parquet(fname='test', engine='pyarrow', partition_cols=['a'], compression=None) + df.to_parquet(fname='test', engine='pyarrow', + partition_cols=['a'], compression=None) The `fname` specifies the parent directory to which data will be saved. The `partition_cols` are the column names by which the dataset will be partitioned. @@ -4835,14 +4877,15 @@ the database using :func:`~pandas.DataFrame.to_sql`. import datetime c = ['id', 'Date', 'Col_1', 'Col_2', 'Col_3'] - d = [(26, datetime.datetime(2010,10,18), 'X', 27.5, True), - (42, datetime.datetime(2010,10,19), 'Y', -12.5, False), - (63, datetime.datetime(2010,10,20), 'Z', 5.73, True)] + d = [(26, datetime.datetime(2010, 10, 18), 'X', 27.5, True), + (42, datetime.datetime(2010, 10, 19), 'Y', -12.5, False), + (63, datetime.datetime(2010, 10, 20), 'Z', 5.73, True)] - data = pd.DataFrame(d, columns=c) + data = pd.DataFrame(d, columns=c) .. ipython:: python + data data.to_sql('data', engine) With some databases, writing large DataFrames can result in errors due to @@ -4999,7 +5042,8 @@ Specifying this will return an iterator through chunks of the query result: .. ipython:: python - for chunk in pd.read_sql_query("SELECT * FROM data_chunks", engine, chunksize=5): + for chunk in pd.read_sql_query("SELECT * FROM data_chunks", + engine, chunksize=5): print(chunk) You can also run a plain query without creating a ``DataFrame`` with @@ -5064,14 +5108,14 @@ If you have an SQLAlchemy description of your database you can express where con metadata = sa.MetaData() data_table = sa.Table('data', metadata, - sa.Column('index', sa.Integer), - sa.Column('Date', sa.DateTime), - sa.Column('Col_1', sa.String), - sa.Column('Col_2', sa.Float), - sa.Column('Col_3', sa.Boolean), - ) + sa.Column('index', sa.Integer), + sa.Column('Date', sa.DateTime), + sa.Column('Col_1', sa.String), + sa.Column('Col_2', sa.Float), + sa.Column('Col_3', sa.Boolean), + ) - pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 == True), engine) + pd.read_sql(sa.select([data_table]).where(data_table.c.Col_3 is True), engine) You can combine SQLAlchemy expressions with parameters passed to :func:`read_sql` using :func:`sqlalchemy.bindparam` @@ -5100,7 +5144,7 @@ And then issue the following queries: .. code-block:: python - data.to_sql('data', cnx) + data.to_sql('data', con) pd.read_sql_query("SELECT * FROM data", con) @@ -5239,7 +5283,6 @@ values will have ``object`` data type. .. 
ipython:: python :suppress: - import os os.remove('stata.dta') .. _io.stata-categorical: @@ -5318,6 +5361,9 @@ Obtain an iterator and read an XPORT file 100,000 lines at a time: .. code-block:: python + def do_something(chunk): + pass + rdr = pd.read_sas('sas_xport.xpt', chunk=100000) for chunk in rdr: do_something(chunk) @@ -5370,91 +5416,10 @@ ignored. dtypes: float64(1), int64(1) memory usage: 15.3 MB -When writing, the top-three functions in terms of speed are are -``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``. - -.. code-block:: ipython - - In [14]: %timeit test_sql_write(df) - 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [15]: %timeit test_hdf_fixed_write(df) - 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [26]: %timeit test_hdf_fixed_write_compress(df) - 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [16]: %timeit test_hdf_table_write(df) - 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [27]: %timeit test_hdf_table_write_compress(df) - 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [17]: %timeit test_csv_write(df) - 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [30]: %timeit test_feather_write(df) - 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [31]: %timeit test_pickle_write(df) - 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [32]: %timeit test_pickle_write_compress(df) - 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - -When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and -``test_hdf_fixed_read``. - -.. code-block:: ipython - - In [18]: %timeit test_sql_read() - 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [19]: %timeit test_hdf_fixed_read() - 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - In [28]: %timeit test_hdf_fixed_read_compress() - 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [20]: %timeit test_hdf_table_read() - 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [29]: %timeit test_hdf_table_read_compress() - 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) - - In [22]: %timeit test_csv_read() - 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - - In [33]: %timeit test_feather_read() - 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - In [34]: %timeit test_pickle_read() - 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) - - In [35]: %timeit test_pickle_read_compress() - 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) - -Space on disk (in bytes) - -.. code-block:: none - - 34816000 Aug 21 18:00 test.sql - 24009240 Aug 21 18:00 test_fixed.hdf - 7919610 Aug 21 18:00 test_fixed_compress.hdf - 24458892 Aug 21 18:00 test_table.hdf - 8657116 Aug 21 18:00 test_table_compress.hdf - 28520770 Aug 21 18:00 test.csv - 16000248 Aug 21 18:00 test.feather - 16000848 Aug 21 18:00 test.pkl - 7554108 Aug 21 18:00 test.pkl.compress - -And here's the code: +Given the next test set: .. 
code-block:: python - import os - import pandas as pd - import sqlite3 from numpy.random import randn sz = 1000000 @@ -5538,3 +5503,81 @@ And here's the code: def test_pickle_read_compress(): pd.read_pickle('test.pkl.compress', compression='xz') + +When writing, the top-three functions in terms of speed are are +``test_pickle_write``, ``test_feather_write`` and ``test_hdf_fixed_write_compress``. + +.. code-block:: ipython + + In [14]: %timeit test_sql_write(df) + 2.37 s ± 36.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [15]: %timeit test_hdf_fixed_write(df) + 194 ms ± 65.9 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [26]: %timeit test_hdf_fixed_write_compress(df) + 119 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [16]: %timeit test_hdf_table_write(df) + 623 ms ± 125 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [27]: %timeit test_hdf_table_write_compress(df) + 563 ms ± 23.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [17]: %timeit test_csv_write(df) + 3.13 s ± 49.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [30]: %timeit test_feather_write(df) + 103 ms ± 5.88 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [31]: %timeit test_pickle_write(df) + 109 ms ± 3.72 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [32]: %timeit test_pickle_write_compress(df) + 3.33 s ± 55.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + +When reading, the top three are ``test_feather_read``, ``test_pickle_read`` and +``test_hdf_fixed_read``. + +.. code-block:: ipython + + In [18]: %timeit test_sql_read() + 1.35 s ± 14.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [19]: %timeit test_hdf_fixed_read() + 14.3 ms ± 438 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [28]: %timeit test_hdf_fixed_read_compress() + 23.5 ms ± 672 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [20]: %timeit test_hdf_table_read() + 35.4 ms ± 314 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [29]: %timeit test_hdf_table_read_compress() + 42.6 ms ± 2.1 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) + + In [22]: %timeit test_csv_read() + 516 ms ± 27.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + + In [33]: %timeit test_feather_read() + 4.06 ms ± 115 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [34]: %timeit test_pickle_read() + 6.5 ms ± 172 µs per loop (mean ± std. dev. of 7 runs, 100 loops each) + + In [35]: %timeit test_pickle_read_compress() + 588 ms ± 3.57 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) + +Space on disk (in bytes) + +.. code-block:: none + + 34816000 Aug 21 18:00 test.sql + 24009240 Aug 21 18:00 test_fixed.hdf + 7919610 Aug 21 18:00 test_fixed_compress.hdf + 24458892 Aug 21 18:00 test_table.hdf + 8657116 Aug 21 18:00 test_table_compress.hdf + 28520770 Aug 21 18:00 test.csv + 16000248 Aug 21 18:00 test.feather + 16000848 Aug 21 18:00 test.pkl + 7554108 Aug 21 18:00 test.pkl.compress diff --git a/doc/source/missing_data.rst b/doc/source/missing_data.rst index 48646376916071..7b6d338ee5b6a6 100644 --- a/doc/source/missing_data.rst +++ b/doc/source/missing_data.rst @@ -678,7 +678,7 @@ Replacing more than one value is possible by passing a list. .. 
ipython:: python - df00 = df.values[0, 0] + df00 = df.iloc[0, 0] df.replace([1.5, df00], [np.nan, 'a']) df[1].dtype diff --git a/doc/source/r_interface.rst b/doc/source/r_interface.rst index d0b26016680692..f40f9199aaf668 100644 --- a/doc/source/r_interface.rst +++ b/doc/source/r_interface.rst @@ -33,10 +33,11 @@ See also the documentation of the `rpy2 `__ project: In the remainder of this page, a few examples of explicit conversion is given. The pandas conversion of rpy2 needs first to be activated: -.. code-block:: python +.. ipython:: + :verbatim: - >>> from rpy2.robjects import pandas2ri # doctest: +SKIP - >>> pandas2ri.activate() # doctest: +SKIP + In [1]: from rpy2.robjects import pandas2ri + ...: pandas2ri.activate() Transferring R data sets into Python ------------------------------------ @@ -44,11 +45,15 @@ Transferring R data sets into Python Once the pandas conversion is activated (``pandas2ri.activate()``), many conversions of R to pandas objects will be done automatically. For example, to obtain the 'iris' dataset as a pandas DataFrame: -.. code-block:: python +.. ipython:: + :verbatim: - >>> from rpy2.robjects import r # doctest: +SKIP - >>> r.data('iris') # doctest: +SKIP - >>> r['iris'].head() # doctest: +SKIP + In [2]: from rpy2.robjects import r + + In [3]: r.data('iris') + + In [4]: r['iris'].head() + Out[4]: Sepal.Length Sepal.Width Petal.Length Petal.Width Species 0 5.1 3.5 1.4 0.2 setosa 1 4.9 3.0 1.4 0.2 setosa @@ -66,14 +71,19 @@ Converting DataFrames into R objects The ``pandas2ri.py2ri`` function support the reverse operation to convert DataFrames into the equivalent R object (that is, **data.frame**): -.. code-block:: python +.. ipython:: + :verbatim: + + In [5]: df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, + ...: index=["one", "two", "three"]) + + In [6]: r_dataframe = pandas2ri.py2ri(df) + + In [7]: print(type(r_dataframe)) + Out[7]: - >>> df = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6], 'C': [7, 8, 9]}, - ... 
index=["one", "two", "three"]) # doctest: +SKIP - >>> r_dataframe = pandas2ri.py2ri(df) # doctest: +SKIP - >>> print(type(r_dataframe)) # doctest: +SKIP - - >>> print(r_dataframe) # doctest: +SKIP + In [8]: print(r_dataframe) + Out[8]: A B C one 1 4 7 two 2 5 8 diff --git a/doc/source/reshaping.rst b/doc/source/reshaping.rst index 8650b5ed1ba375..19857db1743e80 100644 --- a/doc/source/reshaping.rst +++ b/doc/source/reshaping.rst @@ -27,12 +27,12 @@ Reshaping by pivoting DataFrame objects tm.N = 3 def unpivot(frame): - N, K = frame.shape - data = {'value': frame.values.ravel('F'), - 'variable': np.asarray(frame.columns).repeat(N), - 'date': np.tile(np.asarray(frame.index), K)} - columns = ['date', 'variable', 'value'] - return pd.DataFrame(data, columns=columns) + N, K = frame.shape + data = {'value': frame.to_numpy().ravel('F'), + 'variable': np.asarray(frame.columns).repeat(N), + 'date': np.tile(np.asarray(frame.index), K)} + columns = ['date', 'variable', 'value'] + return pd.DataFrame(data, columns=columns) df = unpivot(tm.makeTimeDataFrame()) @@ -54,7 +54,7 @@ For the curious here is how the above ``DataFrame`` was created: def unpivot(frame): N, K = frame.shape - data = {'value': frame.values.ravel('F'), + data = {'value': frame.to_numpy().ravel('F'), 'variable': np.asarray(frame.columns).repeat(N), 'date': np.tile(np.asarray(frame.index), K)} return pd.DataFrame(data, columns=['date', 'variable', 'value']) diff --git a/doc/source/text.rst b/doc/source/text.rst index d69888e406f0a6..d677cc38c98885 100644 --- a/doc/source/text.rst +++ b/doc/source/text.rst @@ -317,8 +317,8 @@ All one-dimensional list-likes can be combined in a list-like container (includi s u - s.str.cat([u.values, - u.index.astype(str).values], na_rep='-') + s.str.cat([u.array, + u.index.astype(str).array], na_rep='-') All elements must match in length to the calling ``Series`` (or ``Index``), except those having an index if ``join`` is not None: diff --git a/doc/source/timedeltas.rst b/doc/source/timedeltas.rst index e602e45784f4a5..8dab39aafbf676 100644 --- a/doc/source/timedeltas.rst +++ b/doc/source/timedeltas.rst @@ -4,18 +4,12 @@ .. ipython:: python :suppress: - import datetime import numpy as np import pandas as pd + np.random.seed(123456) - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 - import dateutil - import pytz - from dateutil.relativedelta import relativedelta - from pandas.tseries.offsets import * + pd.options.display.max_rows = 15 .. _timedeltas.timedeltas: @@ -37,6 +31,8 @@ You can construct a ``Timedelta`` scalar through various arguments: .. ipython:: python + import datetime + # strings pd.Timedelta('1 days') pd.Timedelta('1 days 00:00:00') @@ -74,13 +70,14 @@ You can construct a ``Timedelta`` scalar through various arguments: .. ipython:: python - pd.Timedelta(Second(2)) + pd.Timedelta(pd.offsets.Second(2)) Further, operations among the scalars yield another scalar ``Timedelta``. .. ipython:: python - pd.Timedelta(Day(2)) + pd.Timedelta(Second(2)) + pd.Timedelta('00:00:00.000123') + pd.Timedelta(pd.offsets.Day(2)) + pd.Timedelta(pd.offsets.Second(2)) +\ + pd.Timedelta('00:00:00.000123') to_timedelta ~~~~~~~~~~~~ @@ -135,8 +132,8 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. .. 
ipython:: python s = pd.Series(pd.date_range('2012-1-1', periods=3, freq='D')) - td = pd.Series([ pd.Timedelta(days=i) for i in range(3) ]) - df = pd.DataFrame(dict(A = s, B = td)) + td = pd.Series([pd.Timedelta(days=i) for i in range(3)]) + df = pd.DataFrame({'A': s, 'B': td}) df df['C'] = df['A'] + df['B'] df @@ -145,8 +142,8 @@ subtraction operations on ``datetime64[ns]`` Series, or ``Timestamps``. s - s.max() s - datetime.datetime(2011, 1, 1, 3, 5) s + datetime.timedelta(minutes=5) - s + Minute(5) - s + Minute(5) + Milli(5) + s + pd.offsets.Minute(5) + s + pd.offsets.Minute(5) + pd.offsets.Milli(5) Operations with scalars from a ``timedelta64[ns]`` series: @@ -184,7 +181,7 @@ Operands can also appear in a reversed order (a singular object operated with a A = s - pd.Timestamp('20120101') - pd.Timedelta('00:05:05') B = s - pd.Series(pd.date_range('2012-1-2', periods=3, freq='D')) - df = pd.DataFrame(dict(A=A, B=B)) + df = pd.DataFrame({'A': A, 'B': B}) df df.min() @@ -232,7 +229,8 @@ Numeric reduction operation for ``timedelta64[ns]`` will return ``Timedelta`` ob .. ipython:: python - y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', '-1 days +00:00:05', '1 days'])) + y2 = pd.Series(pd.to_timedelta(['-1 days +00:00:05', 'nat', + '-1 days +00:00:05', '1 days'])) y2 y2.mean() y2.median() @@ -250,8 +248,10 @@ Note that division by the NumPy scalar is true division, while astyping is equiv .. ipython:: python - td = pd.Series(pd.date_range('20130101', periods=4)) - \ - pd.Series(pd.date_range('20121201', periods=4)) + december = pd.Series(pd.date_range('20121201', periods=4)) + january = pd.Series(pd.date_range('20130101', periods=4)) + td = january - december + td[2] += datetime.timedelta(minutes=5, seconds=3) td[3] = np.nan td @@ -360,8 +360,8 @@ or ``np.timedelta64`` objects. Passing ``np.nan/pd.NaT/nat`` will represent miss .. ipython:: python - pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', - np.timedelta64(2,'D'), datetime.timedelta(days=2,seconds=2)]) + pd.TimedeltaIndex(['1 days', '1 days, 00:00:05', np.timedelta64(2, 'D'), + datetime.timedelta(days=2, seconds=2)]) The string 'infer' can be passed in order to set the frequency of the index as the inferred frequency upon creation: @@ -458,7 +458,7 @@ Similarly to frequency conversion on a ``Series`` above, you can convert these i .. ipython:: python - tdi / np.timedelta64(1,'s') + tdi / np.timedelta64(1, 's') tdi.astype('timedelta64[s]') Scalars type ops work as well. These can potentially return a *different* type of index. diff --git a/doc/source/timeseries.rst b/doc/source/timeseries.rst index cc377f45c4b8d0..bca7b6a601dd25 100644 --- a/doc/source/timeseries.rst +++ b/doc/source/timeseries.rst @@ -4,18 +4,12 @@ .. ipython:: python :suppress: - from datetime import datetime, timedelta, time import numpy as np import pandas as pd - from pandas import offsets + np.random.seed(123456) - randn = np.random.randn - randint = np.random.randint np.set_printoptions(precision=4, suppress=True) - pd.options.display.max_rows=15 - import dateutil - import pytz - from dateutil.relativedelta import relativedelta + pd.options.display.max_rows = 15 ******************************** Time Series / Date functionality @@ -32,7 +26,10 @@ Parsing time series information from various sources and formats .. 
ipython:: python - dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), datetime(2018, 1, 1)]) + import datetime + + dti = pd.to_datetime(['1/1/2018', np.datetime64('2018-01-01'), + datetime.datetime(2018, 1, 1)]) dti Generate sequences of fixed-frequency dates and time spans @@ -69,7 +66,7 @@ Performing date and time arithmetic with absolute or relative time increments saturday = friday + pd.Timedelta('1 day') saturday.day_name() # Add 1 business day (Friday --> Monday) - monday = friday + pd.tseries.offsets.BDay() + monday = friday + pd.offsets.BDay() monday.day_name() pandas provides a relatively compact and self-contained set of tools for @@ -110,12 +107,14 @@ However, :class:`Series` and :class:`DataFrame` can directly also support the ti pd.Series(pd.date_range('2000', freq='D', periods=3)) -:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime`` and ``timedelta`` -data when the time data is used as data itself. The ``Period`` and ``DateOffset`` data will be stored as ``object`` data. +:class:`Series` and :class:`DataFrame` have extended data type support and functionality for ``datetime``, ``timedelta`` +and ``Period`` data when passed into those constructors. ``DateOffset`` +data however will be stored as ``object`` data. .. ipython:: python pd.Series(pd.period_range('1/1/2011', freq='M', periods=3)) + pd.Series([pd.DateOffset(1), pd.DateOffset(2)]) pd.Series(pd.date_range('1/1/2011', freq='M', periods=3)) Lastly, pandas represents null date times, time deltas, and time spans as ``NaT`` which @@ -141,7 +140,7 @@ time. .. ipython:: python - pd.Timestamp(datetime(2012, 5, 1)) + pd.Timestamp(datetime.datetime(2012, 5, 1)) pd.Timestamp('2012-05-01') pd.Timestamp(2012, 5, 1) @@ -163,7 +162,9 @@ and :class:`PeriodIndex` respectively. .. ipython:: python - dates = [pd.Timestamp('2012-05-01'), pd.Timestamp('2012-05-02'), pd.Timestamp('2012-05-03')] + dates = [pd.Timestamp('2012-05-01'), + pd.Timestamp('2012-05-02'), + pd.Timestamp('2012-05-03')] ts = pd.Series(np.random.randn(3), dates) type(ts.index) @@ -327,7 +328,7 @@ which can be specified. These are computed from the starting point specified by 1349979305, 1350065705], unit='s') pd.to_datetime([1349720105100, 1349720105200, 1349720105300, - 1349720105400, 1349720105500 ], unit='ms') + 1349720105400, 1349720105500], unit='ms') .. note:: @@ -400,7 +401,9 @@ To generate an index with timestamps, you can use either the ``DatetimeIndex`` o .. ipython:: python - dates = [datetime(2012, 5, 1), datetime(2012, 5, 2), datetime(2012, 5, 3)] + dates = [datetime.datetime(2012, 5, 1), + datetime.datetime(2012, 5, 2), + datetime.datetime(2012, 5, 3)] # Note the frequency information index = pd.DatetimeIndex(dates) @@ -418,8 +421,8 @@ to create a ``DatetimeIndex``. The default frequency for ``date_range`` is a .. ipython:: python - start = datetime(2011, 1, 1) - end = datetime(2012, 1, 1) + start = datetime.datetime(2011, 1, 1) + end = datetime.datetime(2012, 1, 1) index = pd.date_range(start, end) index @@ -486,7 +489,7 @@ used if a custom frequency string is passed. 
weekmask = 'Mon Wed Fri' - holidays = [datetime(2011, 1, 5), datetime(2011, 3, 14)] + holidays = [datetime.datetime(2011, 1, 5), datetime.datetime(2011, 3, 14)] pd.bdate_range(start, end, freq='C', weekmask=weekmask, holidays=holidays) @@ -564,7 +567,7 @@ Dates and strings that parse to timestamps can be passed as indexing parameters: ts['1/31/2011'] - ts[datetime(2011, 12, 25):] + ts[datetime.datetime(2011, 12, 25):] ts['10/31/2011':'12/31/2011'] @@ -583,9 +586,8 @@ would include matching times on an included date: .. ipython:: python - dft = pd.DataFrame(randn(100000,1), - columns=['A'], - index=pd.date_range('20130101',periods=100000,freq='T')) + dft = pd.DataFrame(np.random.randn(100000, 1), columns=['A'], + index=pd.date_range('20130101', periods=100000, freq='T')) dft dft['2013'] @@ -622,10 +624,9 @@ We are stopping on the included end-point as it is part of the index: dft2 = pd.DataFrame(np.random.randn(20, 1), columns=['A'], - index=pd.MultiIndex.from_product([pd.date_range('20130101', - periods=10, - freq='12H'), - ['a', 'b']])) + index=pd.MultiIndex.from_product( + [pd.date_range('20130101', periods=10, freq='12H'), + ['a', 'b']])) dft2 dft2.loc['2013-01-05'] idx = pd.IndexSlice @@ -681,7 +682,7 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python dft_minute = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]}, - index=series_minute.index) + index=series_minute.index) dft_minute['2011-12-31 23'] @@ -693,18 +694,16 @@ If the timestamp string is treated as a slice, it can be used to index ``DataFra .. ipython:: python - dft_minute.loc['2011-12-31 23:59'] + dft_minute.loc['2011-12-31 23:59'] Note also that ``DatetimeIndex`` resolution cannot be less precise than day. .. ipython:: python series_monthly = pd.Series([1, 2, 3], - pd.DatetimeIndex(['2011-12', - '2012-01', - '2012-02'])) + pd.DatetimeIndex(['2011-12', '2012-01', '2012-02'])) series_monthly.index.resolution - series_monthly['2011-12'] # returns Series + series_monthly['2011-12'] # returns Series Exact Indexing @@ -716,13 +715,14 @@ These ``Timestamp`` and ``datetime`` objects have exact ``hours, minutes,`` and .. ipython:: python - dft[datetime(2013, 1, 1):datetime(2013,2,28)] + dft[datetime.datetime(2013, 1, 1):datetime.datetime(2013, 2, 28)] With no defaults. .. ipython:: python - dft[datetime(2013, 1, 1, 10, 12, 0):datetime(2013, 2, 28, 10, 12, 0)] + dft[datetime.datetime(2013, 1, 1, 10, 12, 0): + datetime.datetime(2013, 2, 28, 10, 12, 0)] Truncating & Fancy Indexing @@ -823,120 +823,119 @@ on :ref:`.dt accessors`. DateOffset Objects ------------------ -In the preceding examples, we created ``DatetimeIndex`` objects at various -frequencies by passing in :ref:`frequency strings ` -like 'M', 'W', and 'BM' to the ``freq`` keyword. Under the hood, these frequency -strings are being translated into an instance of :class:`DateOffset`, -which represents a regular frequency increment. Specific offset logic like -"month", "business day", or "one hour" is represented in its various subclasses. - -.. 
csv-table:: - :header: "Class name", "Description" - :widths: 15, 65 - - DateOffset, "Generic offset class, defaults to 1 calendar day" - BDay, "business day (weekday)" - CDay, "custom business day" - Week, "one week, optionally anchored on a day of the week" - WeekOfMonth, "the x-th day of the y-th week of each month" - LastWeekOfMonth, "the x-th day of the last week of each month" - MonthEnd, "calendar month end" - MonthBegin, "calendar month begin" - BMonthEnd, "business month end" - BMonthBegin, "business month begin" - CBMonthEnd, "custom business month end" - CBMonthBegin, "custom business month begin" - SemiMonthEnd, "15th (or other day_of_month) and calendar month end" - SemiMonthBegin, "15th (or other day_of_month) and calendar month begin" - QuarterEnd, "calendar quarter end" - QuarterBegin, "calendar quarter begin" - BQuarterEnd, "business quarter end" - BQuarterBegin, "business quarter begin" - FY5253Quarter, "retail (aka 52-53 week) quarter" - YearEnd, "calendar year end" - YearBegin, "calendar year begin" - BYearEnd, "business year end" - BYearBegin, "business year begin" - FY5253, "retail (aka 52-53 week) year" - BusinessHour, "business hour" - CustomBusinessHour, "custom business hour" - Hour, "one hour" - Minute, "one minute" - Second, "one second" - Milli, "one millisecond" - Micro, "one microsecond" - Nano, "one nanosecond" - -The basic ``DateOffset`` takes the same arguments as -``dateutil.relativedelta``, which works as follows: - -.. ipython:: python - - d = datetime(2008, 8, 18, 9, 0) - d + relativedelta(months=4, days=5) - -We could have done the same thing with ``DateOffset``: - -.. ipython:: python - - from pandas.tseries.offsets import * - d + DateOffset(months=4, days=5) +In the preceding examples, frequency strings (e.g. ``'D'``) were used to specify +a frequency that defined: -The key features of a ``DateOffset`` object are: +* how the date times in :class:`DatetimeIndex` were spaced when using :meth:`date_range` +* the frequency of a :class:`Period` or :class:`PeriodIndex` -* It can be added / subtracted to/from a datetime object to obtain a - shifted date. -* It can be multiplied by an integer (positive or negative) so that the - increment will be applied multiple times. -* It has :meth:`~pandas.DateOffset.rollforward` and - :meth:`~pandas.DateOffset.rollback` methods for moving a date forward or - backward to the next or previous "offset date". +These frequency strings map to a :class:`DateOffset` object and its subclasses. A :class:`DateOffset` +is similar to a :class:`Timedelta` that represents a duration of time but follows specific calendar duration rules. +For example, a :class:`Timedelta` day will always increment ``datetimes`` by 24 hours, while a :class:`DateOffset` day +will increment ``datetimes`` to the same time the next day whether a day represents 23, 24 or 25 hours due to daylight +savings time. However, all :class:`DateOffset` subclasses that are an hour or smaller +(``Hour``, ``Minute``, ``Second``, ``Milli``, ``Micro``, ``Nano``) behave like +:class:`Timedelta` and respect absolute time. -Subclasses of ``DateOffset`` define the ``apply`` function which dictates -custom date increment logic, such as adding business days: - -.. code-block:: python - - class BDay(DateOffset): - """DateOffset increments between business days""" - def apply(self, other): - ... +The basic :class:`DateOffset` acts similar to ``dateutil.relativedelta`` (`relativedelta documentation`_) +that shifts a date time by the corresponding calendar duration specified. 
The +arithmetic operator (``+``) or the ``apply`` method can be used to perform the shift. .. ipython:: python - d - 5 * BDay() - d + BMonthEnd() - -The ``rollforward`` and ``rollback`` methods do exactly what you would expect: - -.. ipython:: python - - d - offset = BMonthEnd() - offset.rollforward(d) - offset.rollback(d) - -It's definitely worth exploring the ``pandas.tseries.offsets`` module and the -various docstrings for the classes. + # This particular day contains a day light savings time transition + ts = pd.Timestamp('2016-10-30 00:00:00', tz='Europe/Helsinki') + # Respects absolute time + ts + pd.Timedelta(days=1) + # Respects calendar time + ts + pd.DateOffset(days=1) + friday = pd.Timestamp('2018-01-05') + friday.day_name() + # Add 2 business days (Friday --> Tuesday) + two_business_days = 2 * pd.offsets.BDay() + two_business_days.apply(friday) + friday + two_business_days + (friday + two_business_days).day_name() + +Most ``DateOffsets`` have associated frequencies strings, or offset aliases, that can be passed +into ``freq`` keyword arguments. The available date offsets and associated frequency strings can be found below: -These operations (``apply``, ``rollforward`` and ``rollback``) preserve time -(hour, minute, etc) information by default. To reset time, use ``normalize`` -before or after applying the operation (depending on whether you want the -time information included in the operation. +.. csv-table:: + :header: "Date Offset", "Frequency String", "Description" + :widths: 15, 15, 65 + + ``DateOffset``, None, "Generic offset class, defaults to 1 calendar day" + ``BDay`` or ``BusinessDay``, ``'B'``,"business day (weekday)" + ``CDay`` or ``CustomBusinessDay``, ``'C'``, "custom business day" + ``Week``, ``'W'``, "one week, optionally anchored on a day of the week" + ``WeekOfMonth``, ``'WOM'``, "the x-th day of the y-th week of each month" + ``LastWeekOfMonth``, ``'LWOM'``, "the x-th day of the last week of each month" + ``MonthEnd``, ``'M'``, "calendar month end" + ``MonthBegin``, ``'MS'``, "calendar month begin" + ``BMonthEnd`` or ``BusinessMonthEnd``, ``'BM'``, "business month end" + ``BMonthBegin`` or ``BusinessMonthBegin``, ``'BMS'``, "business month begin" + ``CBMonthEnd`` or ``CustomBusinessMonthEnd``, ``'CBM'``, "custom business month end" + ``CBMonthBegin`` or ``CustomBusinessMonthBegin``, ``'CBMS'``, "custom business month begin" + ``SemiMonthEnd``, ``'SM'``, "15th (or other day_of_month) and calendar month end" + ``SemiMonthBegin``, ``'SMS'``, "15th (or other day_of_month) and calendar month begin" + ``QuarterEnd``, ``'Q'``, "calendar quarter end" + ``QuarterBegin``, ``'QS'``, "calendar quarter begin" + ``BQuarterEnd``, ``'BQ``, "business quarter end" + ``BQuarterBegin``, ``'BQS'``, "business quarter begin" + ``FY5253Quarter``, ``'REQ'``, "retail (aka 52-53 week) quarter" + ``YearEnd``, ``'A'``, "calendar year end" + ``YearBegin``, ``'AS'`` or ``'BYS'``,"calendar year begin" + ``BYearEnd``, ``'BA'``, "business year end" + ``BYearBegin``, ``'BAS'``, "business year begin" + ``FY5253``, ``'RE'``, "retail (aka 52-53 week) year" + ``Easter``, None, "Easter holiday" + ``BusinessHour``, ``'BH'``, "business hour" + ``CustomBusinessHour``, ``'CBH'``, "custom business hour" + ``Day``, ``'D'``, "one absolute day" + ``Hour``, ``'H'``, "one hour" + ``Minute``, ``'T'`` or ``'min'``,"one minute" + ``Second``, ``'S'``, "one second" + ``Milli``, ``'L'`` or ``'ms'``, "one millisecond" + ``Micro``, ``'U'`` or ``'us'``, "one microsecond" + ``Nano``, ``'N'``, "one nanosecond" + 
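The offset aliases in the table above and their ``DateOffset`` counterparts are interchangeable wherever a ``freq`` argument is accepted. A minimal sketch of that equivalence (assuming a pandas 0.24-era API; ``by_alias`` and ``by_offset`` are illustrative names, not part of the patch):

.. code-block:: python

    import pandas as pd

    # The alias 'M' and pd.offsets.MonthEnd() describe the same month-end
    # frequency, so both calls produce identical DatetimeIndex values.
    by_alias = pd.date_range('2018-01-01', periods=3, freq='M')
    by_offset = pd.date_range('2018-01-01', periods=3, freq=pd.offsets.MonthEnd())
    by_alias.equals(by_offset)  # True
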
+``DateOffsets`` additionally have :meth:`rollforward` and :meth:`rollback` +methods for moving a date forward or backward respectively to a valid offset +date relative to the offset. For example, business offsets will roll dates +that land on the weekends (Saturday and Sunday) forward to Monday since +business offsets operate on the weekdays. + +.. ipython:: python + + ts = pd.Timestamp('2018-01-06 00:00:00') + ts.day_name() + # BusinessHour's valid offset dates are Monday through Friday + offset = pd.offsets.BusinessHour(start='09:00') + # Bring the date to the closest offset date (Monday) + offset.rollforward(ts) + # Date is brought to the closest offset date first and then the hour is added + ts + offset + +These operations preserve time (hour, minute, etc) information by default. +To reset time to midnight, use :meth:`normalize` before or after applying +the operation (depending on whether you want the time information included +in the operation). .. ipython:: python ts = pd.Timestamp('2014-01-01 09:00') - day = Day() + day = pd.offsets.Day() day.apply(ts) day.apply(ts).normalize() ts = pd.Timestamp('2014-01-01 22:00') - hour = Hour() + hour = pd.offsets.Hour() hour.apply(ts) hour.apply(ts).normalize() hour.apply(pd.Timestamp("2014-01-01 23:30")).normalize() +.. _relativedelta documentation: https://dateutil.readthedocs.io/en/stable/relativedelta.html + .. _timeseries.dayvscalendarday: Day vs. CalendarDay @@ -968,27 +967,28 @@ particular day of the week: .. ipython:: python + d = datetime.datetime(2008, 8, 18, 9, 0) d - d + Week() - d + Week(weekday=4) - (d + Week(weekday=4)).weekday() + d + pd.offsets.Week() + d + pd.offsets.Week(weekday=4) + (d + pd.offsets.Week(weekday=4)).weekday() - d - Week() + d - pd.offsets.Week() The ``normalize`` option will be effective for addition and subtraction. .. ipython:: python - d + Week(normalize=True) - d - Week(normalize=True) + d + pd.offsets.Week(normalize=True) + d - pd.offsets.Week(normalize=True) Another example is parameterizing ``YearEnd`` with the specific ending month: .. ipython:: python - d + YearEnd() - d + YearEnd(month=6) + d + pd.offsets.YearEnd() + d + pd.offsets.YearEnd(month=6) .. _timeseries.offsetseries: @@ -1004,9 +1004,9 @@ apply the offset to each element. rng = pd.date_range('2012-01-01', '2012-01-03') s = pd.Series(rng) rng - rng + DateOffset(months=2) - s + DateOffset(months=2) - s - DateOffset(months=2) + rng + pd.DateOffset(months=2) + s + pd.DateOffset(months=2) + s - pd.DateOffset(months=2) If the offset class maps directly to a ``Timedelta`` (``Day``, ``Hour``, ``Minute``, ``Second``, ``Micro``, ``Milli``, ``Nano``) it can be @@ -1015,10 +1015,10 @@ used exactly like a ``Timedelta`` - see the .. ipython:: python - s - Day(2) + s - pd.offsets.Day(2) td = s - pd.Series(pd.date_range('2011-12-29', '2011-12-31')) td - td + Minute(15) + td + pd.offsets.Minute(15) Note that some offsets (such as ``BQuarterEnd``) do not have a vectorized implementation. They can still be used but may @@ -1027,7 +1027,7 @@ calculate significantly slower and will show a ``PerformanceWarning`` .. ipython:: python :okwarning: - rng + BQuarterEnd() + rng + pd.offsets.BQuarterEnd() .. _timeseries.custombusinessdays: @@ -1043,15 +1043,17 @@ As an interesting example, let's look at Egypt where a Friday-Saturday weekend i .. 
ipython:: python - from pandas.tseries.offsets import CustomBusinessDay weekmask_egypt = 'Sun Mon Tue Wed Thu' # They also observe International Workers' Day so let's # add that for a couple of years - holidays = ['2012-05-01', datetime(2013, 5, 1), np.datetime64('2014-05-01')] - bday_egypt = CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) - dt = datetime(2013, 4, 30) + holidays = ['2012-05-01', + datetime.datetime(2013, 5, 1), + np.datetime64('2014-05-01')] + bday_egypt = pd.offsets.CustomBusinessDay(holidays=holidays, + weekmask=weekmask_egypt) + dt = datetime.datetime(2013, 4, 30) dt + 2 * bday_egypt Let's map to the weekday names: @@ -1060,7 +1062,8 @@ Let's map to the weekday names: dts = pd.date_range(dt, periods=5, freq=bday_egypt) - pd.Series(dts.weekday, dts).map(pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) + pd.Series(dts.weekday, dts).map( + pd.Series('Mon Tue Wed Thu Fri Sat Sun'.split())) Holiday calendars can be used to provide the list of holidays. See the :ref:`holiday calendar` section for more information. @@ -1069,10 +1072,10 @@ Holiday calendars can be used to provide the list of holidays. See the from pandas.tseries.holiday import USFederalHolidayCalendar - bday_us = CustomBusinessDay(calendar=USFederalHolidayCalendar()) + bday_us = pd.offsets.CustomBusinessDay(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17) + dt = datetime.datetime(2014, 1, 17) # Tuesday after MLK Day (Monday is skipped because it's a holiday) dt + bday_us @@ -1082,15 +1085,15 @@ in the usual way. .. ipython:: python - from pandas.tseries.offsets import CustomBusinessMonthBegin - bmth_us = CustomBusinessMonthBegin(calendar=USFederalHolidayCalendar()) + bmth_us = pd.offsets.CustomBusinessMonthBegin( + calendar=USFederalHolidayCalendar()) # Skip new years - dt = datetime(2013, 12, 17) + dt = datetime.datetime(2013, 12, 17) dt + bmth_us # Define date index with custom offset - pd.DatetimeIndex(start='20100101',end='20120101',freq=bmth_us) + pd.DatetimeIndex(start='20100101', end='20120101', freq=bmth_us) .. note:: @@ -1111,13 +1114,13 @@ allowing to use specific start and end times. By default, ``BusinessHour`` uses 9:00 - 17:00 as business hours. Adding ``BusinessHour`` will increment ``Timestamp`` by hourly frequency. -If target ``Timestamp`` is out of business hours, move to the next business hour -then increment it. If the result exceeds the business hours end, the remaining +If target ``Timestamp`` is out of business hours, move to the next business hour +then increment it. If the result exceeds the business hours end, the remaining hours are added to the next business day. .. ipython:: python - bh = BusinessHour() + bh = pd.offsets.BusinessHour() bh # 2014-08-01 is Friday @@ -1134,19 +1137,19 @@ hours are added to the next business day. pd.Timestamp('2014-08-01 16:30') + bh # Adding 2 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(2) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(2) # Subtracting 3 business hours - pd.Timestamp('2014-08-01 10:00') + BusinessHour(-3) + pd.Timestamp('2014-08-01 10:00') + pd.offsets.BusinessHour(-3) -You can also specify ``start`` and ``end`` time by keywords. The argument must -be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` -instance. Specifying seconds, microseconds and nanoseconds as business hour +You can also specify ``start`` and ``end`` time by keywords. 
The argument must +be a ``str`` with an ``hour:minute`` representation or a ``datetime.time`` +instance. Specifying seconds, microseconds and nanoseconds as business hour results in ``ValueError``. .. ipython:: python - bh = BusinessHour(start='11:00', end=time(20, 0)) + bh = pd.offsets.BusinessHour(start='11:00', end=datetime.time(20, 0)) bh pd.Timestamp('2014-08-01 13:00') + bh @@ -1159,7 +1162,7 @@ Valid business hours are distinguished by whether it started from valid ``Busine .. ipython:: python - bh = BusinessHour(start='17:00', end='09:00') + bh = pd.offsets.BusinessHour(start='17:00', end='09:00') bh pd.Timestamp('2014-08-01 17:00') + bh @@ -1184,22 +1187,22 @@ under the default business hours (9:00 - 17:00), there is no gap (0 minutes) bet .. ipython:: python # This adjusts a Timestamp to business hour edge - BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) - BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollback(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02 15:00')) # It is the same as BusinessHour().apply(pd.Timestamp('2014-08-01 17:00')). # And it is the same as BusinessHour().apply(pd.Timestamp('2014-08-04 09:00')) - BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02 15:00')) # BusinessDay results (for reference) - BusinessHour().rollforward(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().rollforward(pd.Timestamp('2014-08-02')) # It is the same as BusinessDay().apply(pd.Timestamp('2014-08-01')) # The result is the same as rollworward because BusinessDay never overlap. - BusinessHour().apply(pd.Timestamp('2014-08-02')) + pd.offsets.BusinessHour().apply(pd.Timestamp('2014-08-02')) -``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary -holidays, you can use ``CustomBusinessHour`` offset, as explained in the +``BusinessHour`` regards Saturday and Sunday as holidays. To use arbitrary +holidays, you can use ``CustomBusinessHour`` offset, as explained in the following subsection. .. _timeseries.custombusinesshour: @@ -1216,9 +1219,9 @@ as ``BusinessHour`` except that it skips specified custom holidays. .. ipython:: python from pandas.tseries.holiday import USFederalHolidayCalendar - bhour_us = CustomBusinessHour(calendar=USFederalHolidayCalendar()) + bhour_us = pd.offsets.CustomBusinessHour(calendar=USFederalHolidayCalendar()) # Friday before MLK Day - dt = datetime(2014, 1, 17, 15) + dt = datetime.datetime(2014, 1, 17, 15) dt + bhour_us @@ -1229,7 +1232,8 @@ You can use keyword arguments supported by either ``BusinessHour`` and ``CustomB .. ipython:: python - bhour_mon = CustomBusinessHour(start='10:00', weekmask='Tue Wed Thu Fri') + bhour_mon = pd.offsets.CustomBusinessHour(start='10:00', + weekmask='Tue Wed Thu Fri') # Monday is skipped because it's a holiday, business hour starts from 10:00 dt + bhour_mon * 2 @@ -1285,7 +1289,7 @@ most functions: pd.date_range(start, periods=5, freq='B') - pd.date_range(start, periods=5, freq=BDay()) + pd.date_range(start, periods=5, freq=pd.offsets.BDay()) You can combine together day and intraday offsets: @@ -1352,39 +1356,39 @@ anchor point, and moved ``|n|-1`` additional steps forwards or backwards. .. 
ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=1) - pd.Timestamp('2014-01-02') + MonthEnd(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') - MonthBegin(n=1) - pd.Timestamp('2014-01-02') - MonthEnd(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-02') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-02') + MonthBegin(n=4) - pd.Timestamp('2014-01-02') - MonthBegin(n=4) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-02') - pd.offsets.MonthBegin(n=4) If the given date *is* on an anchor point, it is moved ``|n|`` points forwards or backwards. .. ipython:: python - pd.Timestamp('2014-01-01') + MonthBegin(n=1) - pd.Timestamp('2014-01-31') + MonthEnd(n=1) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') - MonthBegin(n=1) - pd.Timestamp('2014-01-31') - MonthEnd(n=1) + pd.Timestamp('2014-01-01') - pd.offsets.MonthBegin(n=1) + pd.Timestamp('2014-01-31') - pd.offsets.MonthEnd(n=1) - pd.Timestamp('2014-01-01') + MonthBegin(n=4) - pd.Timestamp('2014-01-31') - MonthBegin(n=4) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=4) + pd.Timestamp('2014-01-31') - pd.offsets.MonthBegin(n=4) For the case when ``n=0``, the date is not moved if on an anchor point, otherwise it is rolled forward to the next anchor point. .. ipython:: python - pd.Timestamp('2014-01-02') + MonthBegin(n=0) - pd.Timestamp('2014-01-02') + MonthEnd(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-02') + pd.offsets.MonthEnd(n=0) - pd.Timestamp('2014-01-01') + MonthBegin(n=0) - pd.Timestamp('2014-01-31') + MonthEnd(n=0) + pd.Timestamp('2014-01-01') + pd.offsets.MonthBegin(n=0) + pd.Timestamp('2014-01-31') + pd.offsets.MonthEnd(n=0) .. _timeseries.holiday: @@ -1427,10 +1431,13 @@ An example of how holidays and holiday calendars are defined: USMemorialDay, Holiday('July 4th', month=7, day=4, observance=nearest_workday), Holiday('Columbus Day', month=10, day=1, - offset=DateOffset(weekday=MO(2))), #same as 2*Week(weekday=2) - ] + offset=pd.DateOffset(weekday=MO(2)))] + cal = ExampleCalendar() - cal.holidays(datetime(2012, 1, 1), datetime(2012, 12, 31)) + cal.holidays(datetime.datetime(2012, 1, 1), datetime.datetime(2012, 12, 31)) + +:hint: + **weekday=MO(2)** is same as **2 * Week(weekday=2)** Using this calendar, creating an index or doing offset arithmetic skips weekends and holidays (i.e., Memorial Day/July 4th). For example, the below defines @@ -1440,14 +1447,13 @@ or ``Timestamp`` objects. .. ipython:: python - from pandas.tseries.offsets import CDay pd.DatetimeIndex(start='7/1/2012', end='7/10/2012', - freq=CDay(calendar=cal)).to_pydatetime() - offset = CustomBusinessDay(calendar=cal) - datetime(2012, 5, 25) + offset - datetime(2012, 7, 3) + offset - datetime(2012, 7, 3) + 2 * offset - datetime(2012, 7, 6) + offset + freq=pd.offsets.CDay(calendar=cal)).to_pydatetime() + offset = pd.offsets.CustomBusinessDay(calendar=cal) + datetime.datetime(2012, 5, 25) + offset + datetime.datetime(2012, 7, 3) + offset + datetime.datetime(2012, 7, 3) + 2 * offset + datetime.datetime(2012, 7, 6) + offset Ranges are defined by the ``start_date`` and ``end_date`` class attributes of ``AbstractHolidayCalendar``. The defaults are shown below. @@ -1462,8 +1468,8 @@ datetime/Timestamp/string. .. 
ipython:: python - AbstractHolidayCalendar.start_date = datetime(2012, 1, 1) - AbstractHolidayCalendar.end_date = datetime(2012, 12, 31) + AbstractHolidayCalendar.start_date = datetime.datetime(2012, 1, 1) + AbstractHolidayCalendar.end_date = datetime.datetime(2012, 12, 31) cal.holidays() Every calendar class is accessible by name using the ``get_calendar`` function @@ -1490,7 +1496,7 @@ Shifting / Lagging ~~~~~~~~~~~~~~~~~~ One may want to *shift* or *lag* the values in a time series back and forward in -time. The method for this is :meth:`~Series.shift`, which is available on all of +time. The method for this is :meth:`~Series.shift`, which is available on all of the pandas objects. .. ipython:: python @@ -1500,16 +1506,16 @@ the pandas objects. ts.shift(1) The ``shift`` method accepts an ``freq`` argument which can accept a -``DateOffset`` class or other ``timedelta``-like object or also an +``DateOffset`` class or other ``timedelta``-like object or also an :ref:`offset alias `: .. ipython:: python - ts.shift(5, freq=offsets.BDay()) + ts.shift(5, freq=pd.offsets.BDay()) ts.shift(5, freq='BM') Rather than changing the alignment of the data and the index, ``DataFrame`` and -``Series`` objects also have a :meth:`~Series.tshift` convenience method that +``Series`` objects also have a :meth:`~Series.tshift` convenience method that changes all the dates in the index by a specified number of offsets: .. ipython:: python @@ -1522,35 +1528,35 @@ is not being realigned. Frequency Conversion ~~~~~~~~~~~~~~~~~~~~ -The primary function for changing frequencies is the :meth:`~Series.asfreq` -method. For a ``DatetimeIndex``, this is basically just a thin, but convenient -wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and +The primary function for changing frequencies is the :meth:`~Series.asfreq` +method. For a ``DatetimeIndex``, this is basically just a thin, but convenient +wrapper around :meth:`~Series.reindex` which generates a ``date_range`` and calls ``reindex``. .. ipython:: python - dr = pd.date_range('1/1/2010', periods=3, freq=3 * offsets.BDay()) - ts = pd.Series(randn(3), index=dr) + dr = pd.date_range('1/1/2010', periods=3, freq=3 * pd.offsets.BDay()) + ts = pd.Series(np.random.randn(3), index=dr) ts - ts.asfreq(BDay()) + ts.asfreq(pd.offsets.BDay()) ``asfreq`` provides a further convenience so you can specify an interpolation method for any gaps that may appear after the frequency conversion. .. ipython:: python - ts.asfreq(BDay(), method='pad') + ts.asfreq(pd.offsets.BDay(), method='pad') Filling Forward / Backward ~~~~~~~~~~~~~~~~~~~~~~~~~~ -Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is +Related to ``asfreq`` and ``reindex`` is :meth:`~Series.fillna`, which is documented in the :ref:`missing data section `. Converting to Python Datetimes ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -``DatetimeIndex`` can be converted to an array of Python native +``DatetimeIndex`` can be converted to an array of Python native :py:class:`datetime.datetime` objects using the ``to_pydatetime`` method. .. _timeseries.resampling: @@ -1563,13 +1569,13 @@ Resampling The interface to ``.resample`` has changed in 0.18.0 to be more groupby-like and hence more flexible. See the :ref:`whatsnew docs ` for a comparison with prior versions. -Pandas has a simple, powerful, and efficient functionality for performing -resampling operations during frequency conversion (e.g., converting secondly -data into 5-minutely data). 
This is extremely common in, but not limited to, +Pandas has a simple, powerful, and efficient functionality for performing +resampling operations during frequency conversion (e.g., converting secondly +data into 5-minutely data). This is extremely common in, but not limited to, financial applications. -:meth:`~Series.resample` is a time-based groupby, followed by a reduction method -on each of its groups. See some :ref:`cookbook examples ` for +:meth:`~Series.resample` is a time-based groupby, followed by a reduction method +on each of its groups. See some :ref:`cookbook examples ` for some advanced strategies. Starting in version 0.18.1, the ``resample()`` function can be used directly from @@ -1577,7 +1583,7 @@ Starting in version 0.18.1, the ``resample()`` function can be used directly fro .. note:: - ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with + ``.resample()`` is similar to using a :meth:`~Series.rolling` operation with a time-based offset, see a discussion :ref:`here `. Basics @@ -1624,7 +1630,7 @@ labels. .. ipython:: python - ts.resample('5Min').mean() # by default label='left' + ts.resample('5Min').mean() # by default label='left' ts.resample('5Min', label='left').mean() @@ -1632,8 +1638,8 @@ labels. .. note:: - The default values for ``label`` and ``closed`` is 'left' for all - frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' + The default values for ``label`` and ``closed`` is 'left' for all + frequency offsets except for 'M', 'A', 'Q', 'BM', 'BA', 'BQ', and 'W' which all have a default of 'right'. .. ipython:: python @@ -1680,9 +1686,9 @@ Sparse Resampling ~~~~~~~~~~~~~~~~~ Sparse timeseries are the ones where you have a lot fewer points relative -to the amount of time you are looking to resample. Naively upsampling a sparse -series can potentially generate lots of intermediate values. When you don't want -to use a method to fill these values, e.g. ``fill_method`` is ``None``, then +to the amount of time you are looking to resample. Naively upsampling a sparse +series can potentially generate lots of intermediate values. When you don't want +to use a method to fill these values, e.g. ``fill_method`` is ``None``, then intermediate values will be filled with ``NaN``. Since ``resample`` is a time-based groupby, the following is a method to efficiently @@ -1737,7 +1743,7 @@ We can select a specific column or columns using standard getitem. r['A'].mean() - r[['A','B']].mean() + r[['A', 'B']].mean() You can pass a list or dict of functions to do aggregation with, outputting a ``DataFrame``: @@ -1758,21 +1764,21 @@ columns of a ``DataFrame``: .. ipython:: python :okexcept: - r.agg({'A' : np.sum, - 'B' : lambda x: np.std(x, ddof=1)}) + r.agg({'A': np.sum, + 'B': lambda x: np.std(x, ddof=1)}) The function names can also be strings. In order for a string to be valid it must be implemented on the resampled object: .. ipython:: python - r.agg({'A' : 'sum', 'B' : 'std'}) + r.agg({'A': 'sum', 'B': 'std'}) Furthermore, you can also specify multiple aggregation functions for each column separately. .. 
ipython:: python - r.agg({'A' : ['sum','std'], 'B' : ['mean','std'] }) + r.agg({'A': ['sum', 'std'], 'B': ['mean', 'std']}) If a ``DataFrame`` does not have a datetimelike index, but instead you want @@ -1784,9 +1790,9 @@ to resample based on datetimelike column in the frame, it can passed to the df = pd.DataFrame({'date': pd.date_range('2015-01-01', freq='W', periods=5), 'a': np.arange(5)}, index=pd.MultiIndex.from_arrays([ - [1,2,3,4,5], - pd.date_range('2015-01-01', freq='W', periods=5)], - names=['v','d'])) + [1, 2, 3, 4, 5], + pd.date_range('2015-01-01', freq='W', periods=5)], + names=['v', 'd'])) df df.resample('M', on='date').sum() @@ -1845,13 +1851,13 @@ If ``Period`` freq is daily or higher (``D``, ``H``, ``T``, ``S``, ``L``, ``U``, .. ipython:: python p = pd.Period('2014-07-01 09:00', freq='H') - p + Hour(2) - p + timedelta(minutes=120) + p + pd.offsets.Hour(2) + p + datetime.timedelta(minutes=120) p + np.timedelta64(7200, 's') .. code-block:: ipython - In [1]: p + Minute(5) + In [1]: p + pd.offsets.Minute(5) Traceback ... ValueError: Input has different freq from Period(freq=H) @@ -1861,11 +1867,11 @@ If ``Period`` has other frequencies, only the same ``offsets`` can be added. Oth .. ipython:: python p = pd.Period('2014-07', freq='M') - p + MonthEnd(3) + p + pd.offsets.MonthEnd(3) .. code-block:: ipython - In [1]: p + MonthBegin(3) + In [1]: p + pd.offsets.MonthBegin(3) Traceback ... ValueError: Input has different freq from Period(freq=M) @@ -1923,11 +1929,11 @@ objects: idx = pd.period_range('2014-07-01 09:00', periods=5, freq='H') idx - idx + Hour(2) + idx + pd.offsets.Hour(2) idx = pd.period_range('2014-07', periods=5, freq='M') idx - idx + MonthEnd(3) + idx + pd.offsets.MonthEnd(3) ``PeriodIndex`` has its own dtype named ``period``, refer to :ref:`Period Dtypes `. @@ -1977,7 +1983,7 @@ You can pass in dates and strings to ``Series`` and ``DataFrame`` with ``PeriodI ps['2011-01'] - ps[datetime(2011, 12, 25):] + ps[datetime.datetime(2011, 12, 25):] ps['10/31/2011':'12/31/2011'] @@ -1987,9 +1993,11 @@ Passing a string representing a lower frequency than ``PeriodIndex`` returns par ps['2011'] - dfp = pd.DataFrame(np.random.randn(600,1), + dfp = pd.DataFrame(np.random.randn(600, 1), columns=['A'], - index=pd.period_range('2013-01-01 9:00', periods=600, freq='T')) + index=pd.period_range('2013-01-01 9:00', + periods=600, + freq='T')) dfp dfp['2013-01-01 10H'] @@ -2178,6 +2186,8 @@ time zones by starting with ``dateutil/``. .. ipython:: python + import dateutil + # pytz rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', tz='Europe/London') @@ -2199,6 +2209,8 @@ which gives you more control over which time zone is used: .. ipython:: python + import pytz + # pytz tz_pytz = pytz.timezone('Europe/London') rng_pytz = pd.date_range('3/6/2012 00:00', periods=10, freq='D', @@ -2297,7 +2309,8 @@ To remove timezone from tz-aware ``DatetimeIndex``, use ``tz_localize(None)`` or .. ipython:: python - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2350,7 +2363,8 @@ constructor as well as ``tz_localize``. 
rng_hourly.tz_localize('US/Eastern', ambiguous=rng_hourly_dst).tolist() rng_hourly.tz_localize('US/Eastern', ambiguous='NaT').tolist() - didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', periods=10, tz='US/Eastern') + didx = pd.DatetimeIndex(start='2014-08-01 09:00', freq='H', + periods=10, tz='US/Eastern') didx didx.tz_localize(None) didx.tz_convert(None) @@ -2373,7 +2387,7 @@ can be controlled by the ``nonexistent`` argument. The following options are ava .. ipython:: python - dti = pd.date_range(start='2015-03-29 01:30:00', periods=3, freq='H') + dti = pd.date_range(start='2015-03-29 02:30:00', periods=3, freq='H') # 2:30 is a nonexistent time Localization of nonexistent times will raise an error by default. @@ -2401,14 +2415,14 @@ TZ Aware Dtypes .. ipython:: python - s_naive = pd.Series(pd.date_range('20130101',periods=3)) + s_naive = pd.Series(pd.date_range('20130101', periods=3)) s_naive ``Series/DatetimeIndex`` with a timezone **aware** value are represented with a dtype of ``datetime64[ns, tz]``. .. ipython:: python - s_aware = pd.Series(pd.date_range('20130101',periods=3,tz='US/Eastern')) + s_aware = pd.Series(pd.date_range('20130101', periods=3, tz='US/Eastern')) s_aware Both of these ``Series`` can be manipulated via the ``.dt`` accessor, see :ref:`here `. @@ -2436,22 +2450,22 @@ a convert on an aware stamp. .. note:: - Using the ``.values`` accessor on a ``Series``, returns an NumPy array of the data. + Using :meth:`Series.to_numpy` on a ``Series``, returns a NumPy array of the data. These values are converted to UTC, as NumPy does not currently support timezones (even though it is *printing* in the local timezone!). .. ipython:: python - s_naive.values - s_aware.values + s_naive.to_numpy() + s_aware.to_numpy() Further note that once converted to a NumPy array these would lose the tz tenor. .. ipython:: python - pd.Series(s_aware.values) + pd.Series(s_aware.to_numpy()) However, these can be easily converted: .. ipython:: python - pd.Series(s_aware.values).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') + pd.Series(s_aware.to_numpy()).dt.tz_localize('UTC').dt.tz_convert('US/Eastern') diff --git a/doc/source/whatsnew/v0.24.0.rst b/doc/source/whatsnew/v0.24.0.rst index 8fa1c926cba690..eb6b48f8546414 100644 --- a/doc/source/whatsnew/v0.24.0.rst +++ b/doc/source/whatsnew/v0.24.0.rst @@ -24,8 +24,53 @@ New features the user to override the engine's default behavior to include or omit the dataframe's indexes from the resulting Parquet file. (:issue:`20768`) - :meth:`DataFrame.corr` and :meth:`Series.corr` now accept a callable for generic calculation methods of correlation, e.g. histogram intersection (:issue:`22684`) -- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing -the user to specify which decimal separator should be used in the output. (:issue:`23614`) +- :func:`DataFrame.to_string` now accepts ``decimal`` as an argument, allowing the user to specify which decimal separator should be used in the output. (:issue:`23614`) + +.. _whatsnew_0240.values_api: + +Accessing the values in a Series or Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +:attr:`Series.array` and :attr:`Index.array` have been added for extracting the array backing a +``Series`` or ``Index``. + +.. 
ipython:: python + + idx = pd.period_range('2000', periods=4) + idx.array + pd.Series(idx).array + +Historically, this would have been done with ``series.values``, but with +``.values`` it was unclear whether the returned value would be the actual array, +some transformation of it, or one of pandas custom arrays (like +``Categorical``). For example, with :class:`PeriodIndex`, ``.values`` generates +a new ndarray of period objects each time. + +.. ipython:: python + + id(idx.values) + id(idx.values) + +If you need an actual NumPy array, use :meth:`Series.to_numpy` or :meth:`Index.to_numpy`. + +.. ipython:: python + + idx.to_numpy() + pd.Series(idx).to_numpy() + +For Series and Indexes backed by normal NumPy arrays, this will be the same thing (and the same +as ``.values``). + +.. ipython:: python + + ser = pd.Series([1, 2, 3]) + ser.array + ser.to_numpy() + +We haven't removed or deprecated :attr:`Series.values` or :attr:`DataFrame.values`, but we +recommend and using ``.array`` or ``.to_numpy()`` instead. + +See :ref:`basics.dtypes` and :ref:`dsintro.attrs` for more. .. _whatsnew_0240.enhancements.extension_array_operators: @@ -184,6 +229,30 @@ array, but rather an ``ExtensionArray``: This is the same behavior as ``Series.values`` for categorical data. See :ref:`whatsnew_0240.api_breaking.interval_values` for more. + +.. _whatsnew_0240.enhancements.styler_pipe: + +New ``Styler.pipe()`` method +^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +The :class:`~pandas.io.formats.style.Styler` class has gained a +:meth:`~pandas.io.formats.style.Styler.pipe` method (:issue:`23229`). This provides a +convenient way to apply users' predefined styling functions, and can help reduce +"boilerplate" when using DataFrame styling functionality repeatedly within a notebook. + +.. ipython:: python + + df = pandas.DataFrame({'N': [1250, 1500, 1750], 'X': [0.25, 0.35, 0.50]}) + + def format_and_align(styler): + return (styler.format({'N': '{:,}', 'X': '{:.1%}'}) + .set_properties(**{'text-align': 'right'})) + + df.style.pipe(format_and_align).set_caption('Summary of results.') + +Similar methods already exist for other classes in pandas, including :meth:`DataFrame.pipe`, +:meth:`Groupby.pipe`, and :meth:`Resampler.pipe`. + + .. _whatsnew_0240.enhancements.join_with_two_multiindexes: Joining with two multi-indexes @@ -225,6 +294,7 @@ For earlier versions this can be done using the following. pd.merge(left.reset_index(), right.reset_index(), on=['key'], how='inner').set_index(['key', 'X', 'Y']) + .. _whatsnew_0240.enhancements.rename_axis: Renaming names in a MultiIndex @@ -248,6 +318,7 @@ Example: See the :ref:`advanced docs on renaming` for more details. + .. _whatsnew_0240.enhancements.other: Other Enhancements @@ -294,6 +365,7 @@ Other Enhancements - :meth:`MultiIndex.to_flat_index` has been added to flatten multiple levels into a single-level :class:`Index` object. - :meth:`DataFrame.to_stata` and :class:` pandas.io.stata.StataWriter117` can write mixed sting columns to Stata strl format (:issue:`23633`) - :meth:`DataFrame.between_time` and :meth:`DataFrame.at_time` have gained the an ``axis`` parameter (:issue: `8839`) +- :class:`IntervalIndex` has gained the :attr:`~IntervalIndex.is_overlapping` attribute to indicate if the ``IntervalIndex`` contains any overlapping intervals (:issue:`23309`) .. 
_whatsnew_0240.api_breaking: @@ -303,6 +375,8 @@ Backwards incompatible API changes - A newly constructed empty :class:`DataFrame` with integer as the ``dtype`` will now only be cast to ``float64`` if ``index`` is specified (:issue:`22858`) - :meth:`Series.str.cat` will now raise if `others` is a `set` (:issue:`23009`) - Passing scalar values to :class:`DatetimeIndex` or :class:`TimedeltaIndex` will now raise ``TypeError`` instead of ``ValueError`` (:issue:`23539`) +- ``max_rows`` and ``max_cols`` parameters removed from :class:`HTMLFormatter` since truncation is handled by :class:`DataFrameFormatter` (:issue:`23818`) +- :meth:`read_csv` will now raise a ``ValueError`` if a column with missing values is declared as having dtype ``bool`` (:issue:`20591`) .. _whatsnew_0240.api_breaking.deps: @@ -1017,6 +1091,7 @@ Other API Changes - :meth:`Categorical.searchsorted` now raises a ``KeyError`` rather that a ``ValueError``, if a searched for key is not found in its categories (:issue:`23466`). - :meth:`Index.hasnans` and :meth:`Series.hasnans` now always return a python boolean. Previously, a python or a numpy boolean could be returned, depending on circumstances (:issue:`23294`). - The order of the arguments of :func:`DataFrame.to_html` and :func:`DataFrame.to_string` is rearranged to be consistent with each other. (:issue:`23614`) +- :meth:`CategoricalIndex.reindex` now raises a ``ValueError`` if the target index is non-unique and not equal to the current index. It previously only raised if the target index was not of a categorical dtype (:issue:`23963`). .. _whatsnew_0240.deprecations: @@ -1043,9 +1118,13 @@ Deprecations `use_threads` to reflect the changes in pyarrow 0.11.0. (:issue:`23053`) - :func:`pandas.read_excel` has deprecated accepting ``usecols`` as an integer. Please pass in a list of ints from 0 to ``usecols`` inclusive instead (:issue:`23527`) - Constructing a :class:`TimedeltaIndex` from data with ``datetime64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23539`) +- Constructing a :class:`DatetimeIndex` from data with ``timedelta64``-dtyped data is deprecated, will raise ``TypeError`` in a future version (:issue:`23675`) - The ``keep_tz=False`` option (the default) of the ``keep_tz`` keyword of :meth:`DatetimeIndex.to_series` is deprecated (:issue:`17832`). - Timezone converting a tz-aware ``datetime.datetime`` or :class:`Timestamp` with :class:`Timestamp` and the ``tz`` argument is now deprecated. Instead, use :meth:`Timestamp.tz_convert` (:issue:`23579`) +- :func:`pandas.types.is_period` is deprecated in favor of `pandas.types.is_period_dtype` (:issue:`23917`) +- :func:`pandas.types.is_datetimetz` is deprecated in favor of `pandas.types.is_datetime64tz` (:issue:`23917`) +- Creating a :class:`TimedeltaIndex` or :class:`DatetimeIndex` by passing range arguments `start`, `end`, and `periods` is deprecated in favor of :func:`timedelta_range` and :func:`date_range` (:issue:`23919`) .. _whatsnew_0240.deprecations.datetimelike_int_ops: @@ -1136,6 +1215,7 @@ Performance Improvements The speed increase is both when indexing by label (using .loc) and position(.iloc) (:issue:`20395`) Slicing a monotonically increasing :class:`CategoricalIndex` itself (i.e. 
``ci[1000:2000]``) shows similar speed improvements as above (:issue:`21659`) +- Improved performance of :meth:`CategoricalIndex.equals` when comparing to another :class:`CategoricalIndex` (:issue:`24023`) - Improved performance of :func:`Series.describe` in case of numeric dtpyes (:issue:`21274`) - Improved performance of :func:`pandas.core.groupby.GroupBy.rank` when dealing with tied rankings (:issue:`21237`) - Improved performance of :func:`DataFrame.set_index` with columns consisting of :class:`Period` objects (:issue:`21582`, :issue:`21606`) @@ -1151,7 +1231,7 @@ Performance Improvements - Improved performance of :func:`pd.concat` for `Series` objects (:issue:`23404`) - Improved performance of :meth:`DatetimeIndex.normalize` and :meth:`Timestamp.normalize` for timezone naive or UTC datetimes (:issue:`23634`) - Improved performance of :meth:`DatetimeIndex.tz_localize` and various ``DatetimeIndex`` attributes with dateutil UTC timezone (:issue:`23772`) - +- Improved performance of :class:`Categorical` constructor for `Series` objects (:issue:`23814`) .. _whatsnew_0240.docs: @@ -1177,6 +1257,7 @@ Categorical - Bug in :meth:`Categorical.take` with a user-provided ``fill_value`` not encoding the ``fill_value``, which could result in a ``ValueError``, incorrect results, or a segmentation fault (:issue:`23296`). - In meth:`Series.unstack`, specifying a ``fill_value`` not present in the categories now raises a ``TypeError`` rather than ignoring the ``fill_value`` (:issue:`23284`) - Bug when resampling :meth:`Dataframe.resample()` and aggregating on categorical data, the categorical dtype was getting lost. (:issue:`23227`) +- Bug in many methods of the ``.str``-accessor, which always failed on calling the ``CategoricalIndex.str`` constructor (:issue:`23555`, :issue:`23556`) Datetimelike ^^^^^^^^^^^^ @@ -1225,7 +1306,7 @@ Timedelta - Bug in :class:`TimedeltaIndex` where adding a timezone-aware datetime scalar incorrectly returned a timezone-naive :class:`DatetimeIndex` (:issue:`23215`) - Bug in :class:`TimedeltaIndex` where adding ``np.timedelta64('NaT')`` incorrectly returned an all-`NaT` :class:`DatetimeIndex` instead of an all-`NaT` :class:`TimedeltaIndex` (:issue:`23215`) - Bug in :class:`Timedelta` and :func:`to_timedelta()` have inconsistencies in supported unit string (:issue:`21762`) - +- Bug in :class:`TimedeltaIndex` division where dividing by another :class:`TimedeltaIndex` raised ``TypeError`` instead of returning a :class:`Float64Index` (:issue:`23829`, :issue:`22631`) Timezones ^^^^^^^^^ @@ -1279,6 +1360,7 @@ Numeric - Bug in :meth:`Series.rpow` with object dtype ``NaN`` for ``1 ** NA`` instead of ``1`` (:issue:`22922`). - :meth:`Series.agg` can now handle numpy NaN-aware methods like :func:`numpy.nansum` (:issue:`19629`) - Bug in :meth:`Series.rank` and :meth:`DataFrame.rank` when ``pct=True`` and more than 2:sup:`24` rows are present resulted in percentages greater than 1.0 (:issue:`18271`) +- Calls such as :meth:`DataFrame.round` with a non-unique :meth:`CategoricalIndex` now return expected data. Previously, data would be improperly duplicated (:issue:`21809`). Strings ^^^^^^^ @@ -1286,6 +1368,7 @@ Strings - Bug in :meth:`Index.str.partition` was not nan-safe (:issue:`23558`). - Bug in :meth:`Index.str.split` was not nan-safe (:issue:`23677`). 
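A quick illustration of the nan-safe ``.str`` behaviour described in the two entries above (this snippet is not part of the diff; it is a minimal sketch assuming a pandas build that already contains these fixes):

.. code-block:: python

   import numpy as np
   import pandas as pd

   idx = pd.Index(['a_b', np.nan, 'c_d'])
   # With the fixes, the missing value should be propagated through the
   # result instead of raising an error.
   idx.str.split('_')
   idx.str.partition('_')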
- Bug :func:`Series.str.contains` not respecting the ``na`` argument for a ``Categorical`` dtype ``Series`` (:issue:`22158`) +- Bug in :meth:`Index.str.cat` when the result contained only ``NaN`` (:issue:`24044`) Interval ^^^^^^^^ @@ -1318,6 +1401,7 @@ Indexing - Bug where setting a timedelta column by ``Index`` causes it to be casted to double, and therefore lose precision (:issue:`23511`) - Bug in :func:`Index.union` and :func:`Index.intersection` where name of the ``Index`` of the result was not computed correctly for certain cases (:issue:`9943`, :issue:`9862`) - Bug in :class:`Index` slicing with boolean :class:`Index` may raise ``TypeError`` (:issue:`22533`) +- Bug in ``PeriodArray.__setitem__`` when accepting slice and list-like value (:issue:`23978`) Missing ^^^^^^^ @@ -1338,6 +1422,7 @@ MultiIndex I/O ^^^ + .. _whatsnew_0240.bug_fixes.nan_with_str_dtype: Proper handling of `np.NaN` in a string data-typed column with the Python engine @@ -1373,6 +1458,7 @@ Current Behavior: Notice how we now instead output ``np.nan`` itself instead of a stringified form of it. +- Bug in :func:`read_csv` in which a column specified with ``CategoricalDtype`` of boolean categories was not being correctly coerced from string values to booleans (:issue:`20498`) - Bug in :meth:`to_sql` when writing timezone aware data (``datetime64[ns, tz]`` dtype) would raise a ``TypeError`` (:issue:`9086`) - Bug in :meth:`to_sql` where a naive DatetimeIndex would be written as ``TIMESTAMP WITH TIMEZONE`` type in supported databases, e.g. PostgreSQL (:issue:`23510`) - Bug in :meth:`read_excel()` when ``parse_cols`` is specified with an empty dataset (:issue:`9208`) @@ -1397,8 +1483,10 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :meth:`read_csv()` in which unnecessary warnings were being raised when the dialect's values conflicted with the default arguments (:issue:`23761`) - Bug in :meth:`read_html()` in which the error message was not displaying the valid flavors when an invalid one was provided (:issue:`23549`) - Bug in :meth:`read_excel()` in which extraneous header names were extracted, even though none were specified (:issue:`11733`) -- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`20480`) +- Bug in :meth:`read_excel()` in which column names were not being properly converted to string sometimes in Python 2.x (:issue:`23874`) +- Bug in :meth:`read_excel()` in which ``index_col=None`` was not being respected and parsing index columns anyway (:issue:`18792`, :issue:`20480`) - Bug in :meth:`read_excel()` in which ``usecols`` was not being validated for proper column names when passed in as a string (:issue:`20480`) +- Bug in :meth:`DataFrame.to_dict` when the resulting dict contains non-Python scalars in the case of numeric data (:issue:`23753`) - :func:`DataFrame.to_string()`, :func:`DataFrame.to_html()`, :func:`DataFrame.to_latex()` will correctly format output when a string is passed as the ``float_format`` argument (:issue:`21625`, :issue:`22270`) Plotting @@ -1425,6 +1513,7 @@ Groupby/Resample/Rolling - Bug in :meth:`DataFrame.expanding` in which the ``axis`` argument was not being respected during aggregations (:issue:`23372`) - Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.transform` which caused missing values when the input function can accept a :class:`DataFrame` but renames it (:issue:`23455`). 
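The ``DataFrameGroupBy.rank`` fix noted just below (``method='dense'`` with ``pct=True`` on a single-member group) can be sketched with a small example; this is illustrative only, not part of the diff, and assumes a pandas build containing the fix:

.. code-block:: python

   import pandas as pd

   df = pd.DataFrame({'key': ['a', 'a', 'b'], 'val': [1, 2, 3]})
   # Group 'b' has only one member; this previously raised ZeroDivisionError
   # and should now return a well-defined percentage rank (1.0) for that row.
   df.groupby('key')['val'].rank(method='dense', pct=True)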
- Bug in :func:`pandas.core.groupby.GroupBy.nth` where column order was not always preserved (:issue:`20760`) +- Bug in :meth:`pandas.core.groupby.DataFrameGroupBy.rank` with ``method='dense'`` and ``pct=True`` when a group has only one member would raise a ``ZeroDivisionError`` (:issue:`23666`). Reshaping ^^^^^^^^^ @@ -1450,6 +1539,7 @@ Reshaping - Bug in :func:`pandas.melt` when passing column names that are not present in ``DataFrame`` (:issue:`23575`) - Bug in :meth:`DataFrame.append` with a :class:`Series` with a dateutil timezone would raise a ``TypeError`` (:issue:`23682`) - Bug in ``Series`` construction when passing no data and ``dtype=str`` (:issue:`22477`) +- Bug in :func:`cut` with ``bins`` as an overlapping ``IntervalIndex`` where multiple bins were returned per item instead of raising a ``ValueError`` (:issue:`23980`) - Bug in :meth:`DataFrame.join` when joining on partial MultiIndex would drop names (:issue:`20452`). .. _whatsnew_0240.bug_fixes.sparse: @@ -1466,6 +1556,7 @@ Sparse - Bug in unary inversion operator (``~``) on a ``SparseSeries`` with boolean values. The performance of this has also been improved (:issue:`22835`) - Bug in :meth:`SparseArary.unique` not returning the unique values (:issue:`19595`) - Bug in :meth:`SparseArray.nonzero` and :meth:`SparseDataFrame.dropna` returning shifted/incorrect results (:issue:`21172`) +- Bug in :meth:`DataFrame.apply` where dtypes would lose sparseness (:issue:`23744`) Build Changes ^^^^^^^^^^^^^ @@ -1492,4 +1583,3 @@ Contributors ~~~~~~~~~~~~ .. contributors:: v0.23.4..HEAD - diff --git a/environment.yml b/environment.yml index fc35f1290f1b13..e31511e5b8afe3 100644 --- a/environment.yml +++ b/environment.yml @@ -4,22 +4,22 @@ channels: - conda-forge dependencies: # required - - NumPy + - numpy>=1.15 - python=3 - python-dateutil>=2.5.0 - pytz # development - - Cython>=0.28.2 + - asv + - cython>=0.28.2 - flake8 - flake8-comprehensions - - flake8-rst=0.4.2 + - flake8-rst>=0.6.0 - gitpython - - hypothesis>=3.58.0 + - hypothesis>=3.82 - isort - moto - - pytest>=3.6 - - setuptools>=24.2.0 + - pytest>=4.0 - sphinx - sphinxcontrib-spelling @@ -28,7 +28,6 @@ dependencies: - blosc - bottleneck>=1.2.0 - fastparquet>=0.1.2 - - gcsfs - html5lib - ipython>=5.6.0 - ipykernel @@ -36,15 +35,13 @@ dependencies: - lxml - matplotlib>=2.0.0 - nbsphinx - - numexpr>=2.6.1 + - numexpr>=2.6.8 - openpyxl - pyarrow>=0.7.0 - - pymysql - pytables>=3.4.2 - pytest-cov - pytest-xdist - - s3fs - - scipy>=0.18.1 + - scipy>=1.1 - seaborn - sqlalchemy - statsmodels @@ -52,3 +49,5 @@ dependencies: - xlrd - xlsxwriter - xlwt + - pip: + - cpplint diff --git a/pandas/_libs/algos_rank_helper.pxi.in b/pandas/_libs/algos_rank_helper.pxi.in index 5ffc6dd5780235..5dac94394c7ed0 100644 --- a/pandas/_libs/algos_rank_helper.pxi.in +++ b/pandas/_libs/algos_rank_helper.pxi.in @@ -102,15 +102,7 @@ def rank_1d_{{dtype}}(object in_arr, ties_method='average', ranks = np.empty(n, dtype='f8') {{if dtype == 'object'}} - - try: - _as = np.lexsort(keys=order) - except TypeError: - # lexsort on object array will raise TypeError for numpy version - # earlier than 1.11.0. Use argsort with order argument instead. 
- _dt = [('values', 'O'), ('mask', '?')] - _values = np.asarray(list(zip(order[0], order[1])), dtype=_dt) - _as = np.argsort(_values, kind='mergesort', order=('mask', 'values')) + _as = np.lexsort(keys=order) {{else}} if tiebreak == TIEBREAK_FIRST: # need to use a stable sort here diff --git a/pandas/_libs/groupby_helper.pxi.in b/pandas/_libs/groupby_helper.pxi.in index 523d43f893aadd..abac9f147848e7 100644 --- a/pandas/_libs/groupby_helper.pxi.in +++ b/pandas/_libs/groupby_helper.pxi.in @@ -587,7 +587,7 @@ def group_rank_{{name}}(ndarray[float64_t, ndim=2] out, # rankings, so we assign them percentages of NaN. if out[i, 0] != out[i, 0] or out[i, 0] == NAN: out[i, 0] = NAN - else: + elif grp_sizes[i, 0] != 0: out[i, 0] = out[i, 0] / grp_sizes[i, 0] {{endif}} {{endfor}} diff --git a/pandas/_libs/hashtable_class_helper.pxi.in b/pandas/_libs/hashtable_class_helper.pxi.in index a71023ed34f44d..7f4c2a6410870d 100644 --- a/pandas/_libs/hashtable_class_helper.pxi.in +++ b/pandas/_libs/hashtable_class_helper.pxi.in @@ -356,11 +356,12 @@ cdef class {{name}}HashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -374,13 +375,22 @@ cdef class {{name}}HashTable(HashTable): Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then - any value satisfying val!=val are considered missing. + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[{{dtype}}] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -392,7 +402,8 @@ cdef class {{name}}HashTable(HashTable): {{name}}VectorData *ud bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) ud = uniques.data use_na_value = na_value is not None @@ -410,20 +421,19 @@ cdef class {{name}}HashTable(HashTable): for i in range(n): val = values[i] - if val != val or (use_na_value and val == na_value2): + if ignore_na and (val != val + or (use_na_value and val == na_value2)): + # if missing values do not count as unique values (i.e. 
if + # ignore_na is True), skip the hashtable entry for them, + # and replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_{{dtype}}(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_{{dtype}}(self.table, val, &ret) - self.table.vals[k] = count if needs_resize(ud): with gil: @@ -433,23 +443,82 @@ cdef class {{name}}HashTable(HashTable): "Vector.resize() needed") uniques.resize() append_data_{{dtype}}(ud, val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() - return np.asarray(labels) + def unique(self, const {{dtype}}_t[:] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = {{name}}Vector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, const {{dtype}}_t[:] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = {{name}}Vector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[{{dtype}}] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. 
+ + Returns + ------- + uniques : ndarray[{{dtype}}] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = {{name}}Vector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, const {{dtype}}_t[:] values, {{name}}Vector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels @cython.boundscheck(False) def get_labels_groupby(self, const {{dtype}}_t[:] values): @@ -496,44 +565,6 @@ cdef class {{name}}HashTable(HashTable): return np.asarray(labels), arr_uniques - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, const {{dtype}}_t[:] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[{{dtype}}] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[{{dtype}}] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - {{dtype}}_t val - khiter_t k - {{name}}Vector uniques = {{name}}Vector() - {{name}}VectorData *ud - - ud = uniques.data - - with nogil: - for i in range(n): - val = values[i] - k = kh_get_{{dtype}}(self.table, val) - if k == self.table.n_buckets: - kh_put_{{dtype}}(self.table, val, &ret) - if needs_resize(ud): - with gil: - uniques.resize() - append_data_{{dtype}}(ud, val) - return uniques.to_array() - {{endfor}} @@ -613,56 +644,6 @@ cdef class StringHashTable(HashTable): free(vecs) return labels - @cython.boundscheck(False) - @cython.wraparound(False) - def unique(self, ndarray[object] values): - """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, count, n = len(values) - int64_t[:] uindexer - int ret = 0 - object val - ObjectVector uniques - khiter_t k - const char *v - const char **vecs - - vecs = malloc(n * sizeof(char *)) - uindexer = np.empty(n, dtype=np.int64) - for i in range(n): - val = values[i] - v = util.get_c_string(val) - vecs[i] = v - - count = 0 - with nogil: - for i in range(n): - v = vecs[i] - k = kh_get_str(self.table, v) - if k == self.table.n_buckets: - kh_put_str(self.table, v, &ret) - uindexer[count] = i - count += 1 - free(vecs) - - # uniques - uniques = ObjectVector() - for i in range(count): - uniques.append(values[uindexer[i]]) - return uniques.to_array() - @cython.boundscheck(False) def lookup(self, ndarray[object] values): cdef: @@ -726,11 +707,12 @@ cdef class StringHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) 
Parameters ---------- @@ -743,13 +725,23 @@ cdef class StringHashTable(HashTable): na_sentinel : Py_ssize_t, default -1 Sentinel value used for all NA-values in inverse na_value : object, default None - Value to identify as missing + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -763,41 +755,50 @@ cdef class StringHashTable(HashTable): khiter_t k bint use_na_value - labels = np.zeros(n, dtype=np.int64) + if return_inverse: + labels = np.zeros(n, dtype=np.int64) uindexer = np.empty(n, dtype=np.int64) use_na_value = na_value is not None - # assign pointers and pre-filter out missing + # assign pointers and pre-filter out missing (if ignore_na) vecs = malloc(n * sizeof(char *)) for i in range(n): val = values[i] - if (isinstance(val, (str, unicode)) - and not (use_na_value and val == na_value)): + if (ignore_na + and (not isinstance(val, (str, unicode)) + or (use_na_value and val == na_value))): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), we can skip the actual value, and + # replace the label with na_sentinel directly + labels[i] = na_sentinel + else: + # if ignore_na is False, we also stringify NaN/None/etc. v = util.get_c_string(val) vecs[i] = v - else: - labels[i] = na_sentinel # compute with nogil: for i in range(n): - if labels[i] == na_sentinel: + if ignore_na and labels[i] == na_sentinel: + # skip entries for ignored missing values (see above) continue v = vecs[i] k = kh_get_str(self.table, v) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_str(self.table, v, &ret) - self.table.vals[k] = count uindexer[count] = i - labels[i] = count + if return_inverse: + self.table.vals[k] = count + labels[i] = count count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx free(vecs) @@ -805,20 +806,72 @@ cdef class StringHashTable(HashTable): for i in range(count): uniques.append(values[uindexer[i]]) - return np.asarray(labels) + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. 
+ + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. If na_value is None, then any value + that is not a string is considered missing. If na_value is + not None, then _additionally_ any value "val" satisfying + val == na_value is considered missing. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels cdef class PyObjectHashTable(HashTable): @@ -908,44 +961,12 @@ cdef class PyObjectHashTable(HashTable): @cython.boundscheck(False) @cython.wraparound(False) - def unique(self, ndarray[object] values): + def _unique(self, ndarray[object] values, ObjectVector uniques, + Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, + object na_value=None, bint ignore_na=False, + bint return_inverse=False): """ - Calculate unique values without sorting - - Parameters - ---------- - values : ndarray[object] - Array of values of which unique will be calculated - - Returns - ------- - uniques : ndarray[object] - Unique values of input, not sorted - """ - cdef: - Py_ssize_t i, n = len(values) - int ret = 0 - object val - khiter_t k - ObjectVector uniques = ObjectVector() - - for i in range(n): - val = values[i] - hash(val) - k = kh_get_pymap(self.table, val) - if k == self.table.n_buckets: - kh_put_pymap(self.table, val, &ret) - uniques.append(val) - - return uniques.to_array() - - @cython.boundscheck(False) - @cython.wraparound(False) - def _factorize(self, ndarray[object] values, ObjectVector uniques, - Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, - object na_value=None): - """ - Calculate unique values and labels (no sorting); ignores all NA-values + Calculate unique values and labels (no sorting!) Parameters ---------- @@ -959,13 +980,22 @@ cdef class PyObjectHashTable(HashTable): Sentinel value used for all NA-values in inverse na_value : object, default None Value to identify as missing. If na_value is None, then None _plus_ - any value satisfying val!=val are considered missing. 
+ any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + ignore_na : boolean, default False + Whether NA-values should be ignored for calculating the uniques. If + True, the labels corresponding to missing values will be set to + na_sentinel. + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. Returns ------- uniques : ndarray[object] Unique values of input, not sorted - labels : ndarray[int64] + labels : ndarray[int64] (if return_inverse=True) The labels from values to uniques """ cdef: @@ -976,42 +1006,100 @@ cdef class PyObjectHashTable(HashTable): khiter_t k bint use_na_value - labels = np.empty(n, dtype=np.int64) + if return_inverse: + labels = np.empty(n, dtype=np.int64) use_na_value = na_value is not None for i in range(n): val = values[i] hash(val) - if ((val != val or val is None) - or (use_na_value and val == na_value)): + if ignore_na and ((val != val or val is None) + or (use_na_value and val == na_value)): + # if missing values do not count as unique values (i.e. if + # ignore_na is True), skip the hashtable entry for them, and + # replace the corresponding label with na_sentinel labels[i] = na_sentinel continue k = kh_get_pymap(self.table, val) - if k != self.table.n_buckets: - # k falls into a previous bucket - idx = self.table.vals[k] - labels[i] = idx - else: + if k == self.table.n_buckets: # k hasn't been seen yet k = kh_put_pymap(self.table, val, &ret) - self.table.vals[k] = count uniques.append(val) - labels[i] = count - count += 1 + if return_inverse: + self.table.vals[k] = count + labels[i] = count + count += 1 + elif return_inverse: + # k falls into a previous bucket + # only relevant in case we need to construct the inverse + idx = self.table.vals[k] + labels[i] = idx + + if return_inverse: + return uniques.to_array(), np.asarray(labels) + return uniques.to_array() + + def unique(self, ndarray[object] values, bint return_inverse=False): + """ + Calculate unique values and labels (no sorting!) - return np.asarray(labels) + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + return_inverse : boolean, default False + Whether the mapping of the original array values to their location + in the vector of uniques should be returned. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] (if return_inverse) + The labels from values to uniques + """ + uniques = ObjectVector() + return self._unique(values, uniques, ignore_na=False, + return_inverse=return_inverse) def factorize(self, ndarray[object] values, Py_ssize_t na_sentinel=-1, object na_value=None): - uniques = ObjectVector() - labels = self._factorize(values, uniques=uniques, - na_sentinel=na_sentinel, na_value=na_value) - return labels, uniques.to_array() + """ + Calculate unique values and labels (no sorting!) + + Missing values are not included in the "uniques" for this method. + The labels for any missing values will be set to "na_sentinel" + + Parameters + ---------- + values : ndarray[object] + Array of values of which unique will be calculated + na_sentinel : Py_ssize_t, default -1 + Sentinel value used for all NA-values in inverse + na_value : object, default None + Value to identify as missing. 
If na_value is None, then None _plus_ + any value "val" satisfying val != val is considered missing. + If na_value is not None, then _additionally_, any value "val" + satisfying val == na_value is considered missing. + + Returns + ------- + uniques : ndarray[object] + Unique values of input, not sorted + labels : ndarray[int64] + The labels from values to uniques + """ + uniques_vector = ObjectVector() + return self._unique(values, uniques_vector, na_sentinel=na_sentinel, + na_value=na_value, ignore_na=True, + return_inverse=True) def get_labels(self, ndarray[object] values, ObjectVector uniques, Py_ssize_t count_prior=0, Py_ssize_t na_sentinel=-1, object na_value=None): - return self._factorize(values, uniques, count_prior=count_prior, - na_sentinel=na_sentinel, na_value=na_value) + _, labels = self._unique(values, uniques, count_prior=count_prior, + na_sentinel=na_sentinel, na_value=na_value, + ignore_na=True, return_inverse=True) + return labels diff --git a/pandas/_libs/intervaltree.pxi.in b/pandas/_libs/intervaltree.pxi.in index 7be3bdbc1048a8..fb6f30c030f113 100644 --- a/pandas/_libs/intervaltree.pxi.in +++ b/pandas/_libs/intervaltree.pxi.in @@ -26,7 +26,7 @@ cdef class IntervalTree(IntervalMixin): cdef: readonly object left, right, root, dtype readonly str closed - object _left_sorter, _right_sorter + object _is_overlapping, _left_sorter, _right_sorter def __init__(self, left, right, closed='right', leaf_size=100): """ @@ -81,6 +81,26 @@ cdef class IntervalTree(IntervalMixin): self._right_sorter = np.argsort(self.right) return self._right_sorter + @property + def is_overlapping(self): + """ + Determine if the IntervalTree contains overlapping intervals. + Cached as self._is_overlapping. + """ + if self._is_overlapping is not None: + return self._is_overlapping + + # <= when both sides closed since endpoints can overlap + op = le if self.closed == 'both' else lt + + # overlap if start of current interval < end of previous interval + # (current and previous in terms of sorted order by left/start side) + current = self.left[self.left_sorter[1:]] + previous = self.right[self.left_sorter[:-1]] + self._is_overlapping = bool(op(current, previous).any()) + + return self._is_overlapping + def get_loc(self, scalar_t key): """Return all positions corresponding to intervals that overlap with the given scalar key diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index ad538ff103c2f3..0c081986d83c55 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -518,9 +518,7 @@ def astype_intsafe(ndarray[object] arr, new_dtype): bint is_datelike ndarray result - # on 32-bit, 1.6.2 numpy M8[ns] is a subdtype of integer, which is weird - is_datelike = new_dtype in ['M8[ns]', 'm8[ns]'] - + is_datelike = new_dtype == 'm8[ns]' result = np.empty(n, dtype=new_dtype) for i in range(n): val = arr[i] @@ -1250,25 +1248,19 @@ def infer_dtype(value: object, skipna: bool=False) -> str: if util.is_datetime64_object(val): if is_datetime64_array(values): return 'datetime64' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' elif is_timedelta(val): if is_timedelta_or_timedelta64_array(values): return 'timedelta' elif util.is_integer_object(val): - # a timedelta will show true here as well - if is_timedelta(val): - if is_timedelta_or_timedelta64_array(values): - return 'timedelta' + # ordering matters here; this check must come after the is_timedelta + # check otherwise numpy timedelta64 objects would come through here if is_integer_array(values): return 'integer' elif 
is_integer_float_array(values): return 'mixed-integer-float' - elif is_timedelta_or_timedelta64_array(values): - return 'timedelta' return 'mixed-integer' elif PyDateTime_Check(val): @@ -1701,27 +1693,6 @@ cdef class TimedeltaValidator(TemporalValidator): return is_null_timedelta64(value) -# TODO: Not used outside of tests; remove? -def is_timedelta_array(values: ndarray) -> bool: - cdef: - TimedeltaValidator validator = TimedeltaValidator(len(values), - skipna=True) - return validator.validate(values) - - -cdef class Timedelta64Validator(TimedeltaValidator): - cdef inline bint is_value_typed(self, object value) except -1: - return util.is_timedelta64_object(value) - - -# TODO: Not used outside of tests; remove? -def is_timedelta64_array(values: ndarray) -> bool: - cdef: - Timedelta64Validator validator = Timedelta64Validator(len(values), - skipna=True) - return validator.validate(values) - - cdef class AnyTimedeltaValidator(TimedeltaValidator): cdef inline bint is_value_typed(self, object value) except -1: return is_timedelta(value) diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index f74de79542628d..a459057555cf3a 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -1202,7 +1202,20 @@ cdef class TextReader: bint user_dtype, kh_str_t *na_hashset, object na_flist): - if is_integer_dtype(dtype): + if is_categorical_dtype(dtype): + # TODO: I suspect that _categorical_convert could be + # optimized when dtype is an instance of CategoricalDtype + codes, cats, na_count = _categorical_convert( + self.parser, i, start, end, na_filter, + na_hashset, self.c_encoding) + + # Method accepts list of strings, not encoded ones. + true_values = [x.decode() for x in self.true_values] + cat = Categorical._from_inferred_categories( + cats, codes, dtype, true_values=true_values) + return cat, na_count + + elif is_integer_dtype(dtype): try: result, na_count = _try_int64(self.parser, i, start, end, na_filter, na_hashset) @@ -1232,7 +1245,12 @@ cdef class TextReader: result, na_count = _try_bool_flex(self.parser, i, start, end, na_filter, na_hashset, self.true_set, self.false_set) + if user_dtype and na_count is not None: + if na_count > 0: + raise ValueError("Bool column has NA values in " + "column {column}".format(column=i)) return result, na_count + elif dtype.kind == 'S': # TODO: na handling width = dtype.itemsize @@ -1252,15 +1270,6 @@ cdef class TextReader: # unicode variable width return self._string_convert(i, start, end, na_filter, na_hashset) - elif is_categorical_dtype(dtype): - # TODO: I suspect that _categorical_convert could be - # optimized when dtype is an instance of CategoricalDtype - codes, cats, na_count = _categorical_convert( - self.parser, i, start, end, na_filter, - na_hashset, self.c_encoding) - cat = Categorical._from_inferred_categories(cats, codes, dtype) - return cat, na_count - elif is_object_dtype(dtype): return self._string_convert(i, start, end, na_filter, na_hashset) diff --git a/pandas/_libs/sparse.pyx b/pandas/_libs/sparse.pyx index 668bd0ae6bbb75..f5980998f6db47 100644 --- a/pandas/_libs/sparse.pyx +++ b/pandas/_libs/sparse.pyx @@ -8,14 +8,6 @@ from numpy cimport (ndarray, uint8_t, int64_t, int32_t, int16_t, int8_t, cnp.import_array() -from distutils.version import LooseVersion - -# numpy versioning -_np_version = np.version.short_version -_np_version_under1p10 = LooseVersion(_np_version) < LooseVersion('1.10') -_np_version_under1p11 = LooseVersion(_np_version) < LooseVersion('1.11') - - # 
----------------------------------------------------------------------------- # Preamble stuff diff --git a/pandas/_libs/sparse_op_helper.pxi.in b/pandas/_libs/sparse_op_helper.pxi.in index 1f41096a3f1942..c6621ab5977caf 100644 --- a/pandas/_libs/sparse_op_helper.pxi.in +++ b/pandas/_libs/sparse_op_helper.pxi.in @@ -42,13 +42,6 @@ cdef inline sparse_t __mod__(sparse_t a, sparse_t b): cdef inline sparse_t __floordiv__(sparse_t a, sparse_t b): if b == 0: if sparse_t is float64_t: - # numpy >= 1.11 returns NaN - # for a // 0, rather than +-inf - if _np_version_under1p11: - if a > 0: - return INF - elif a < 0: - return -INF return NaN else: return 0 diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index c5bc969ede3c90..efabc5ad0b1ba5 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -40,8 +40,8 @@ from tslibs.conversion cimport (tz_convert_single, _TSObject, tz_convert_utc_to_tzlocal) # many modules still look for NaT and iNaT here despite them not being needed -from tslibs.nattype import nat_strings, NaT, iNaT # noqa:F821 -from tslibs.nattype cimport checknull_with_nat, NPY_NAT +from tslibs.nattype import nat_strings, iNaT # noqa:F821 +from tslibs.nattype cimport checknull_with_nat, NPY_NAT, c_NaT as NaT from tslibs.offsets cimport to_offset @@ -76,7 +76,8 @@ cdef inline object create_time_from_ts( @cython.wraparound(False) @cython.boundscheck(False) -def ints_to_pydatetime(int64_t[:] arr, tz=None, freq=None, box="datetime"): +def ints_to_pydatetime(int64_t[:] arr, object tz=None, object freq=None, + str box="datetime"): """ Convert an i8 repr to an ndarray of datetimes, date, time or Timestamp @@ -104,8 +105,9 @@ def ints_to_pydatetime(int64_t[:] arr, tz=None, freq=None, box="datetime"): int64_t[:] deltas Py_ssize_t pos npy_datetimestruct dts - object dt - int64_t value, delta + object dt, new_tz + str typ + int64_t value, delta, local_value ndarray[object] result = np.empty(n, dtype=object) object (*func_create)(int64_t, npy_datetimestruct, object, object) @@ -303,7 +305,8 @@ def format_array_from_datetime(ndarray[int64_t] values, object tz=None, return result -def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): +def array_with_unit_to_datetime(ndarray values, object unit, + str errors='coerce'): """ convert the ndarray according to the unit if errors: @@ -458,10 +461,9 @@ def array_with_unit_to_datetime(ndarray values, unit, errors='coerce'): @cython.wraparound(False) @cython.boundscheck(False) -cpdef array_to_datetime(ndarray[object] values, errors='raise', - dayfirst=False, yearfirst=False, - format=None, utc=None, - require_iso8601=False): +cpdef array_to_datetime(ndarray[object] values, str errors='raise', + bint dayfirst=False, bint yearfirst=False, + object utc=None, bint require_iso8601=False): """ Converts a 1D array of date-like values to a numpy array of either: 1) datetime64[ns] data @@ -485,8 +487,6 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', dayfirst parsing behavior when encountering datetime strings yearfirst : bool, default False yearfirst parsing behavior when encountering datetime strings - format : str, default None - format of the string to parse utc : bool, default None indicator whether the dates should be UTC require_iso8601 : bool, default False @@ -510,261 +510,259 @@ cpdef array_to_datetime(ndarray[object] values, errors='raise', bint is_raise = errors=='raise' bint is_ignore = errors=='ignore' bint is_coerce = errors=='coerce' + bint is_same_offsets _TSObject _ts + int64_t value int 
out_local=0, out_tzoffset=0 - float offset_seconds + float offset_seconds, tz_offset set out_tzoffset_vals = set() # specify error conditions assert is_raise or is_ignore or is_coerce + result = np.empty(n, dtype='M8[ns]') + iresult = result.view('i8') + try: - result = np.empty(n, dtype='M8[ns]') - iresult = result.view('i8') for i in range(n): val = values[i] - if checknull_with_nat(val): - iresult[i] = NPY_NAT + try: + if checknull_with_nat(val): + iresult[i] = NPY_NAT - elif PyDateTime_Check(val): - seen_datetime = 1 - if val.tzinfo is not None: - if utc_convert: - try: + elif PyDateTime_Check(val): + seen_datetime = 1 + if val.tzinfo is not None: + if utc_convert: _ts = convert_datetime_to_tsobject(val, None) iresult[i] = _ts.value - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + else: + raise ValueError('Tz-aware datetime.datetime ' + 'cannot be converted to ' + 'datetime64 unless utc=True') else: - raise ValueError('Tz-aware datetime.datetime cannot ' - 'be converted to datetime64 unless ' - 'utc=True') - else: - iresult[i] = pydatetime_to_dt64(val, &dts) - if not PyDateTime_CheckExact(val): - # i.e. a Timestamp object - iresult[i] += val.nanosecond - try: + iresult[i] = pydatetime_to_dt64(val, &dts) + if not PyDateTime_CheckExact(val): + # i.e. a Timestamp object + iresult[i] += val.nanosecond check_dts_bounds(&dts) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - elif PyDate_Check(val): - seen_datetime = 1 - iresult[i] = pydate_to_dt64(val, &dts) - try: + elif PyDate_Check(val): + seen_datetime = 1 + iresult[i] = pydate_to_dt64(val, &dts) check_dts_bounds(&dts) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise - elif is_datetime64_object(val): - seen_datetime = 1 - if get_datetime64_value(val) == NPY_NAT: - iresult[i] = NPY_NAT - else: - try: - iresult[i] = get_datetime64_nanos(val) - except OutOfBoundsDatetime: - if is_coerce: - iresult[i] = NPY_NAT - continue - raise + elif is_datetime64_object(val): + seen_datetime = 1 + iresult[i] = get_datetime64_nanos(val) - elif is_integer_object(val) or is_float_object(val): - # these must be ns unit by-definition - seen_integer = 1 + elif is_integer_object(val) or is_float_object(val): + # these must be ns unit by-definition + seen_integer = 1 - if val != val or val == NPY_NAT: - iresult[i] = NPY_NAT - elif is_raise or is_ignore: - iresult[i] = val - else: - # coerce - # we now need to parse this as if unit='ns' - # we can ONLY accept integers at this point - # if we have previously (or in future accept - # datetimes/strings, then we must coerce) - try: - iresult[i] = cast_from_unit(val, 'ns') - except: + if val != val or val == NPY_NAT: iresult[i] = NPY_NAT + elif is_raise or is_ignore: + iresult[i] = val + else: + # coerce + # we now need to parse this as if unit='ns' + # we can ONLY accept integers at this point + # if we have previously (or in future accept + # datetimes/strings, then we must coerce) + try: + iresult[i] = cast_from_unit(val, 'ns') + except: + iresult[i] = NPY_NAT - elif is_string_object(val): - # string - seen_string = 1 - - if len(val) == 0 or val in nat_strings: - iresult[i] = NPY_NAT - continue - if isinstance(val, unicode) and PY2: - val = val.encode('utf-8') + elif is_string_object(val): + # string + seen_string = 1 - try: - _string_to_dts(val, &dts, &out_local, &out_tzoffset) - except ValueError: - # A ValueError at this point is a _parsing_ error - # specifically _not_ OutOfBoundsDatetime - if 
_parse_today_now(val, &iresult[i]): + if len(val) == 0 or val in nat_strings: + iresult[i] = NPY_NAT continue - elif require_iso8601: - # if requiring iso8601 strings, skip trying - # other formats - if is_coerce: - iresult[i] = NPY_NAT - continue - elif is_raise: - raise ValueError("time data {val} doesn't match " - "format specified" - .format(val=val)) - return values, tz_out + if isinstance(val, unicode) and PY2: + val = val.encode('utf-8') try: - py_dt = parse_datetime_string(val, dayfirst=dayfirst, - yearfirst=yearfirst) - except Exception: - if is_coerce: - iresult[i] = NPY_NAT + _string_to_dts(val, &dts, &out_local, &out_tzoffset) + except ValueError: + # A ValueError at this point is a _parsing_ error + # specifically _not_ OutOfBoundsDatetime + if _parse_today_now(val, &iresult[i]): continue - raise TypeError("invalid string coercion to datetime") - - # If the dateutil parser returned tzinfo, capture it - # to check if all arguments have the same tzinfo - tz = py_dt.utcoffset() - if tz is not None: - seen_datetime_offset = 1 - # dateutil timezone objects cannot be hashed, so store - # the UTC offsets in seconds instead - out_tzoffset_vals.add(tz.total_seconds()) - else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add('naive') - try: + elif require_iso8601: + # if requiring iso8601 strings, skip trying + # other formats + if is_coerce: + iresult[i] = NPY_NAT + continue + elif is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + return values, tz_out + + try: + py_dt = parse_datetime_string(val, + dayfirst=dayfirst, + yearfirst=yearfirst) + except Exception: + if is_coerce: + iresult[i] = NPY_NAT + continue + raise TypeError("invalid string coercion to " + "datetime") + + # If the dateutil parser returned tzinfo, capture it + # to check if all arguments have the same tzinfo + tz = py_dt.utcoffset() + if tz is not None: + seen_datetime_offset = 1 + # dateutil timezone objects cannot be hashed, so + # store the UTC offsets in seconds instead + out_tzoffset_vals.add(tz.total_seconds()) + else: + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + _ts = convert_datetime_to_tsobject(py_dt, None) iresult[i] = _ts.value - except OutOfBoundsDatetime: + except: + # TODO: What exception are we concerned with here? if is_coerce: iresult[i] = NPY_NAT continue raise - except: - # TODO: What exception are we concerned with here? + else: + # No error raised by string_to_dts, pick back up + # where we left off + value = dtstruct_to_dt64(&dts) + if out_local == 1: + seen_datetime_offset = 1 + # Store the out_tzoffset in seconds + # since we store the total_seconds of + # dateutil.tz.tzoffset objects + out_tzoffset_vals.add(out_tzoffset * 60.) + tz = pytz.FixedOffset(out_tzoffset) + value = tz_convert_single(value, tz, UTC) + else: + # Add a marker for naive string, to track if we are + # parsing mixed naive and aware strings + out_tzoffset_vals.add('naive') + iresult[i] = value + check_dts_bounds(&dts) + + else: if is_coerce: iresult[i] = NPY_NAT - continue - raise - else: - # No error raised by string_to_dts, pick back up - # where we left off - value = dtstruct_to_dt64(&dts) - if out_local == 1: - seen_datetime_offset = 1 - # Store the out_tzoffset in seconds - # since we store the total_seconds of - # dateutil.tz.tzoffset objects - out_tzoffset_vals.add(out_tzoffset * 60.) 
- tz = pytz.FixedOffset(out_tzoffset) - value = tz_convert_single(value, tz, UTC) else: - # Add a marker for naive string, to track if we are - # parsing mixed naive and aware strings - out_tzoffset_vals.add('naive') - iresult[i] = value - try: - check_dts_bounds(&dts) - except OutOfBoundsDatetime: - # GH#19382 for just-barely-OutOfBounds falling back to - # dateutil parser will return incorrect result because - # it will ignore nanoseconds - if is_coerce: - iresult[i] = NPY_NAT - continue - elif require_iso8601: - if is_raise: - raise ValueError("time data {val} doesn't " - "match format specified" - .format(val=val)) - return values, tz_out - raise + raise TypeError("{typ} is not convertible to datetime" + .format(typ=type(val))) - else: + except OutOfBoundsDatetime: if is_coerce: iresult[i] = NPY_NAT - else: - raise TypeError("{typ} is not convertible to datetime" - .format(typ=type(val))) - - if seen_datetime and seen_integer: - # we have mixed datetimes & integers - - if is_coerce: - # coerce all of the integers/floats to NaT, preserve - # the datetimes and other convertibles - for i in range(n): - val = values[i] - if is_integer_object(val) or is_float_object(val): - result[i] = NPY_NAT - elif is_raise: - raise ValueError( - "mixed datetimes and integers in passed array") - else: - raise TypeError - - if seen_datetime_offset and not utc_convert: - # GH 17697 - # 1) If all the offsets are equal, return one offset for - # the parsed dates to (maybe) pass to DatetimeIndex - # 2) If the offsets are different, then force the parsing down the - # object path where an array of datetimes - # (with individual dateutil.tzoffsets) are returned - is_same_offsets = len(out_tzoffset_vals) == 1 - if not is_same_offsets: - return array_to_datetime_object(values, is_raise, - dayfirst, yearfirst) - else: - tz_offset = out_tzoffset_vals.pop() - tz_out = pytz.FixedOffset(tz_offset / 60.) 
- return result, tz_out + continue + elif require_iso8601 and is_string_object(val): + # GH#19382 for just-barely-OutOfBounds falling back to + # dateutil parser will return incorrect result because + # it will ignore nanoseconds + if is_raise: + raise ValueError("time data {val} doesn't " + "match format specified" + .format(val=val)) + assert is_ignore + return values, tz_out + raise + except OutOfBoundsDatetime: if is_raise: raise - oresult = np.empty(n, dtype=object) - for i in range(n): - val = values[i] + return ignore_errors_out_of_bounds_fallback(values), tz_out - # set as nan except if its a NaT - if checknull_with_nat(val): - if isinstance(val, float): - oresult[i] = np.nan - else: - oresult[i] = NaT - elif is_datetime64_object(val): - if get_datetime64_value(val) == NPY_NAT: - oresult[i] = NaT - else: - oresult[i] = val.item() - else: - oresult[i] = val - return oresult, tz_out except TypeError: return array_to_datetime_object(values, is_raise, dayfirst, yearfirst) + if seen_datetime and seen_integer: + # we have mixed datetimes & integers + + if is_coerce: + # coerce all of the integers/floats to NaT, preserve + # the datetimes and other convertibles + for i in range(n): + val = values[i] + if is_integer_object(val) or is_float_object(val): + result[i] = NPY_NAT + elif is_raise: + raise ValueError("mixed datetimes and integers in passed array") + else: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + + if seen_datetime_offset and not utc_convert: + # GH#17697 + # 1) If all the offsets are equal, return one offset for + # the parsed dates to (maybe) pass to DatetimeIndex + # 2) If the offsets are different, then force the parsing down the + # object path where an array of datetimes + # (with individual dateutil.tzoffsets) are returned + is_same_offsets = len(out_tzoffset_vals) == 1 + if not is_same_offsets: + return array_to_datetime_object(values, is_raise, + dayfirst, yearfirst) + else: + tz_offset = out_tzoffset_vals.pop() + tz_out = pytz.FixedOffset(tz_offset / 60.) 
+ return result, tz_out + + +cdef inline ignore_errors_out_of_bounds_fallback(ndarray[object] values): + """ + Fallback for array_to_datetime if an OutOfBoundsDatetime is raised + and errors == "ignore" + + Parameters + ---------- + values : ndarray[object] + + Returns + ------- + ndarray[object] + """ + cdef: + Py_ssize_t i, n = len(values) + object val + + oresult = np.empty(n, dtype=object) + + for i in range(n): + val = values[i] + + # set as nan except if its a NaT + if checknull_with_nat(val): + if isinstance(val, float): + oresult[i] = np.nan + else: + oresult[i] = NaT + elif is_datetime64_object(val): + if get_datetime64_value(val) == NPY_NAT: + oresult[i] = NaT + else: + oresult[i] = val.item() + else: + oresult[i] = val + return oresult + @cython.wraparound(False) @cython.boundscheck(False) cdef array_to_datetime_object(ndarray[object] values, bint is_raise, - dayfirst=False, yearfirst=False): + bint dayfirst=False, bint yearfirst=False): """ Fall back function for array_to_datetime diff --git a/pandas/_libs/tslibs/conversion.pyx b/pandas/_libs/tslibs/conversion.pyx index 9d6daf3d425230..e6e7884f05b200 100644 --- a/pandas/_libs/tslibs/conversion.pyx +++ b/pandas/_libs/tslibs/conversion.pyx @@ -4,7 +4,7 @@ from cython import Py_ssize_t import numpy as np cimport numpy as cnp -from numpy cimport int64_t, int32_t, ndarray +from numpy cimport uint8_t, int64_t, int32_t, ndarray cnp.import_array() import pytz @@ -39,8 +39,8 @@ from timezones cimport (is_utc, is_tzlocal, is_fixed_offset, from timezones import UTC from parsing import parse_datetime_string -from nattype import nat_strings, NaT -from nattype cimport NPY_NAT, checknull_with_nat +from nattype import nat_strings +from nattype cimport NPY_NAT, checknull_with_nat, c_NaT as NaT # ---------------------------------------------------------------------- # Constants @@ -62,8 +62,11 @@ cdef inline int64_t get_datetime64_nanos(object val) except? 
-1: NPY_DATETIMEUNIT unit npy_datetime ival - unit = get_datetime64_unit(val) ival = get_datetime64_value(val) + if ival == NPY_NAT: + return NPY_NAT + + unit = get_datetime64_unit(val) if unit != NPY_FR_ns: pandas_datetime_to_datetimestruct(ival, unit, &dts) @@ -99,10 +102,13 @@ def ensure_datetime64ns(arr: ndarray, copy: bool=True): ivalues = arr.view(np.int64).ravel() - result = np.empty(shape, dtype='M8[ns]') + result = np.empty(shape, dtype=NS_DTYPE) iresult = result.ravel().view(np.int64) if len(iresult) == 0: + result = arr.view(NS_DTYPE) + if copy: + result = result.copy() return result unit = get_datetime64_unit(arr.flat[0]) @@ -280,10 +286,8 @@ cdef convert_to_tsobject(object ts, object tz, object unit, if ts is None or ts is NaT: obj.value = NPY_NAT elif is_datetime64_object(ts): - if ts.view('i8') == NPY_NAT: - obj.value = NPY_NAT - else: - obj.value = get_datetime64_nanos(ts) + obj.value = get_datetime64_nanos(ts) + if obj.value != NPY_NAT: dt64_to_dtstruct(obj.value, &obj.dts) elif is_integer_object(ts): if ts == NPY_NAT: @@ -532,6 +536,7 @@ cdef inline void localize_tso(_TSObject obj, tzinfo tz): int64_t[:] deltas int64_t local_val Py_ssize_t pos + str typ assert obj.tzinfo is None @@ -643,7 +648,7 @@ cdef inline int64_t[:] _tz_convert_dst(int64_t[:] values, tzinfo tz, if not is_tzlocal(tz): # get_dst_info cannot extract offsets from tzlocal because its # dependent on a datetime - trans, deltas, typ = get_dst_info(tz) + trans, deltas, _ = get_dst_info(tz) if not to_utc: # We add `offset` below instead of subtracting it deltas = -1 * np.array(deltas, dtype='i8') @@ -687,7 +692,7 @@ cdef inline int64_t _tz_convert_tzlocal_utc(int64_t val, tzinfo tz, """ cdef: npy_datetimestruct dts - int64_t result, delta + int64_t delta datetime dt dt64_to_dtstruct(val, &dts) @@ -876,18 +881,20 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, localized : ndarray[int64_t] """ cdef: - ndarray[int64_t] trans int64_t[:] deltas, idx_shifted, idx_shifted_left, idx_shifted_right - ndarray ambiguous_array + ndarray[uint8_t, cast=True] ambiguous_array, both_nat, both_eq Py_ssize_t i, idx, pos, ntrans, n = len(vals) Py_ssize_t delta_idx_offset, delta_idx, pos_left, pos_right int64_t *tdata int64_t v, left, right, val, v_left, v_right, new_local, remaining_mins int64_t HOURS_NS = HOUR_SECONDS * 1000000000 - ndarray[int64_t] result, result_a, result_b, dst_hours + ndarray[int64_t] trans, result, result_a, result_b, dst_hours, delta + ndarray trans_idx, grp, a_idx, b_idx, one_diff npy_datetimestruct dts bint infer_dst = False, is_dst = False, fill = False bint shift = False, fill_nonexist = False + list trans_grp + str stamp # Vectorized version of DstTzInfo.localize if is_utc(tz) or tz is None: @@ -920,7 +927,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, if len(ambiguous) != len(vals): raise ValueError("Length of ambiguous bool-array must be " "the same size as vals") - ambiguous_array = np.asarray(ambiguous) + ambiguous_array = np.asarray(ambiguous, dtype=bool) if nonexistent == 'NaT': fill_nonexist = True @@ -930,7 +937,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, assert nonexistent in ('raise', None), ("nonexistent must be one of" " {'NaT', 'raise', 'shift'}") - trans, deltas, typ = get_dst_info(tz) + trans, deltas, _ = get_dst_info(tz) tdata = cnp.PyArray_DATA(trans) ntrans = len(trans) @@ -981,7 +988,7 @@ def tz_localize_to_utc(ndarray[int64_t] vals, object tz, object ambiguous=None, # 
years which is useful for checking that there is not an ambiguous # transition in an individual year. if trans_idx.size > 0: - one_diff = np.where(np.diff(trans_idx) != 1)[0] +1 + one_diff = np.where(np.diff(trans_idx) != 1)[0] + 1 trans_grp = np.array_split(trans_idx, one_diff) # Iterate through each day, if there are no hours where the @@ -1169,13 +1176,14 @@ cdef int64_t[:] _normalize_local(int64_t[:] stamps, tzinfo tz): result : int64 ndarray of converted of normalized nanosecond timestamps """ cdef: - Py_ssize_t n = len(stamps) + Py_ssize_t i, n = len(stamps) int64_t[:] result = np.empty(n, dtype=np.int64) ndarray[int64_t] trans int64_t[:] deltas + str typ Py_ssize_t[:] pos npy_datetimestruct dts - int64_t delta + int64_t delta, local_val if is_utc(tz): with nogil: @@ -1261,6 +1269,7 @@ def is_date_array_normalized(int64_t[:] stamps, object tz=None): int64_t[:] deltas npy_datetimestruct dts int64_t local_val, delta + str typ if tz is None or is_utc(tz): for i in range(n): diff --git a/pandas/_libs/tslibs/nattype.pxd b/pandas/_libs/tslibs/nattype.pxd index 382ac9d323918a..f649518e969beb 100644 --- a/pandas/_libs/tslibs/nattype.pxd +++ b/pandas/_libs/tslibs/nattype.pxd @@ -1,9 +1,20 @@ # -*- coding: utf-8 -*- +from cpython.datetime cimport datetime + from numpy cimport int64_t cdef int64_t NPY_NAT cdef bint _nat_scalar_rules[6] + +cdef class _NaT(datetime): + cdef readonly: + int64_t value + object freq + +cdef _NaT c_NaT + + cdef bint checknull_with_nat(object val) cdef bint is_null_datetimelike(object val) diff --git a/pandas/_libs/tslibs/nattype.pyx b/pandas/_libs/tslibs/nattype.pyx index 7b7f5f2e34c5f6..42ec235992089c 100644 --- a/pandas/_libs/tslibs/nattype.pyx +++ b/pandas/_libs/tslibs/nattype.pyx @@ -47,7 +47,7 @@ def _make_nan_func(func_name, doc): def _make_nat_func(func_name, doc): def f(*args, **kwargs): - return NaT + return c_NaT f.__name__ = func_name f.__doc__ = doc return f @@ -67,10 +67,10 @@ def _make_error_func(func_name, cls): cdef _nat_divide_op(self, other): - if PyDelta_Check(other) or is_timedelta64_object(other) or other is NaT: + if PyDelta_Check(other) or is_timedelta64_object(other) or other is c_NaT: return np.nan if is_integer_object(other) or is_float_object(other): - return NaT + return c_NaT return NotImplemented @@ -82,15 +82,15 @@ cdef _nat_rdivide_op(self, other): def __nat_unpickle(*args): # return constant defined in the module - return NaT + return c_NaT # ---------------------------------------------------------------------- cdef class _NaT(datetime): - cdef readonly: - int64_t value - object freq + # cdef readonly: + # int64_t value + # object freq def __hash__(_NaT self): # py3k needs this defined here @@ -116,18 +116,18 @@ cdef class _NaT(datetime): def __add__(self, other): if PyDateTime_Check(other): - return NaT + return c_NaT elif hasattr(other, 'delta'): # Timedelta, offsets.Tick, offsets.Week - return NaT + return c_NaT elif getattr(other, '_typ', None) in ['dateoffset', 'series', 'period', 'datetimeindex', 'timedeltaindex']: # Duplicate logic in _Timestamp.__add__ to avoid needing # to subclass; allows us to @final(_Timestamp.__add__) return NotImplemented - return NaT + return c_NaT def __sub__(self, other): # Duplicate some logic from _Timestamp.__sub__ to avoid needing @@ -184,19 +184,6 @@ cdef class _NaT(datetime): """ Returns a numpy.datetime64 object with 'ns' precision """ return np.datetime64('NaT', 'ns') - -class NaTType(_NaT): - """(N)ot-(A)-(T)ime, the time equivalent of NaN""" - - def __new__(cls): - cdef _NaT base - - base = 
_NaT.__new__(cls, 1, 1, 1) - base.value = NPY_NAT - base.freq = None - - return base - def __repr__(self): return 'NaT' @@ -216,20 +203,11 @@ class NaTType(_NaT): def __long__(self): return NPY_NAT - def __reduce_ex__(self, protocol): - # python 3.6 compat - # http://bugs.python.org/issue28730 - # now __reduce_ex__ is defined and higher priority than __reduce__ - return self.__reduce__() - - def __reduce__(self): - return (__nat_unpickle, (None, )) - def total_seconds(self): """ Total duration of timedelta in seconds (to ns precision) """ - # GH 10939 + # GH#10939 return np.nan @property @@ -260,6 +238,28 @@ class NaTType(_NaT): def is_year_end(self): return False + +class NaTType(_NaT): + """(N)ot-(A)-(T)ime, the time equivalent of NaN""" + + def __new__(cls): + cdef _NaT base + + base = _NaT.__new__(cls, 1, 1, 1) + base.value = NPY_NAT + base.freq = None + + return base + + def __reduce_ex__(self, protocol): + # python 3.6 compat + # http://bugs.python.org/issue28730 + # now __reduce_ex__ is defined and higher priority than __reduce__ + return self.__reduce__() + + def __reduce__(self): + return (__nat_unpickle, (None, )) + def __rdiv__(self, other): return _nat_rdivide_op(self, other) @@ -271,7 +271,7 @@ class NaTType(_NaT): def __rmul__(self, other): if is_integer_object(other) or is_float_object(other): - return NaT + return c_NaT return NotImplemented # ---------------------------------------------------------------------- @@ -659,14 +659,15 @@ class NaTType(_NaT): """) -NaT = NaTType() +c_NaT = NaTType() # C-visible +NaT = c_NaT # Python-visible # ---------------------------------------------------------------------- cdef inline bint checknull_with_nat(object val): """ utility to check if a value is a nat or not """ - return val is None or util.is_nan(val) or val is NaT + return val is None or util.is_nan(val) or val is c_NaT cdef inline bint is_null_datetimelike(object val): @@ -683,7 +684,7 @@ cdef inline bint is_null_datetimelike(object val): """ if val is None or util.is_nan(val): return True - elif val is NaT: + elif val is c_NaT: return True elif util.is_timedelta64_object(val): return val.view('int64') == NPY_NAT diff --git a/pandas/_libs/tslibs/period.pyx b/pandas/_libs/tslibs/period.pyx index e02e493c32a008..dfbf24cf177f60 100644 --- a/pandas/_libs/tslibs/period.pyx +++ b/pandas/_libs/tslibs/period.pyx @@ -46,8 +46,9 @@ from frequencies cimport (get_freq_code, get_base_alias, get_rule_month) from parsing import parse_time_string from resolution import Resolution -from nattype import nat_strings, NaT -from nattype cimport _nat_scalar_rules, NPY_NAT, is_null_datetimelike +from nattype import nat_strings +from nattype cimport ( + _nat_scalar_rules, NPY_NAT, is_null_datetimelike, c_NaT as NaT) from offsets cimport to_offset from offsets import _Tick diff --git a/pandas/_libs/tslibs/timedeltas.pyx b/pandas/_libs/tslibs/timedeltas.pyx index 4d612a6f431076..b0bead2f66ce4e 100644 --- a/pandas/_libs/tslibs/timedeltas.pyx +++ b/pandas/_libs/tslibs/timedeltas.pyx @@ -33,8 +33,8 @@ from ccalendar import DAY_SECONDS from np_datetime cimport (cmp_scalar, reverse_ops, td64_to_tdstruct, pandas_timedeltastruct) -from nattype import nat_strings, NaT -from nattype cimport checknull_with_nat, NPY_NAT +from nattype import nat_strings +from nattype cimport checknull_with_nat, NPY_NAT, c_NaT as NaT from offsets cimport to_offset # ---------------------------------------------------------------------- diff --git a/pandas/_libs/tslibs/timestamps.pyx b/pandas/_libs/tslibs/timestamps.pyx index 
bb7a9a57b8a75f..b4862a5f3b02f1 100644 --- a/pandas/_libs/tslibs/timestamps.pyx +++ b/pandas/_libs/tslibs/timestamps.pyx @@ -26,8 +26,7 @@ from conversion import tz_localize_to_utc, normalize_i8_timestamps from conversion cimport (tz_convert_single, _TSObject, convert_to_tsobject, convert_datetime_to_tsobject) from fields import get_start_end_field, get_date_name_field -from nattype import NaT -from nattype cimport NPY_NAT +from nattype cimport NPY_NAT, c_NaT as NaT from np_datetime import OutOfBoundsDatetime from np_datetime cimport (reverse_ops, cmp_scalar, check_dts_bounds, npy_datetimestruct, dt64_to_dtstruct) @@ -377,13 +376,15 @@ cdef class _Timestamp(datetime): neg_other = -other return self + neg_other + typ = getattr(other, '_typ', None) + # a Timestamp-DatetimeIndex -> yields a negative TimedeltaIndex - elif getattr(other, '_typ', None) == 'datetimeindex': + if typ in ('datetimeindex', 'datetimearray'): # timezone comparison is performed in DatetimeIndex._sub_datelike return -other.__sub__(self) # a Timestamp-TimedeltaIndex -> yields a negative TimedeltaIndex - elif getattr(other, '_typ', None) == 'timedeltaindex': + elif typ in ('timedeltaindex', 'timedeltaarray'): return (-other).__add__(self) elif other is NaT: diff --git a/pandas/_libs/tslibs/timezones.pyx b/pandas/_libs/tslibs/timezones.pyx index 5fa8a45af30830..9f8922b274abdd 100644 --- a/pandas/_libs/tslibs/timezones.pyx +++ b/pandas/_libs/tslibs/timezones.pyx @@ -2,6 +2,8 @@ from cython import Py_ssize_t +from cpython.datetime cimport tzinfo + # dateutil compat from dateutil.tz import ( tzutc as _dateutil_tzutc, diff --git a/pandas/_version.py b/pandas/_version.py index 036c927df45d34..d000539421b919 100644 --- a/pandas/_version.py +++ b/pandas/_version.py @@ -238,14 +238,14 @@ def git_pieces_from_vcs(tag_prefix, root, verbose, run_command=run_command): # tag full_tag = mo.group(1) if not full_tag.startswith(tag_prefix): + fmt = ("tag '{full_tag}' doesn't start with prefix " + "'{tag_prefix}'") + msg = fmt.format(full_tag=full_tag, tag_prefix=tag_prefix) if verbose: - fmt = "tag '{full_tag}' doesn't start with prefix " \ - "'{tag_prefix}'" - print(fmt.format(full_tag=full_tag, tag_prefix=tag_prefix)) - pieces["error"] = ("tag '{full_tag}' doesn't start with " - "prefix '{tag_prefix}'".format( - full_tag, tag_prefix)) + print(msg) + pieces["error"] = msg return pieces + pieces["closest-tag"] = full_tag[len(tag_prefix):] # distance: number of commits since tag diff --git a/pandas/conftest.py b/pandas/conftest.py index f450193d9388e3..20f97bdec11075 100644 --- a/pandas/conftest.py +++ b/pandas/conftest.py @@ -1,3 +1,5 @@ +from datetime import date, time, timedelta +from decimal import Decimal import importlib import os @@ -8,7 +10,7 @@ import pytest from pytz import FixedOffset, utc -from pandas.compat import PY3 +from pandas.compat import PY3, u import pandas.util._test_decorators as td import pandas as pd @@ -384,8 +386,17 @@ def tz_aware_fixture(request): COMPLEX_DTYPES = [complex, "complex64", "complex128"] STRING_DTYPES = [str, 'str', 'U'] +DATETIME_DTYPES = ['datetime64[ns]', 'M8[ns]'] +TIMEDELTA_DTYPES = ['timedelta64[ns]', 'm8[ns]'] + +BOOL_DTYPES = [bool, 'bool'] +BYTES_DTYPES = [bytes, 'bytes'] +OBJECT_DTYPES = [object, 'object'] + ALL_REAL_DTYPES = FLOAT_DTYPES + ALL_INT_DTYPES -ALL_NUMPY_DTYPES = ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES +ALL_NUMPY_DTYPES = (ALL_REAL_DTYPES + COMPLEX_DTYPES + STRING_DTYPES + + DATETIME_DTYPES + TIMEDELTA_DTYPES + BOOL_DTYPES + + OBJECT_DTYPES + BYTES_DTYPES * PY3) # bytes 
only for PY3 @pytest.fixture(params=STRING_DTYPES) @@ -404,8 +415,9 @@ def float_dtype(request): """ Parameterized fixture for float dtypes. - * float32 - * float64 + * float + * 'float32' + * 'float64' """ return request.param @@ -416,8 +428,9 @@ def complex_dtype(request): """ Parameterized fixture for complex dtypes. - * complex64 - * complex128 + * complex + * 'complex64' + * 'complex128' """ return request.param @@ -428,10 +441,11 @@ def sint_dtype(request): """ Parameterized fixture for signed integer dtypes. - * int8 - * int16 - * int32 - * int64 + * int + * 'int8' + * 'int16' + * 'int32' + * 'int64' """ return request.param @@ -442,10 +456,10 @@ def uint_dtype(request): """ Parameterized fixture for unsigned integer dtypes. - * uint8 - * uint16 - * uint32 - * uint64 + * 'uint8' + * 'uint16' + * 'uint32' + * 'uint64' """ return request.param @@ -454,16 +468,17 @@ def uint_dtype(request): @pytest.fixture(params=ALL_INT_DTYPES) def any_int_dtype(request): """ - Parameterized fixture for any integer dtypes. + Parameterized fixture for any integer dtype. - * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' """ return request.param @@ -472,18 +487,20 @@ def any_int_dtype(request): @pytest.fixture(params=ALL_REAL_DTYPES) def any_real_dtype(request): """ - Parameterized fixture for any (purely) real numeric dtypes. + Parameterized fixture for any (purely) real numeric dtype. - * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 - * float32 - * float64 + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' """ return request.param @@ -494,26 +511,117 @@ def any_numpy_dtype(request): """ Parameterized fixture for all numpy dtypes. 
- * int8 - * uint8 - * int16 - * uint16 - * int32 - * uint32 - * int64 - * uint64 - * float32 - * float64 - * complex64 - * complex128 + * bool + * 'bool' + * int + * 'int8' + * 'uint8' + * 'int16' + * 'uint16' + * 'int32' + * 'uint32' + * 'int64' + * 'uint64' + * float + * 'float32' + * 'float64' + * complex + * 'complex64' + * 'complex128' * str * 'str' * 'U' + * bytes + * 'bytes' + * 'datetime64[ns]' + * 'M8[ns]' + * 'timedelta64[ns]' + * 'm8[ns]' + * object + * 'object' """ return request.param +# categoricals are handled separately +_any_skipna_inferred_dtype = [ + ('string', ['a', np.nan, 'c']), + ('unicode' if not PY3 else 'string', [u('a'), np.nan, u('c')]), + ('bytes' if PY3 else 'string', [b'a', np.nan, b'c']), + ('empty', [np.nan, np.nan, np.nan]), + ('empty', []), + ('mixed-integer', ['a', np.nan, 2]), + ('mixed', ['a', np.nan, 2.0]), + ('floating', [1.0, np.nan, 2.0]), + ('integer', [1, np.nan, 2]), + ('mixed-integer-float', [1, np.nan, 2.0]), + ('decimal', [Decimal(1), np.nan, Decimal(2)]), + ('boolean', [True, np.nan, False]), + ('datetime64', [np.datetime64('2013-01-01'), np.nan, + np.datetime64('2018-01-01')]), + ('datetime', [pd.Timestamp('20130101'), np.nan, pd.Timestamp('20180101')]), + ('date', [date(2013, 1, 1), np.nan, date(2018, 1, 1)]), + # The following two dtypes are commented out due to GH 23554 + # ('complex', [1 + 1j, np.nan, 2 + 2j]), + # ('timedelta64', [np.timedelta64(1, 'D'), + # np.nan, np.timedelta64(2, 'D')]), + ('timedelta', [timedelta(1), np.nan, timedelta(2)]), + ('time', [time(1), np.nan, time(2)]), + ('period', [pd.Period(2013), pd.NaT, pd.Period(2018)]), + ('interval', [pd.Interval(0, 1), np.nan, pd.Interval(0, 2)])] +ids, _ = zip(*_any_skipna_inferred_dtype) # use inferred type as fixture-id + + +@pytest.fixture(params=_any_skipna_inferred_dtype, ids=ids) +def any_skipna_inferred_dtype(request): + """ + Fixture for all inferred dtypes from _libs.lib.infer_dtype + + The covered (inferred) types are: + * 'string' + * 'unicode' (if PY2) + * 'empty' + * 'bytes' (if PY3) + * 'mixed' + * 'mixed-integer' + * 'mixed-integer-float' + * 'floating' + * 'integer' + * 'decimal' + * 'boolean' + * 'datetime64' + * 'datetime' + * 'date' + * 'timedelta' + * 'time' + * 'period' + * 'interval' + + Returns + ------- + inferred_dtype : str + The string for the inferred dtype from _libs.lib.infer_dtype + values : np.ndarray + An array of object dtype that will be inferred to have + `inferred_dtype` + + Examples + -------- + >>> import pandas._libs.lib as lib + >>> + >>> def test_something(any_skipna_inferred_dtype): + ... inferred_dtype, values = any_skipna_inferred_dtype + ... # will pass + ... 
assert lib.infer_dtype(values, skipna=True) == inferred_dtype + """ + inferred_dtype, values = request.param + values = np.array(values, dtype=object) # object dtype to avoid casting + + # correctness of inference tested in tests/dtypes/test_inference.py + return inferred_dtype, values + + @pytest.fixture def mock(): """ diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 5f7995ac649a20..1a4368ee8ea98a 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -19,7 +19,7 @@ ensure_float64, ensure_int64, ensure_object, ensure_platform_int, ensure_uint64, is_array_like, is_bool_dtype, is_categorical_dtype, is_complex_dtype, is_datetime64_any_dtype, is_datetime64tz_dtype, - is_datetimelike, is_datetimetz, is_extension_array_dtype, is_float_dtype, + is_datetimelike, is_extension_array_dtype, is_float_dtype, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_object_dtype, is_period_dtype, is_scalar, is_signed_integer_dtype, is_sparse, is_timedelta64_dtype, is_unsigned_integer_dtype, @@ -460,7 +460,7 @@ def _factorize_array(values, na_sentinel=-1, size_hint=None, (hash_klass, _), values = _get_data_algo(values, _hashtables) table = hash_klass(size_hint or len(values)) - labels, uniques = table.factorize(values, na_sentinel=na_sentinel, + uniques, labels = table.factorize(values, na_sentinel=na_sentinel, na_value=na_value) labels = ensure_platform_int(labels) @@ -1581,7 +1581,7 @@ def take_nd(arr, indexer, axis=0, out=None, fill_value=np.nan, mask_info=None, # dispatch to internal type takes if is_extension_array_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) elif is_interval_dtype(arr): return arr.take(indexer, fill_value=fill_value, allow_fill=allow_fill) diff --git a/pandas/core/apply.py b/pandas/core/apply.py index c44e64d29ed269..5658094ec36c60 100644 --- a/pandas/core/apply.py +++ b/pandas/core/apply.py @@ -132,7 +132,7 @@ def get_result(self): # ufunc elif isinstance(self.f, np.ufunc): with np.errstate(all='ignore'): - results = self.f(self.values) + results = self.obj._data.apply('apply', func=self.f) return self.obj._constructor(data=results, index=self.index, columns=self.columns, copy=False) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 6dc3a960dc817b..42696e4796fe09 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -98,7 +98,7 @@ def f(self, other): ret[na_mask] = False return ret - # Numpy-1.9 and earlier may convert a scalar to a zerodim array during + # Numpy < 1.13 may convert a scalar to a zerodim array during # comparison operation when second arg has higher priority, e.g. 
# # cat[0] < cat @@ -347,6 +347,16 @@ def __init__(self, values, categories=None, ordered=None, dtype=None, # the "ordered" and "categories" arguments dtype = values.dtype._from_categorical_dtype(values.dtype, categories, ordered) + + # GH23814, for perf, if values._values already an instance of + # Categorical, set values to codes, and run fastpath + if (isinstance(values, (ABCSeries, ABCIndexClass)) and + isinstance(values._values, type(self))): + values = values._values.codes.copy() + if categories is None: + categories = dtype.categories + fastpath = True + else: # If dtype=None and values is not categorical, create a new dtype dtype = CategoricalDtype(categories, ordered) @@ -546,8 +556,9 @@ def base(self): @classmethod def _from_inferred_categories(cls, inferred_categories, inferred_codes, - dtype): - """Construct a Categorical from inferred values + dtype, true_values=None): + """ + Construct a Categorical from inferred values. For inferred categories (`dtype` is None) the categories are sorted. For explicit `dtype`, the `inferred_categories` are cast to the @@ -555,10 +566,12 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, Parameters ---------- - inferred_categories : Index inferred_codes : Index dtype : CategoricalDtype or 'category' + true_values : list, optional + If none are provided, the default ones are + "True", "TRUE", and "true." Returns ------- @@ -567,27 +580,32 @@ def _from_inferred_categories(cls, inferred_categories, inferred_codes, from pandas import Index, to_numeric, to_datetime, to_timedelta cats = Index(inferred_categories) - known_categories = (isinstance(dtype, CategoricalDtype) and dtype.categories is not None) if known_categories: - # Convert to a specialzed type with `dtype` if specified + # Convert to a specialized type with `dtype` if specified. if dtype.categories.is_numeric(): - cats = to_numeric(inferred_categories, errors='coerce') + cats = to_numeric(inferred_categories, errors="coerce") elif is_datetime64_dtype(dtype.categories): - cats = to_datetime(inferred_categories, errors='coerce') + cats = to_datetime(inferred_categories, errors="coerce") elif is_timedelta64_dtype(dtype.categories): - cats = to_timedelta(inferred_categories, errors='coerce') + cats = to_timedelta(inferred_categories, errors="coerce") + elif dtype.categories.is_boolean(): + if true_values is None: + true_values = ["True", "TRUE", "true"] + + cats = cats.isin(true_values) if known_categories: - # recode from observation order to dtype.categories order + # Recode from observation order to dtype.categories order. categories = dtype.categories codes = _recode_for_categories(inferred_codes, cats, categories) elif not cats.is_monotonic_increasing: - # sort categories and recode for unknown categories + # Sort categories and recode for unknown categories. unsorted = cats.copy() categories = cats.sort_values() + codes = _recode_for_categories(inferred_codes, unsorted, categories) dtype = CategoricalDtype(categories, ordered=False) @@ -2038,15 +2056,7 @@ def __setitem__(self, key, value): elif isinstance(key, slice): pass - # Array of True/False in Series or Categorical - else: - # There is a bug in numpy, which does not accept a Series as a - # indexer - # https://github.com/pandas-dev/pandas/issues/6168 - # https://github.com/numpy/numpy/issues/4240 -> fixed in numpy 1.9 - # FIXME: remove when numpy 1.9 is the lowest numpy version pandas - # accepts... 
- key = np.asarray(key) + # else: array of True/False in Series or Categorical lindexer = self.categories.get_indexer(rvalue) lindexer = self._maybe_coerce_indexer(lindexer) diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 4e784d9c89c5f1..a6f254c79fb518 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -10,11 +10,12 @@ from pandas._libs.tslibs.period import ( DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period) from pandas._libs.tslibs.timedeltas import Timedelta, delta_to_nanoseconds -from pandas._libs.tslibs.timestamps import maybe_integer_op_deprecated +from pandas._libs.tslibs.timestamps import ( + RoundTo, maybe_integer_op_deprecated, round_nsint64) import pandas.compat as compat from pandas.errors import ( AbstractMethodError, NullFrequencyError, PerformanceWarning) -from pandas.util._decorators import deprecate_kwarg +from pandas.util._decorators import Appender, deprecate_kwarg from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_any_dtype, is_datetime64_dtype, @@ -80,6 +81,189 @@ def _get_attributes_dict(self): return {k: getattr(self, k, None) for k in self._attributes} +class DatelikeOps(object): + """ + Common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex. + """ + + def strftime(self, date_format): + from pandas import Index + return Index(self.format(date_format=date_format), + dtype=compat.text_type) + strftime.__doc__ = """ + Convert to Index using specified date_format. + + Return an Index of formatted strings specified by date_format, which + supports the same string format as the python standard library. Details + of the string format can be found in `python string format doc <{0}>`__ + + Parameters + ---------- + date_format : str + Date format string (e.g. "%Y-%m-%d"). + + Returns + ------- + Index + Index of formatted strings + + See Also + -------- + to_datetime : Convert the given argument to datetime. + DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. + DatetimeIndex.round : Round the DatetimeIndex to the specified freq. + DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. + + Examples + -------- + >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), + ... periods=3, freq='s') + >>> rng.strftime('%B %d, %Y, %r') + Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', + 'March 10, 2018, 09:00:02 AM'], + dtype='object') + """.format("https://docs.python.org/3/library/datetime.html" + "#strftime-and-strptime-behavior") + + +class TimelikeOps(object): + """ + Common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex. + """ + + _round_doc = ( + """ + Perform {op} operation on the data to the specified `freq`. + + Parameters + ---------- + freq : str or Offset + The frequency level to {op} the index to. Must be a fixed + frequency like 'S' (second) not 'ME' (month end). See + :ref:`frequency aliases ` for + a list of possible `freq` values. + ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' + Only relevant for DatetimeIndex: + + - 'infer' will attempt to infer fall dst-transition hours based on + order + - bool-ndarray where True signifies a DST time, False designates + a non-DST time (note that this flag is only applicable for + ambiguous times) + - 'NaT' will return NaT where there are ambiguous times + - 'raise' will raise an AmbiguousTimeError if there are ambiguous + times + + .. 
versionadded:: 0.24.0 + nonexistent : 'shift', 'NaT', default 'raise' + A nonexistent time does not exist in a particular timezone + where clocks moved forward due to DST. + + - 'shift' will shift the nonexistent time forward to the closest + existing time + - 'NaT' will return NaT where there are nonexistent times + - 'raise' will raise an NonExistentTimeError if there are + nonexistent times + + .. versionadded:: 0.24.0 + + Returns + ------- + DatetimeIndex, TimedeltaIndex, or Series + Index of the same type for a DatetimeIndex or TimedeltaIndex, + or a Series with the same index for a Series. + + Raises + ------ + ValueError if the `freq` cannot be converted. + + Examples + -------- + **DatetimeIndex** + + >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') + >>> rng + DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', + '2018-01-01 12:01:00'], + dtype='datetime64[ns]', freq='T') + """) + + _round_example = ( + """>>> rng.round('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.round("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """) + + _floor_example = ( + """>>> rng.floor('H') + DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', + '2018-01-01 12:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.floor("H") + 0 2018-01-01 11:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 12:00:00 + dtype: datetime64[ns] + """ + ) + + _ceil_example = ( + """>>> rng.ceil('H') + DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', + '2018-01-01 13:00:00'], + dtype='datetime64[ns]', freq=None) + + **Series** + + >>> pd.Series(rng).dt.ceil("H") + 0 2018-01-01 12:00:00 + 1 2018-01-01 12:00:00 + 2 2018-01-01 13:00:00 + dtype: datetime64[ns] + """ + ) + + def _round(self, freq, mode, ambiguous, nonexistent): + # round the local times + values = _ensure_datetimelike_to_i8(self) + result = round_nsint64(values, mode, freq) + result = self._maybe_mask_results(result, fill_value=NaT) + + attribs = self._get_attributes_dict() + attribs['freq'] = None + if 'tz' in attribs: + attribs['tz'] = None + return self._ensure_localized( + self._shallow_copy(result, **attribs), ambiguous, nonexistent + ) + + @Appender((_round_doc + _round_example).format(op="round")) + def round(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round( + freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent + ) + + @Appender((_round_doc + _floor_example).format(op="floor")) + def floor(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) + + @Appender((_round_doc + _ceil_example).format(op="ceil")) + def ceil(self, freq, ambiguous='raise', nonexistent='raise'): + return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) + + class DatetimeLikeArrayMixin(ExtensionOpsMixin, AttributesMixin): """ Shared Base/Mixin class for DatetimeArray, TimedeltaArray, PeriodArray @@ -835,8 +1019,7 @@ def __isub__(self, other): def _evaluate_compare(self, other, op): """ We have been called because a comparison between - 8 aware arrays. numpy >= 1.11 will - now warn about NaT comparisons + 8 aware arrays. 
numpy will warn about NaT comparisons """ # Called by comparison methods when comparing datetimelike # with datetimelike @@ -1024,3 +1207,39 @@ def validate_dtype_freq(dtype, freq): raise IncompatibleFrequency('specified freq and dtype ' 'are different') return freq + + +def _ensure_datetimelike_to_i8(other, to_utc=False): + """ + Helper for coercing an input scalar or array to i8. + + Parameters + ---------- + other : 1d array + to_utc : bool, default False + If True, convert the values to UTC before extracting the i8 values + If False, extract the i8 values directly. + + Returns + ------- + i8 1d array + """ + from pandas import Index + from pandas.core.arrays import PeriodArray + + if lib.is_scalar(other) and isna(other): + return iNaT + elif isinstance(other, (PeriodArray, ABCIndexClass)): + # convert tz if needed + if getattr(other, 'tz', None) is not None: + if to_utc: + other = other.tz_convert('UTC') + else: + other = other.tz_localize(None) + else: + try: + return np.array(other, copy=False).view('i8') + except TypeError: + # period array cannot be coerced to int + other = Index(other) + return other.asi8 diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index ae366149ab8999..45630f8109932d 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -6,17 +6,17 @@ from pytz import utc from pandas._libs import lib, tslib -from pandas._libs.tslib import NaT, Timestamp, iNaT from pandas._libs.tslibs import ( - ccalendar, conversion, fields, normalize_date, resolution as libresolution, - timezones) + NaT, Timestamp, ccalendar, conversion, fields, iNaT, normalize_date, + resolution as libresolution, timezones) import pandas.compat as compat from pandas.errors import PerformanceWarning from pandas.util._decorators import Appender, cache_readonly from pandas.core.dtypes.common import ( - _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, is_int64_dtype, - is_object_dtype) + _NS_DTYPE, is_datetime64_dtype, is_datetime64tz_dtype, is_extension_type, + is_float_dtype, is_int64_dtype, is_object_dtype, is_period_dtype, + is_timedelta64_dtype) from pandas.core.dtypes.dtypes import DatetimeTZDtype from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -155,7 +155,9 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin): +class DatetimeArrayMixin(dtl.DatetimeLikeArrayMixin, + dtl.TimelikeOps, + dtl.DatelikeOps): """ Assumes that subclass __new__/__init__ defines: tz @@ -1421,6 +1423,163 @@ def to_julian_date(self): DatetimeArrayMixin._add_datetimelike_methods() +# ------------------------------------------------------------------- +# Constructor Helpers + +def maybe_infer_tz(tz, inferred_tz): + """ + If a timezone is inferred from data, check that it is compatible with + the user-provided timezone, if any. 
+ + Parameters + ---------- + tz : tzinfo or None + inferred_tz : tzinfo or None + + Returns + ------- + tz : tzinfo or None + + Raises + ------ + TypeError : if both timezones are present but do not match + """ + if tz is None: + tz = inferred_tz + elif inferred_tz is None: + pass + elif not timezones.tz_compare(tz, inferred_tz): + raise TypeError('data is already tz-aware {inferred_tz}, unable to ' + 'set specified tz: {tz}' + .format(inferred_tz=inferred_tz, tz=tz)) + return tz + + +def maybe_convert_dtype(data, copy): + """ + Convert data based on dtype conventions, issuing deprecation warnings + or errors where appropriate. + + Parameters + ---------- + data : np.ndarray or pd.Index + copy : bool + + Returns + ------- + data : np.ndarray or pd.Index + copy : bool + + Raises + ------ + TypeError : PeriodDType data is passed + """ + if is_float_dtype(data): + # Note: we must cast to datetime64[ns] here in order to treat these + # as wall-times instead of UTC timestamps. + data = data.astype(_NS_DTYPE) + copy = False + # TODO: deprecate this behavior to instead treat symmetrically + # with integer dtypes. See discussion in GH#23675 + + elif is_timedelta64_dtype(data): + warnings.warn("Passing timedelta64-dtype data is deprecated, will " + "raise a TypeError in a future version", + FutureWarning, stacklevel=3) + data = data.view(_NS_DTYPE) + + elif is_period_dtype(data): + # Note: without explicitly raising here, PeriondIndex + # test_setops.test_join_does_not_recur fails + raise TypeError("Passing PeriodDtype data is invalid. " + "Use `data.to_timestamp()` instead") + + elif is_extension_type(data) and not is_datetime64tz_dtype(data): + # Includes categorical + # TODO: We have no tests for these + data = np.array(data, dtype=np.object_) + copy = False + + return data, copy + + +def objects_to_datetime64ns(data, dayfirst, yearfirst, + utc=False, errors="raise", + require_iso8601=False, allow_object=False): + """ + Convert data to array of timestamps. + + Parameters + ---------- + data : np.ndarray[object] + dayfirst : bool + yearfirst : bool + utc : bool, default False + Whether to convert timezone-aware timestamps to UTC + errors : {'raise', 'ignore', 'coerce'} + allow_object : bool + Whether to return an object-dtype ndarray instead of raising if the + data contains more than one timezone. 
+ + Returns + ------- + result : ndarray + np.int64 dtype if returned values represent UTC timestamps + np.datetime64[ns] if returned values represent wall times + object if mixed timezones + inferred_tz : tzinfo or None + + Raises + ------ + ValueError : if data cannot be converted to datetimes + """ + assert errors in ["raise", "ignore", "coerce"] + + # if str-dtype, convert + data = np.array(data, copy=False, dtype=np.object_) + + try: + result, tz_parsed = tslib.array_to_datetime( + data, + errors=errors, + utc=utc, + dayfirst=dayfirst, + yearfirst=yearfirst, + require_iso8601=require_iso8601 + ) + except ValueError as e: + try: + values, tz_parsed = conversion.datetime_to_datetime64(data) + # If tzaware, these values represent unix timestamps, so we + # return them as i8 to distinguish from wall times + return values.view('i8'), tz_parsed + except (ValueError, TypeError): + raise e + + if tz_parsed is not None: + # We can take a shortcut since the datetime64 numpy array + # is in UTC + # Return i8 values to denote unix timestamps + return result.view('i8'), tz_parsed + elif is_datetime64_dtype(result): + # returning M8[ns] denotes wall-times; since tz is None + # the distinction is a thin one + return result, tz_parsed + elif is_object_dtype(result): + # GH#23675 when called via `pd.to_datetime`, returning an object-dtype + # array is allowed. When called via `pd.DatetimeIndex`, we can + # only accept datetime64 dtype, so raise TypeError if object-dtype + # is returned, as that indicates the values can be recognized as + # datetimes but they have conflicting timezones/awareness + if allow_object: + return result, tz_parsed + raise TypeError(result) + else: # pragma: no cover + # GH#23675 this TypeError should never be hit, whereas the TypeError + # in the object-dtype branch above is reachable. + raise TypeError(result) + + def _generate_regular_range(cls, start, end, periods, freq): """ Generate a range of dates with the spans between dates described by diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 70be850481d857..b055bc3f2eb526 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -74,6 +74,7 @@ length values is_non_overlapping_monotonic +%(extra_attributes)s\ Methods ------- @@ -107,6 +108,7 @@ summary="Pandas array for interval data that are closed on the same side.", versionadded="0.24.0", name='', + extra_attributes='', extra_methods='', examples=textwrap.dedent("""\ Examples diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 53629dca4d3912..6ed68ee890e403 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -4,8 +4,7 @@ import numpy as np -from pandas._libs.tslib import NaT, iNaT -from pandas._libs.tslibs import period as libperiod +from pandas._libs.tslibs import NaT, iNaT, period as libperiod from pandas._libs.tslibs.fields import isleapyear_arr from pandas._libs.tslibs.period import ( DIFFERENT_FREQ_INDEX, IncompatibleFrequency, Period, get_period_field_arr, @@ -347,7 +346,7 @@ def __repr__(self): def __setitem__( self, - key, # type: Union[int, Sequence[int], Sequence[bool]] + key, # type: Union[int, Sequence[int], Sequence[bool], slice] value # type: Union[NaTType, Period, Sequence[Period]] ): # type: (...) -> None @@ -357,11 +356,14 @@ def __setitem__( # ndarray[datetime64ns]. I think ndarray[int] / ndarray[str] won't # work, since the freq can't be inferred. 
if is_list_like(value): - if len(key) != len(value) and not com.is_bool_indexer(key): + is_slice = isinstance(key, slice) + if (not is_slice + and len(key) != len(value) + and not com.is_bool_indexer(key)): msg = ("shape mismatch: value array of length '{}' does not " "match indexing result of length '{}'.") raise ValueError(msg.format(len(key), len(value))) - if len(key) == 0: + if not is_slice and len(key) == 0: return value = period_array(value) diff --git a/pandas/core/arrays/sparse.py b/pandas/core/arrays/sparse.py index 24e33c32d08986..9a5ef3b3a7dd0e 100644 --- a/pandas/core/arrays/sparse.py +++ b/pandas/core/arrays/sparse.py @@ -1015,9 +1015,6 @@ def __getitem__(self, key): key = np.asarray(key) if com.is_bool_indexer(key) and len(self) == len(key): - # TODO(numpy 1.11): Remove this asarray. - # Old NumPy didn't treat array-like as boolean masks. - key = np.asarray(key) return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, '__len__'): return self.take(key) diff --git a/pandas/core/arrays/timedeltas.py b/pandas/core/arrays/timedeltas.py index d1e6d979b554c4..c0cfa996810bc1 100644 --- a/pandas/core/arrays/timedeltas.py +++ b/pandas/core/arrays/timedeltas.py @@ -2,12 +2,11 @@ from __future__ import division from datetime import timedelta -import operator import warnings import numpy as np -from pandas._libs import tslibs +from pandas._libs import algos, lib, tslibs from pandas._libs.tslibs import NaT, Timedelta, Timestamp, iNaT from pandas._libs.tslibs.fields import get_timedelta_field from pandas._libs.tslibs.timedeltas import ( @@ -17,14 +16,13 @@ from pandas.core.dtypes.common import ( _TD_DTYPE, ensure_int64, is_datetime64_dtype, is_float_dtype, - is_integer_dtype, is_list_like, is_object_dtype, is_string_dtype, - is_timedelta64_dtype) + is_integer_dtype, is_list_like, is_object_dtype, is_scalar, + is_string_dtype, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( ABCDataFrame, ABCIndexClass, ABCSeries, ABCTimedeltaIndex) from pandas.core.dtypes.missing import isna -from pandas.core import ops -from pandas.core.algorithms import checked_add_with_arr +from pandas.core.algorithms import checked_add_with_arr, unique1d import pandas.core.common as com from pandas.tseries.frequencies import to_offset @@ -106,30 +104,7 @@ def wrapper(self, other): return compat.set_function_name(wrapper, opname, cls) -def _wrap_tdi_op(op): - """ - Instead of re-implementing multiplication/division etc operations - in the Array class, for now we dispatch to the TimedeltaIndex - implementations. 
- """ - # TODO: implement directly here and wrap in TimedeltaIndex, instead of - # the other way around - def method(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): - return NotImplemented - - from pandas import TimedeltaIndex - obj = TimedeltaIndex(self) - result = op(obj, other) - if is_timedelta64_dtype(result): - return type(self)(result) - return np.array(result) - - method.__name__ = '__{name}__'.format(name=op.__name__) - return method - - -class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin): +class TimedeltaArrayMixin(dtl.DatetimeLikeArrayMixin, dtl.TimelikeOps): _typ = "timedeltaarray" __array_priority__ = 1000 @@ -162,16 +137,36 @@ def _simple_new(cls, values, freq=None, dtype=_TD_DTYPE): result._freq = freq return result - def __new__(cls, values, freq=None, dtype=_TD_DTYPE): + def __new__(cls, values, freq=None, dtype=_TD_DTYPE, copy=False): + return cls._from_sequence(values, freq=freq, dtype=dtype, copy=copy) - freq, freq_infer = dtl.maybe_infer_freq(freq) + @classmethod + def _from_sequence(cls, data, freq=None, unit=None, + dtype=_TD_DTYPE, copy=False): + if dtype != _TD_DTYPE: + raise ValueError("Only timedelta64[ns] dtype is valid.") - values = np.array(values, copy=False) - if values.dtype == np.object_: - values = array_to_timedelta64(values) + freq, freq_infer = dtl.maybe_infer_freq(freq) - result = cls._simple_new(values, freq=freq) - if freq_infer: + data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) + if inferred_freq is not None: + if freq is not None and freq != inferred_freq: + raise ValueError('Inferred frequency {inferred} from passed ' + 'values does not conform to passed frequency ' + '{passed}' + .format(inferred=inferred_freq, + passed=freq.freqstr)) + elif freq is None: + freq = inferred_freq + freq_infer = False + + result = cls._simple_new(data, freq=freq) + + if inferred_freq is None and freq is not None: + # this condition precludes `freq_infer` + cls._validate_frequency(result, freq) + + elif freq_infer: result.freq = to_offset(result.inferred_freq) return result @@ -227,6 +222,21 @@ def _validate_fill_value(self, fill_value): "Got '{got}'.".format(got=fill_value)) return fill_value + # monotonicity/uniqueness properties are called via frequencies.infer_freq, + # see GH#23789 + + @property + def _is_monotonic_increasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[0] + + @property + def _is_monotonic_decreasing(self): + return algos.is_monotonic(self.asi8, timelike=True)[1] + + @property + def _is_unique(self): + return len(unique1d(self.asi8)) == len(self) + # ---------------------------------------------------------------- # Arithmetic Methods @@ -241,7 +251,7 @@ def _add_offset(self, other): def _add_delta(self, delta): """ Add a timedelta-like, Tick, or TimedeltaIndex-like object - to self, yielding a new TimedeltaArray + to self, yielding a new TimedeltaArray. Parameters ---------- @@ -256,7 +266,9 @@ def _add_delta(self, delta): return type(self)(new_values, freq='infer') def _add_datetime_arraylike(self, other): - """Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray""" + """ + Add DatetimeArray/Index or ndarray[datetime64] to TimedeltaArray. 
+ """ if isinstance(other, np.ndarray): # At this point we have already checked that dtype is datetime64 from pandas.core.arrays import DatetimeArrayMixin @@ -281,7 +293,7 @@ def _add_datetimelike_scalar(self, other): result = checked_add_with_arr(i8, other.value, arr_mask=self._isnan) result = self._maybe_mask_results(result) - return DatetimeArrayMixin(result, tz=other.tz) + return DatetimeArrayMixin(result, tz=other.tz, freq=self.freq) def _addsub_offset_array(self, other, op): # Add or subtract Array-like of DateOffset objects @@ -295,41 +307,300 @@ def _addsub_offset_array(self, other, op): raise TypeError("Cannot add/subtract non-tick DateOffset to {cls}" .format(cls=type(self).__name__)) - def _evaluate_with_timedelta_like(self, other, op): - if isinstance(other, ABCSeries): - # GH#19042 + def __mul__(self, other): + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCDataFrame, ABCSeries, ABCIndexClass)): return NotImplemented - opstr = '__{opname}__'.format(opname=op.__name__).replace('__r', '__') - # allow division by a timedelta - if opstr in ['__div__', '__truediv__', '__floordiv__']: - if _is_convertible_to_td(other): - other = Timedelta(other) - if isna(other): - raise NotImplementedError( - "division by pd.NaT not implemented") - - i8 = self.asi8 - left, right = i8, other.value - - if opstr in ['__floordiv__']: - result = op(left, right) - else: - result = op(left, np.float64(right)) - result = self._maybe_mask_results(result, fill_value=None, - convert='float64') - return result + if is_scalar(other): + # numpy will accept float and int, raise TypeError for others + result = self._data * other + freq = None + if self.freq is not None and not isna(other): + freq = self.freq * other + return type(self)(result, freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self) and not is_timedelta64_dtype(other): + # Exclude timedelta64 here so we correctly raise TypeError + # for that instead of ValueError + raise ValueError("Cannot multiply with unequal lengths") + + if is_object_dtype(other): + # this multiplication will succeed only if all elements of other + # are int or float scalars, so we will end up with + # timedelta64[ns]-dtyped result + result = [self[n] * other[n] for n in range(len(self))] + result = np.array(result) + return type(self)(result) - return NotImplemented + # numpy will accept float or int dtype, raise TypeError for others + result = self._data * other + return type(self)(result) - __mul__ = _wrap_tdi_op(operator.mul) __rmul__ = __mul__ - __truediv__ = _wrap_tdi_op(operator.truediv) - __floordiv__ = _wrap_tdi_op(operator.floordiv) - __rfloordiv__ = _wrap_tdi_op(ops.rfloordiv) + + def __truediv__(self, other): + # timedelta / X is well-defined for timedelta-like or numeric X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return self._data / other + + elif lib.is_scalar(other): + # assume it is numeric + result = self._data / other + freq = None + if self.freq is not None: + # Tick division is not implemented, so operate on Timedelta + freq = self.freq.delta / other + return type(self)(result, freq=freq) + + if not 
hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other): + # let numpy handle it + return self._data / other + + elif is_object_dtype(other): + # Note: we do not do type inference on the result, so either + # an object array or numeric-dtyped (if numpy does inference) + # will be returned. GH#23829 + result = [self[n] / other[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + result = self._data / other + return type(self)(result) + + def __rtruediv__(self, other): + # X / timedelta is defined only for timedelta-like X + other = lib.item_from_zerodim(other) + + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # specifically timedelta64-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # otherwise, dispatch to Timedelta implementation + return other / self._data + + elif lib.is_scalar(other): + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=type(other).__name__, + cls=type(self).__name__)) + + if not hasattr(other, "dtype"): + # e.g. list, tuple + other = np.array(other) + + if len(other) != len(self): + raise ValueError("Cannot divide vectors with unequal lengths") + + elif is_timedelta64_dtype(other): + # let numpy handle it + return other / self._data + + elif is_object_dtype(other): + # Note: unlike in __truediv__, we do not _need_ to do type# + # inference on the result. It does not raise, a numeric array + # is returned. GH#23829 + result = [other[n] / self[n] for n in range(len(self))] + return np.array(result) + + else: + raise TypeError("Cannot divide {dtype} data by {cls}" + .format(dtype=other.dtype, + cls=type(self).__name__)) if compat.PY2: __div__ = __truediv__ + __rdiv__ = __rtruediv__ + + def __floordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__rfloordiv__(self._data) + return result + + # at this point we should only have numeric scalars; anything + # else will raise + result = self.asi8 // other + result[self._isnan] = iNaT + freq = None + if self.freq is not None: + # Note: freq gets division, not floor-division + freq = self.freq / other + return type(self)(result.view('m8[ns]'), freq=freq) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = self.asi8 // other.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other): + result = [self[n] // other[n] for n in range(len(self))] + result = np.array(result) + if lib.infer_dtype(result) == 'timedelta': + result, _ = 
sequence_to_td64ns(result) + return type(self)(result) + return result + + elif is_integer_dtype(other) or is_float_dtype(other): + result = self._data // other + return type(self)(result) + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=dtype, cls=type(self).__name__)) + + def __rfloordiv__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if is_scalar(other): + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + if other is NaT: + # treat this specifically as timedelta-NaT + result = np.empty(self.shape, dtype=np.float64) + result.fill(np.nan) + return result + + # dispatch to Timedelta implementation + result = other.__floordiv__(self._data) + return result + + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=type(other).__name__, + cls=type(self).__name__)) + + if not hasattr(other, "dtype"): + # list, tuple + other = np.array(other) + if len(other) != len(self): + raise ValueError("Cannot divide with unequal lengths") + + elif is_timedelta64_dtype(other): + other = type(self)(other) + + # numpy timedelta64 does not natively support floordiv, so operate + # on the i8 values + result = other.asi8 // self.asi8 + mask = self._isnan | other._isnan + if mask.any(): + result = result.astype(np.int64) + result[mask] = np.nan + return result + + elif is_object_dtype(other): + result = [other[n] // self[n] for n in range(len(self))] + result = np.array(result) + return result + + else: + dtype = getattr(other, "dtype", type(other).__name__) + raise TypeError("Cannot divide {typ} by {cls}" + .format(typ=dtype, cls=type(self).__name__)) + + def __mod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return self - (self // other) * other + + def __rmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + return other - (other // self) * self + + def __divmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = self // other + res2 = self - res1 * other + return res1, res2 + + def __rdivmod__(self, other): + # Note: This is a naive implementation, can likely be optimized + if isinstance(other, (ABCSeries, ABCDataFrame, ABCIndexClass)): + return NotImplemented + + other = lib.item_from_zerodim(other) + if isinstance(other, (timedelta, np.timedelta64, Tick)): + other = Timedelta(other) + + res1 = other // self + res2 = other - res1 * self + return res1, res2 # Note: TimedeltaIndex overrides this in call to cls._add_numeric_methods def __neg__(self): @@ -404,7 +675,7 @@ def total_seconds(self): def to_pytimedelta(self): """ Return Timedelta Array/Index as object ndarray of datetime.timedelta - objects + objects. 
Returns ------- @@ -539,7 +810,7 @@ def sequence_to_td64ns(data, copy=False, unit="ns", errors="raise"): warnings.warn("Passing datetime64-dtype data to TimedeltaIndex is " "deprecated, will raise a TypeError in a future " "version", - FutureWarning, stacklevel=3) + FutureWarning, stacklevel=4) data = ensure_int64(data).view(_TD_DTYPE) else: diff --git a/pandas/core/base.py b/pandas/core/base.py index fd303182959a5d..86de25444cf4c0 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -15,8 +15,8 @@ from pandas.util._validators import validate_bool_kwarg from pandas.core.dtypes.common import ( - is_datetimelike, is_extension_array_dtype, is_extension_type, is_list_like, - is_object_dtype, is_scalar) + is_datetime64tz_dtype, is_datetimelike, is_extension_array_dtype, + is_extension_type, is_list_like, is_object_dtype, is_scalar) from pandas.core.dtypes.generic import ABCDataFrame, ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna @@ -777,6 +777,125 @@ def base(self): FutureWarning, stacklevel=2) return self.values.base + @property + def array(self): + # type: () -> Union[np.ndarray, ExtensionArray] + """ + The actual Array backing this Series or Index. + + .. versionadded:: 0.24.0 + + Returns + ------- + array : numpy.ndarray or ExtensionArray + This is the actual array stored within this object. This differs + from ``.values`` which may require converting the data + to a different form. + + See Also + -------- + Index.to_numpy : Similar method that always returns a NumPy array. + Series.to_numpy : Similar method that always returns a NumPy array. + + Notes + ----- + This table lays out the different array types for each extension + dtype within pandas. + + ================== ============================= + dtype array type + ================== ============================= + category Categorical + period PeriodArray + interval IntervalArray + IntegerNA IntegerArray + datetime64[ns, tz] DatetimeArray + ================== ============================= + + For any 3rd-party extension types, the array type will be an + ExtensionArray. + + For all remaining dtypes ``.array`` will be the :class:`numpy.ndarray` + stored within. If you absolutely need a NumPy array (possibly with + copying / coercing data), then use :meth:`Series.to_numpy` instead. + + .. note:: + + ``.array`` will always return the underlying object backing the + Series or Index. If a future version of pandas adds a specialized + extension type for a data type, then the return type of ``.array`` + for that data type will change from an object-dtype ndarray to the + new ExtensionArray. + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.array + [a, b, a] + Categories (2, object): [a, b] + """ + return self._values + + def to_numpy(self): + """ + A NumPy ndarray representing the values in this Series or Index. + + .. versionadded:: 0.24.0 + + The returned array will be the same up to equality (values equal + in `self` will be equal in the returned array; likewise for values + that are not equal). When `self` contains an ExtensionArray, the + dtype may be different. For example, for a category-dtype Series, + ``to_numpy()`` will return a NumPy array and the categorical dtype + will be lost. + + Returns + ------- + numpy.ndarray + + See Also + -------- + Series.array : Get the actual data stored within. + Index.array : Get the actual data stored within. + DataFrame.to_numpy : Similar method for DataFrame. 
+ + Notes + ----- + For NumPy dtypes, this will be a reference to the actual data stored + in this Series or Index. Modifying the result in place will modify + the data stored in the Series or Index (not that we recommend doing + that). + + For extension types, ``to_numpy()`` *may* require copying data and + coercing the result to a NumPy type (possibly object), which may be + expensive. When you need a no-copy reference to the underlying data, + :attr:`Series.array` should be used instead. + + This table lays out the different dtypes and return types of + ``to_numpy()`` for various dtypes within pandas. + + ================== ================================ + dtype array type + ================== ================================ + category[T] ndarray[T] (same dtype as input) + period ndarray[object] (Periods) + interval ndarray[object] (Intervals) + IntegerNA ndarray[object] + datetime64[ns, tz] ndarray[object] (Timestamps) + ================== ================================ + + Examples + -------- + >>> ser = pd.Series(pd.Categorical(['a', 'b', 'a'])) + >>> ser.to_numpy() + array(['a', 'b', 'a'], dtype=object) + """ + if (is_extension_array_dtype(self.dtype) or + is_datetime64tz_dtype(self.dtype)): + # TODO(DatetimeArray): remove the second clause. + return np.asarray(self._values) + return self._values + @property def _ndarray_values(self): # type: () -> np.ndarray diff --git a/pandas/core/computation/align.py b/pandas/core/computation/align.py index f7f40a66af9c63..951174648091f7 100644 --- a/pandas/core/computation/align.py +++ b/pandas/core/computation/align.py @@ -30,9 +30,8 @@ def _align_core_single_unary_op(term): def _zip_axes_from_type(typ, new_axes): - axes = {} - for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES): - axes[ax_name] = new_axes[ax_ind] + axes = {ax_name: new_axes[ax_ind] + for ax_ind, ax_name in compat.iteritems(typ._AXIS_NAMES)} return axes diff --git a/pandas/core/config.py b/pandas/core/config.py index 6b50ab9ffe7d44..0f43ca65d187ab 100644 --- a/pandas/core/config.py +++ b/pandas/core/config.py @@ -395,11 +395,8 @@ def __init__(self, *args): self.ops = list(zip(args[::2], args[1::2])) def __enter__(self): - undo = [] - for pat, val in self.ops: - undo.append((pat, _get_option(pat, silent=True))) - - self.undo = undo + self.undo = [(pat, _get_option(pat, silent=True)) + for pat, val in self.ops] for pat, val in self.ops: _set_option(pat, val, silent=True) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 3c5f8830441f71..eae9eb97f35fef 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -13,11 +13,11 @@ ensure_int8, ensure_int16, ensure_int32, ensure_int64, ensure_object, is_bool, is_bool_dtype, is_categorical_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, is_datetime64_ns_dtype, is_datetime64tz_dtype, - is_datetime_or_timedelta_dtype, is_datetimelike, is_datetimetz, - is_dtype_equal, is_extension_array_dtype, is_extension_type, is_float, - is_float_dtype, is_integer, is_integer_dtype, is_object_dtype, is_scalar, - is_string_dtype, is_timedelta64_dtype, is_timedelta64_ns_dtype, - is_unsigned_integer_dtype, pandas_dtype) + is_datetime_or_timedelta_dtype, is_datetimelike, is_dtype_equal, + is_extension_array_dtype, is_extension_type, is_float, is_float_dtype, + is_integer, is_integer_dtype, is_object_dtype, is_scalar, is_string_dtype, + is_timedelta64_dtype, is_timedelta64_ns_dtype, is_unsigned_integer_dtype, + pandas_dtype) from .dtypes import ( DatetimeTZDtype, ExtensionDtype, 
PandasExtensionDtype, PeriodDtype) from .generic import ABCDatetimeIndex, ABCPeriodIndex, ABCSeries @@ -263,29 +263,11 @@ def maybe_promote(dtype, fill_value=np.nan): fill_value = np.nan # returns tuple of (dtype, fill_value) - if issubclass(dtype.type, (np.datetime64, np.timedelta64)): - # for now: refuse to upcast datetime64 - # (this is because datetime64 will not implicitly upconvert - # to object correctly as of numpy 1.6.1) - if isna(fill_value): - fill_value = iNaT - else: - if issubclass(dtype.type, np.datetime64): - try: - fill_value = tslibs.Timestamp(fill_value).value - except Exception: - # the proper thing to do here would probably be to upcast - # to object (but numpy 1.6.1 doesn't do this properly) - fill_value = iNaT - elif issubclass(dtype.type, np.timedelta64): - try: - fill_value = tslibs.Timedelta(fill_value).value - except Exception: - # as for datetimes, cannot upcast to object - fill_value = iNaT - else: - fill_value = iNaT - elif is_datetimetz(dtype): + if issubclass(dtype.type, np.datetime64): + fill_value = tslibs.Timestamp(fill_value).value + elif issubclass(dtype.type, np.timedelta64): + fill_value = tslibs.Timedelta(fill_value).value + elif is_datetime64tz_dtype(dtype): if isna(fill_value): fill_value = iNaT elif is_extension_array_dtype(dtype) and isna(fill_value): @@ -328,7 +310,7 @@ def maybe_promote(dtype, fill_value=np.nan): # in case we have a string that looked like a number if is_extension_array_dtype(dtype): pass - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): pass elif issubclass(np.dtype(dtype).type, string_types): dtype = np.object_ @@ -564,34 +546,6 @@ def invalidate_string_dtypes(dtype_set): raise TypeError("string dtypes are not allowed, use 'object' instead") -def maybe_convert_string_to_object(values): - """ - - Convert string-like and string-like array to convert object dtype. - This is to avoid numpy to handle the array as str dtype. - """ - if isinstance(values, string_types): - values = np.array([values], dtype=object) - elif (isinstance(values, np.ndarray) and - issubclass(values.dtype.type, (np.string_, np.unicode_))): - values = values.astype(object) - return values - - -def maybe_convert_scalar(values): - """ - Convert a python scalar to the appropriate numpy dtype if possible - This avoids numpy directly converting according to platform preferences - """ - if is_scalar(values): - dtype, values = infer_dtype_from_scalar(values) - try: - values = dtype(values) - except TypeError: - pass - return values - - def coerce_indexer_dtype(indexer, categories): """ coerce the indexer input array to the smallest dtype possible """ length = len(categories) @@ -1206,7 +1160,7 @@ def construct_1d_arraylike_from_scalar(value, length, dtype): np.ndarray / pandas type of length, filled with value """ - if is_datetimetz(dtype): + if is_datetime64tz_dtype(dtype): from pandas import DatetimeIndex subarr = DatetimeIndex([value] * length, dtype=dtype) elif is_categorical_dtype(dtype): diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index de7e453e80d85f..51b8488313e996 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -1,4 +1,6 @@ """ common type operations """ +import warnings + import numpy as np from pandas._libs import algos, lib @@ -287,6 +289,8 @@ def is_datetimetz(arr): Check whether an array-like is a datetime array-like with a timezone component in its dtype. + .. 
deprecated:: 0.24.0 + Parameters ---------- arr : array-like @@ -320,12 +324,10 @@ def is_datetimetz(arr): True """ - # TODO: do we need this function? - # It seems like a repeat of is_datetime64tz_dtype. - - return ((isinstance(arr, ABCDatetimeIndex) and - getattr(arr, 'tz', None) is not None) or - is_datetime64tz_dtype(arr)) + warnings.warn("'is_datetimetz' is deprecated and will be removed in a " + "future version. Use 'is_datetime64tz_dtype' instead.", + FutureWarning, stacklevel=2) + return is_datetime64tz_dtype(arr) def is_offsetlike(arr_or_obj): @@ -363,6 +365,8 @@ def is_period(arr): """ Check whether an array-like is a periodical index. + .. deprecated:: 0.24.0 + Parameters ---------- arr : array-like @@ -382,8 +386,10 @@ def is_period(arr): True """ - # TODO: do we need this function? - # It seems like a repeat of is_period_arraylike. + warnings.warn("'is_period' is deprecated and will be removed in a future " + "version. Use 'is_period_dtype' or is_period_arraylike' " + "instead.", FutureWarning, stacklevel=2) + return isinstance(arr, ABCPeriodIndex) or is_period_arraylike(arr) @@ -743,8 +749,7 @@ def is_datetimelike(arr): return (is_datetime64_dtype(arr) or is_datetime64tz_dtype(arr) or is_timedelta64_dtype(arr) or - isinstance(arr, ABCPeriodIndex) or - is_datetimetz(arr)) + isinstance(arr, ABCPeriodIndex)) def is_dtype_equal(source, target): @@ -1050,54 +1055,6 @@ def is_int64_dtype(arr_or_dtype): return issubclass(tipo, np.int64) -def is_int_or_datetime_dtype(arr_or_dtype): - """ - Check whether the provided array or dtype is of an - integer, timedelta64, or datetime64 dtype. - - Parameters - ---------- - arr_or_dtype : array-like - The array or dtype to check. - - Returns - ------- - boolean : Whether or not the array or dtype is of an - integer, timedelta64, or datetime64 dtype. - - Examples - -------- - >>> is_int_or_datetime_dtype(str) - False - >>> is_int_or_datetime_dtype(int) - True - >>> is_int_or_datetime_dtype(float) - False - >>> is_int_or_datetime_dtype(np.uint64) - True - >>> is_int_or_datetime_dtype(np.datetime64) - True - >>> is_int_or_datetime_dtype(np.timedelta64) - True - >>> is_int_or_datetime_dtype(np.array(['a', 'b'])) - False - >>> is_int_or_datetime_dtype(pd.Series([1, 2])) - True - >>> is_int_or_datetime_dtype(np.array([], dtype=np.timedelta64)) - True - >>> is_int_or_datetime_dtype(np.array([], dtype=np.datetime64)) - True - >>> is_int_or_datetime_dtype(pd.Index([1, 2.])) # float - False - """ - - if arr_or_dtype is None: - return False - tipo = _get_dtype_type(arr_or_dtype) - return (issubclass(tipo, np.integer) or - issubclass(tipo, (np.datetime64, np.timedelta64))) - - def is_datetime64_any_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of the datetime64 dtype. @@ -1619,22 +1576,6 @@ def is_float_dtype(arr_or_dtype): return issubclass(tipo, np.floating) -def is_floating_dtype(arr_or_dtype): - """Check whether the provided array or dtype is an instance of - numpy's float dtype. - - .. deprecated:: 0.20.0 - - Unlike, `is_float_dtype`, this check is a lot stricter, as it requires - `isinstance` of `np.floating` and not `issubclass`. - """ - - if arr_or_dtype is None: - return False - tipo = _get_dtype_type(arr_or_dtype) - return isinstance(tipo, np.floating) - - def is_bool_dtype(arr_or_dtype): """ Check whether the provided array or dtype is of a boolean dtype. 
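For code moving off the two helpers deprecated above, a minimal before/after sketch (assuming pandas 0.24+, where the replacements are exposed in ``pandas.api.types``):

import pandas as pd
from pandas.api.types import is_datetime64tz_dtype, is_period_dtype

tz_ser = pd.Series(pd.date_range("2018-01-01", periods=3, tz="UTC"))
# previously: pd.api.types.is_datetimetz(tz_ser)  -> now emits a FutureWarning
is_datetime64tz_dtype(tz_ser)    # True

pi = pd.period_range("2018-01", periods=3, freq="M")
# previously: pd.api.types.is_period(pi)          -> now emits a FutureWarning
is_period_dtype(pi)              # True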
@@ -1758,7 +1699,7 @@ def is_extension_type(arr): return True elif is_sparse(arr): return True - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): return True return False @@ -1991,7 +1932,7 @@ def _get_dtype_from_object(dtype): return dtype elif is_categorical(dtype): return CategoricalDtype().type - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): return DatetimeTZDtype(dtype).type elif isinstance(dtype, np.dtype): # dtype object try: diff --git a/pandas/core/dtypes/concat.py b/pandas/core/dtypes/concat.py index f482f7e1927b7a..58f1bcbfa74c01 100644 --- a/pandas/core/dtypes/concat.py +++ b/pandas/core/dtypes/concat.py @@ -8,7 +8,7 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, is_bool_dtype, is_categorical_dtype, - is_datetime64_dtype, is_datetimetz, is_dtype_equal, + is_datetime64_dtype, is_datetime64tz_dtype, is_dtype_equal, is_extension_array_dtype, is_interval_dtype, is_object_dtype, is_period_dtype, is_sparse, is_timedelta64_dtype) from pandas.core.dtypes.generic import ( @@ -39,7 +39,7 @@ def get_dtype_kinds(l): typ = 'sparse' elif isinstance(arr, ABCRangeIndex): typ = 'range' - elif is_datetimetz(arr): + elif is_datetime64tz_dtype(arr): # if to_concat contains different tz, # the result must be object dtype typ = str(arr.dtype) @@ -373,10 +373,8 @@ def _maybe_unwrap(x): if sort_categories: categories = categories.sort_values() - new_codes = [] - for c in to_union: - new_codes.append(_recode_for_categories(c.codes, c.categories, - categories)) + new_codes = [_recode_for_categories(c.codes, c.categories, categories) + for c in to_union] new_codes = np.concatenate(new_codes) else: # ordered - to show a proper error message diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index bd7c6630c7c5df..fee983f9692216 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -338,12 +338,7 @@ def _hash_categories(categories, ordered=True): cat_array = [cat_array] hashed = _combine_hash_arrays(iter(cat_array), num_items=len(cat_array)) - if len(hashed) == 0: - # bug in Numpy<1.12 for length 0 arrays. Just return the correct - # value of 0 - return 0 - else: - return np.bitwise_xor.reduce(hashed) + return np.bitwise_xor.reduce(hashed) @classmethod def construct_array_type(cls): diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 032d68e2d3e8c1..b9f32042924b93 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -54,7 +54,7 @@ is_object_dtype, is_extension_type, is_extension_array_dtype, - is_datetimetz, + is_datetime64tz_dtype, is_datetime64_any_dtype, is_bool_dtype, is_integer_dtype, @@ -285,7 +285,8 @@ class DataFrame(NDFrame): - """ Two-dimensional size-mutable, potentially heterogeneous tabular data + """ + Two-dimensional size-mutable, potentially heterogeneous tabular data structure with labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure. 
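The list-comprehension refactor above re-codes each input onto the union of its categories; as a usage sketch of the public entry point this feeds (the category values are illustrative):

import pandas as pd
from pandas.api.types import union_categoricals

a = pd.Categorical(['b', 'c'])
b = pd.Categorical(['a', 'b'])

combined = union_categoricals([a, b], sort_categories=True)
# categories become ['a', 'b', 'c']; each input's codes are re-coded onto that union
list(combined)    # ['b', 'c', 'a', 'b']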
@@ -368,6 +369,9 @@ def _constructor_expanddim(self): from pandas.core.panel import Panel return Panel + # ---------------------------------------------------------------------- + # Constructors + def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False): if data is None: @@ -541,7 +545,8 @@ def _get_axes(N, K, index=index, columns=columns): index, columns = _get_axes(len(values), 1) return _arrays_to_mgr([values], columns, index, columns, dtype=dtype) - elif (is_datetimetz(values) or is_extension_array_dtype(values)): + elif (is_datetime64tz_dtype(values) or + is_extension_array_dtype(values)): # GH19157 if columns is None: columns = [0] @@ -573,6 +578,8 @@ def _get_axes(N, K, index=index, columns=columns): return create_block_manager_from_blocks([values], [columns, index]) + # ---------------------------------------------------------------------- + @property def axes(self): """ @@ -641,6 +648,9 @@ def _is_homogeneous_type(self): else: return not self._data.is_mixed_type + # ---------------------------------------------------------------------- + # Rendering Methods + def _repr_fits_vertical_(self): """ Check length against max_rows. @@ -651,10 +661,11 @@ def _repr_fits_vertical_(self): def _repr_fits_horizontal_(self, ignore_width=False): """ Check if full repr fits in horizontal boundaries imposed by the display - options width and max_columns. In case off non-interactive session, no - boundaries apply. + options width and max_columns. + + In case off non-interactive session, no boundaries apply. - ignore_width is here so ipnb+HTML output can behave the way + `ignore_width` is here so ipnb+HTML output can behave the way users expect. display.max_columns remains in effect. GH3541, GH3573 """ @@ -702,14 +713,16 @@ def _repr_fits_horizontal_(self, ignore_width=False): return repr_width < width def _info_repr(self): - """True if the repr should show the info view.""" + """ + True if the repr should show the info view. + """ info_repr_option = (get_option("display.large_repr") == "info") return info_repr_option and not (self._repr_fits_horizontal_() and self._repr_fits_vertical_()) def __unicode__(self): """ - Return a string representation for a particular DataFrame + Return a string representation for a particular DataFrame. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -734,6 +747,7 @@ def __unicode__(self): def _repr_html_(self): """ Return a html representation for a particular DataFrame. + Mainly for IPython notebook. """ # qtconsole doesn't report its line width, and also @@ -764,6 +778,57 @@ def _repr_html_(self): else: return None + @Substitution(header='Write out the column names. If a list of strings ' + 'is given, it is assumed to be aliases for the ' + 'column names') + @Substitution(shared_params=fmt.common_docstring, + returns=fmt.return_docstring) + def to_string(self, buf=None, columns=None, col_space=None, header=True, + index=True, na_rep='NaN', formatters=None, float_format=None, + sparsify=None, index_names=True, justify=None, + max_rows=None, max_cols=None, show_dimensions=False, + decimal='.', line_width=None): + """ + Render a DataFrame to a console-friendly tabular output. + %(shared_params)s + line_width : int, optional + Width to wrap a line in characters. + %(returns)s + See Also + -------- + to_html : Convert DataFrame to HTML. 
+ + Examples + -------- + >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} + >>> df = pd.DataFrame(d) + >>> print(df.to_string()) + col1 col2 + 0 1 4 + 1 2 5 + 2 3 6 + """ + + formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, + col_space=col_space, na_rep=na_rep, + formatters=formatters, + float_format=float_format, + sparsify=sparsify, justify=justify, + index_names=index_names, + header=header, index=index, + max_rows=max_rows, + max_cols=max_cols, + show_dimensions=show_dimensions, + decimal=decimal, + line_width=line_width) + formatter.to_string() + + if buf is None: + result = formatter.buf.getvalue() + return result + + # ---------------------------------------------------------------------- + @property def style(self): """ @@ -974,12 +1039,14 @@ def itertuples(self, index=True, name="Pandas"): items = iteritems def __len__(self): - """Returns length of info axis, but here we use the index """ + """ + Returns length of info axis, but here we use the index. + """ return len(self.index) def dot(self, other): """ - Matrix multiplication with DataFrame or Series objects. Can also be + Matrix multiplication with DataFrame or Series objects. Can also be called using `self @ other` in Python >= 3.5. Parameters @@ -1024,11 +1091,15 @@ def dot(self, other): raise TypeError('unsupported type: {oth}'.format(oth=type(other))) def __matmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(other) def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.T.dot(np.transpose(other)).T # ---------------------------------------------------------------------- @@ -1116,6 +1187,50 @@ def from_dict(cls, data, orient='columns', dtype=None, columns=None): return cls(data, index=index, columns=columns, dtype=dtype) + def to_numpy(self): + """ + Convert the DataFrame to a NumPy array. + + .. versionadded:: 0.24.0 + + The dtype of the returned array will be the common NumPy + dtype of all types in the DataFrame. For example, + if the dtypes are ``float16`` and ``float32``, the results + dtype will be ``float32``. This may require copying data and + coercing values, which may be expensive. + + Returns + ------- + array : numpy.ndarray + + See Also + -------- + Series.to_numpy : Similar method for Series. + + Examples + -------- + >>> pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy() + array([[1, 3], + [2, 4]]) + + With heterogenous data, the lowest common type will have to + be used. + + >>> df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]}) + >>> df.to_numpy() + array([[1. , 3. ], + [2. , 4.5]]) + + For a mix of numeric and non-numeric types, the output array will + have object dtype. + + >>> df['C'] = pd.date_range('2000', periods=2) + >>> df.to_numpy() + array([[1, 3.0, Timestamp('2000-01-01 00:00:00')], + [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) + """ + return self.values + def to_dict(self, orient='dict', into=dict): """ Convert the DataFrame to a dictionary. 
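As a brief usage sketch for ``DataFrame.to_numpy`` and the ``@`` operator support documented above (Python >= 3.5, pandas 0.24+ with this patch; the frame contents are illustrative):

import numpy as np
import pandas as pd

df = pd.DataFrame(np.arange(6).reshape(2, 3), columns=list('abc'))
s = pd.Series([1.0, 2.0, 3.0], index=list('abc'))

df @ s              # same as df.dot(s): aligns df's columns with s's index
df.to_numpy()       # 2x3 integer ndarray holding the frame's values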
@@ -1183,10 +1298,10 @@ def to_dict(self, orient='dict', into=dict): >>> df.to_dict('split') {'index': ['row1', 'row2'], 'columns': ['col1', 'col2'], - 'data': [[1.0, 0.5], [2.0, 0.75]]} + 'data': [[1, 0.5], [2, 0.75]]} >>> df.to_dict('records') - [{'col1': 1.0, 'col2': 0.5}, {'col1': 2.0, 'col2': 0.75}] + [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}] >>> df.to_dict('index') {'row1': {'col1': 1, 'col2': 0.5}, 'row2': {'col1': 2, 'col2': 0.75}} @@ -1202,8 +1317,8 @@ def to_dict(self, orient='dict', into=dict): >>> dd = defaultdict(list) >>> df.to_dict('records', into=dd) - [defaultdict(, {'col1': 1.0, 'col2': 0.5}), - defaultdict(, {'col1': 2.0, 'col2': 0.75})] + [defaultdict(, {'col1': 1, 'col2': 0.5}), + defaultdict(, {'col1': 2, 'col2': 0.75})] """ if not self.columns.is_unique: warnings.warn("DataFrame columns are not unique, some " @@ -1219,16 +1334,18 @@ def to_dict(self, orient='dict', into=dict): elif orient.lower().startswith('sp'): return into_c((('index', self.index.tolist()), ('columns', self.columns.tolist()), - ('data', lib.map_infer(self.values.ravel(), - com.maybe_box_datetimelike) - .reshape(self.values.shape).tolist()))) + ('data', [ + list(map(com.maybe_box_datetimelike, t)) + for t in self.itertuples(index=False)] + ))) elif orient.lower().startswith('s'): return into_c((k, com.maybe_box_datetimelike(v)) for k, v in compat.iteritems(self)) elif orient.lower().startswith('r'): - return [into_c((k, com.maybe_box_datetimelike(v)) - for k, v in zip(self.columns, np.atleast_1d(row))) - for row in self.values] + return [ + into_c((k, com.maybe_box_datetimelike(v)) + for k, v in compat.iteritems(row._asdict())) + for row in self.itertuples(index=False)] elif orient.lower().startswith('i'): if not self.index.is_unique: raise ValueError( @@ -1275,10 +1392,6 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, If table exists, drop it, recreate it, and insert data. ``'append'`` If table exists, insert data. Create if does not exist. - private_key : str, optional - Service account private key in JSON format. Can be file path - or string contents. This is useful for remote server - authentication (eg. Jupyter/IPython notebook on remote host). auth_local_webserver : bool, default False Use the `local webserver flow`_ instead of the `console flow`_ when getting user credentials. @@ -1354,7 +1467,7 @@ def to_gbq(self, destination_table, project_id=None, chunksize=None, def from_records(cls, data, index=None, exclude=None, columns=None, coerce_float=False, nrows=None): """ - Convert structured or record ndarray to DataFrame + Convert structured or record ndarray to DataFrame. Parameters ---------- @@ -1579,7 +1692,8 @@ def to_records(self, index=True, convert_datetime64=None): @classmethod def from_items(cls, items, columns=None, orient='columns'): - """Construct a dataframe from a list of tuples + """ + Construct a DataFrame from a list of tuples. .. deprecated:: 0.23.0 `from_items` is deprecated and will be removed in a future version. @@ -1673,7 +1787,8 @@ def _from_arrays(cls, arrays, columns, index, dtype=None): def from_csv(cls, path, header=0, sep=',', index_col=0, parse_dates=True, encoding=None, tupleize_cols=None, infer_datetime_format=False): - """Read CSV file. + """ + Read CSV file. .. deprecated:: 0.21.0 Use :func:`pandas.read_csv` instead. 
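The docstring output changes above follow from building the 'split' and 'records' results via ``itertuples`` rather than ``.values``, so each column keeps its own dtype instead of being upcast; a sketch assuming a build with this patch:

import pandas as pd

df = pd.DataFrame({'col1': [1, 2], 'col2': [0.5, 0.75]},
                  index=['row1', 'row2'])

df.to_dict('records')
# with this change: [{'col1': 1, 'col2': 0.5}, {'col1': 2, 'col2': 0.75}]
# previously, going through .values upcast col1 to float: {'col1': 1.0, ...}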
@@ -1953,7 +2068,7 @@ def to_stata(self, fname, convert_dates=None, write_index=True, def to_feather(self, fname): """ - write out the binary feather-format for DataFrames + Write out the binary feather-format for DataFrames. .. versionadded:: 0.20.0 @@ -2037,55 +2152,6 @@ def to_parquet(self, fname, engine='auto', compression='snappy', compression=compression, index=index, partition_cols=partition_cols, **kwargs) - @Substitution(header='Write out the column names. If a list of strings ' - 'is given, it is assumed to be aliases for the ' - 'column names') - @Substitution(shared_params=fmt.common_docstring, - returns=fmt.return_docstring) - def to_string(self, buf=None, columns=None, col_space=None, header=True, - index=True, na_rep='NaN', formatters=None, float_format=None, - sparsify=None, index_names=True, justify=None, - max_rows=None, max_cols=None, show_dimensions=False, - decimal='.', line_width=None): - """ - Render a DataFrame to a console-friendly tabular output. - %(shared_params)s - line_width : int, optional - Width to wrap a line in characters. - %(returns)s - See Also - -------- - to_html : Convert DataFrame to HTML. - - Examples - -------- - >>> d = {'col1': [1, 2, 3], 'col2': [4, 5, 6]} - >>> df = pd.DataFrame(d) - >>> print(df.to_string()) - col1 col2 - 0 1 4 - 1 2 5 - 2 3 6 - """ - - formatter = fmt.DataFrameFormatter(self, buf=buf, columns=columns, - col_space=col_space, na_rep=na_rep, - formatters=formatters, - float_format=float_format, - sparsify=sparsify, justify=justify, - index_names=index_names, - header=header, index=index, - max_rows=max_rows, - max_cols=max_cols, - show_dimensions=show_dimensions, - decimal=decimal, - line_width=line_width) - formatter.to_string() - - if buf is None: - result = formatter.buf.getvalue() - return result - @Substitution(header='Whether to print column labels, default True') @Substitution(shared_params=fmt.common_docstring, returns=fmt.return_docstring) @@ -2144,6 +2210,8 @@ def to_html(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() + # ---------------------------------------------------------------------- + def info(self, verbose=None, buf=None, max_cols=None, memory_usage=None, null_counts=None): """ @@ -2610,7 +2678,8 @@ def _unpickle_matrix_compat(self, state): # pragma: no cover # Getting and setting elements def get_value(self, index, col, takeable=False): - """Quickly retrieve single value at passed column and index + """ + Quickly retrieve single value at passed column and index. .. deprecated:: 0.21.0 Use .at[] or .iat[] accessors instead. @@ -2653,7 +2722,8 @@ def _get_value(self, index, col, takeable=False): _get_value.__doc__ = get_value.__doc__ def set_value(self, index, col, value, takeable=False): - """Put single value at passed column and index + """ + Put single value at passed column and index. .. deprecated:: 0.21.0 Use .at[] or .iat[] accessors instead. @@ -2698,18 +2768,17 @@ def _set_value(self, index, col, value, takeable=False): def _ixs(self, i, axis=0): """ + Parameters + ---------- i : int, slice, or sequence of integers axis : int - """ + Notes + ----- + If slice passed, the resulting data will be a view. 
+ """ # irow if axis == 0: - """ - Notes - ----- - If slice passed, the resulting data will be a view - """ - if isinstance(i, slice): return self[i] else: @@ -2735,12 +2804,6 @@ def _ixs(self, i, axis=0): # icol else: - """ - Notes - ----- - If slice passed, the resulting data will be a view - """ - label = self.columns[i] if isinstance(i, slice): # need to return view @@ -2887,7 +2950,8 @@ def _getitem_frame(self, key): return self.where(key) def query(self, expr, inplace=False, **kwargs): - """Query the columns of a frame with a boolean expression. + """ + Query the columns of a DataFrame with a boolean expression. Parameters ---------- @@ -3223,7 +3287,9 @@ def _box_item_values(self, key, values): return self._box_col_values(values, items) def _box_col_values(self, values, items): - """ provide boxed values for a column """ + """ + Provide boxed values for a column. + """ klass = self._constructor_sliced return klass(values, index=self.index, name=items, fastpath=True) @@ -3289,8 +3355,8 @@ def _setitem_frame(self, key, value): def _ensure_valid_index(self, value): """ - ensure that if we don't have an index, that we can create one from the - passed value + Ensure that if we don't have an index, that we can create one from the + passed value. """ # GH5632, make sure that we are a Series convertible if not len(self.index) and is_list_like(value): @@ -3545,14 +3611,13 @@ def reindexer(value): @property def _series(self): - result = {} - for idx, item in enumerate(self.columns): - result[item] = Series(self._data.iget(idx), index=self.index, - name=item) - return result + return {item: Series(self._data.iget(idx), index=self.index, name=item) + for idx, item in enumerate(self.columns)} def lookup(self, row_labels, col_labels): - """Label-based "fancy indexing" function for DataFrame. + """ + Label-based "fancy indexing" function for DataFrame. + Given equal-length arrays of row and column labels, return an array of the values corresponding to each (row, col) pair. @@ -3567,9 +3632,8 @@ def lookup(self, row_labels, col_labels): ----- Akin to:: - result = [] - for row, col in zip(row_labels, col_labels): - result.append(df.get_value(row, col)) + result = [df.get_value(row, col) + for row, col in zip(row_labels, col_labels)] Examples -------- @@ -3639,7 +3703,9 @@ def _reindex_columns(self, new_columns, method, copy, level, allow_dups=False) def _reindex_multi(self, axes, copy, fill_value): - """ we are guaranteed non-Nones in the axes! """ + """ + We are guaranteed non-Nones in the axes. + """ new_index, row_indexer = self.index.reindex(axes['index']) new_columns, col_indexer = self.columns.reindex(axes['columns']) @@ -3819,7 +3885,8 @@ def drop(self, labels=None, axis=0, index=None, columns=None, ('inplace', False), ('level', None)]) def rename(self, *args, **kwargs): - """Alter axes labels. + """ + Alter axes labels. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don't throw an @@ -3924,47 +3991,58 @@ def shift(self, periods=1, freq=None, axis=0): def set_index(self, keys, drop=True, append=False, inplace=False, verify_integrity=False): """ + Set the DataFrame index using existing columns. + Set the DataFrame index (row labels) using one or more existing - columns. By default yields a new object. + columns. The index can replace the existing index or expand on it. 
Parameters ---------- - keys : column label or list of column labels / arrays - drop : boolean, default True - Delete columns to be used as the new index - append : boolean, default False - Whether to append columns to existing index - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object) - verify_integrity : boolean, default False + keys : label or list of label + Name or names of the columns that will be used as the index. + drop : bool, default True + Delete columns to be used as the new index. + append : bool, default False + Whether to append columns to existing index. + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). + verify_integrity : bool, default False Check the new index for duplicates. Otherwise defer the check until necessary. Setting to False will improve the performance of this - method + method. Returns ------- DataFrame + Changed row labels. + + See Also + -------- + DataFrame.reset_index : Opposite of set_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. Examples -------- >>> df = pd.DataFrame({'month': [1, 4, 7, 10], ... 'year': [2012, 2014, 2013, 2014], - ... 'sale':[55, 40, 84, 31]}) - month sale year - 0 1 55 2012 - 1 4 40 2014 - 2 7 84 2013 - 3 10 31 2014 + ... 'sale': [55, 40, 84, 31]}) + >>> df + month year sale + 0 1 2012 55 + 1 4 2014 40 + 2 7 2013 84 + 3 10 2014 31 Set the index to become the 'month' column: >>> df.set_index('month') - sale year + year sale month - 1 55 2012 - 4 40 2014 - 7 84 2013 - 10 31 2014 + 1 2012 55 + 4 2014 40 + 7 2013 84 + 10 2014 31 Create a multi-index using columns 'year' and 'month': @@ -4072,22 +4150,22 @@ def set_index(self, keys, drop=True, append=False, inplace=False, def reset_index(self, level=None, drop=False, inplace=False, col_level=0, col_fill=''): """ - For DataFrame with multi-level index, return new DataFrame with - labeling information in the columns under the index names, defaulting - to 'level_0', 'level_1', etc. if any are None. For a standard index, - the index name will be used (if set), otherwise a default 'index' or - 'level_0' (if 'index' is already taken) will be used. + Reset the index, or a level of it. + + Reset the index of the DataFrame, and use the default one instead. + If the DataFrame has a MultiIndex, this method can remove one or more + levels. Parameters ---------- level : int, str, tuple, or list, default None Only remove the given levels from the index. Removes all levels by - default - drop : boolean, default False + default. + drop : bool, default False Do not try to insert index into dataframe columns. This resets the index to the default integer index. - inplace : boolean, default False - Modify the DataFrame in place (do not create a new object) + inplace : bool, default False + Modify the DataFrame in place (do not create a new object). col_level : int or str, default 0 If the columns have multiple levels, determines which level the labels are inserted into. By default it is inserted into the first @@ -4098,13 +4176,20 @@ def reset_index(self, level=None, drop=False, inplace=False, col_level=0, Returns ------- - resetted : DataFrame + DataFrame + DataFrame with the new index. + + See Also + -------- + DataFrame.set_index : Opposite of reset_index. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. 
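A small companion sketch for the ``set_index``/``reset_index`` pairing cross-referenced above (the column values are illustrative):

import pandas as pd

df = pd.DataFrame({'month': [1, 4], 'year': [2012, 2014], 'sale': [55, 40]})

indexed = df.set_index(['year', 'month'])   # two columns become a MultiIndex
restored = indexed.reset_index()            # the levels move back into columns
assert list(restored.columns) == ['year', 'month', 'sale']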
Examples -------- - >>> df = pd.DataFrame([('bird', 389.0), - ... ('bird', 24.0), - ... ('mammal', 80.5), + >>> df = pd.DataFrame([('bird', 389.0), + ... ('bird', 24.0), + ... ('mammal', 80.5), ... ('mammal', np.nan)], ... index=['falcon', 'parrot', 'lion', 'monkey'], ... columns=('class', 'max_speed')) @@ -4451,7 +4536,7 @@ def dropna(self, axis=0, how='any', thresh=None, subset=None, def drop_duplicates(self, subset=None, keep='first', inplace=False): """ Return DataFrame with duplicate rows removed, optionally only - considering certain columns + considering certain columns. Parameters ---------- @@ -4485,7 +4570,7 @@ def drop_duplicates(self, subset=None, keep='first', inplace=False): def duplicated(self, subset=None, keep='first'): """ Return boolean Series denoting duplicate rows, optionally only - considering certain columns + considering certain columns. Parameters ---------- @@ -4553,10 +4638,8 @@ def sort_values(self, by, axis=0, ascending=True, inplace=False, if len(by) > 1: from pandas.core.sorting import lexsort_indexer - keys = [] - for x in by: - k = self._get_label_or_level_values(x, axis=axis) - keys.append(k) + keys = [self._get_label_or_level_values(x, axis=axis) + for x in by] indexer = lexsort_indexer(keys, orders=ascending, na_position=na_position) indexer = ensure_platform_int(indexer) @@ -4862,7 +4945,7 @@ def nsmallest(self, n, columns, keep='first'): def swaplevel(self, i=-2, j=-1, axis=0): """ - Swap levels i and j in a MultiIndex on a particular axis + Swap levels i and j in a MultiIndex on a particular axis. Parameters ---------- @@ -4889,8 +4972,8 @@ def swaplevel(self, i=-2, j=-1, axis=0): def reorder_levels(self, order, axis=0): """ - Rearrange index levels using input order. - May not drop or duplicate levels + Rearrange index levels using input order. May not drop or + duplicate levels. Parameters ---------- @@ -5479,7 +5562,7 @@ def pivot(self, index=None, columns=None, values=None): _shared_docs['pivot_table'] = """ Create a spreadsheet-style pivot table as a DataFrame. The levels in the pivot table will be stored in MultiIndex objects (hierarchical - indexes) on the index and columns of the result DataFrame + indexes) on the index and columns of the result DataFrame. Parameters ----------%s @@ -5781,9 +5864,11 @@ def unstack(self, level=-1, fill_value=None): """ Pivot a level of the (necessarily hierarchical) index labels, returning a DataFrame having a new level of column labels whose inner-most level - consists of the pivoted index labels. If the index is not a MultiIndex, - the output will be a Series (the analogue of stack when the columns are - not a MultiIndex). + consists of the pivoted index labels. + + If the index is not a MultiIndex, the output will be a Series + (the analogue of stack when the columns are not a MultiIndex). + The level involved will automatically get sorted. Parameters @@ -5839,7 +5924,7 @@ def unstack(self, level=-1, fill_value=None): return unstack(self, level, fill_value) _shared_docs['melt'] = (""" - "Unpivots" a DataFrame from wide format to long format, optionally + Unpivots a DataFrame from wide format to long format, optionally leaving identifier variables set. This function is useful to massage a DataFrame into a format where one @@ -6045,8 +6130,7 @@ def _gotitem(self, ): # type: (...) -> Union[Series, DataFrame] """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. 
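To illustrate the ``duplicated``/``drop_duplicates`` pair described above, a sketch with made-up rows:

import pandas as pd

df = pd.DataFrame({'brand': ['Yum', 'Yum', 'Indomie'],
                   'style': ['cup', 'cup', 'pack']})

df.duplicated()                                    # boolean Series: [False, True, False]
df.drop_duplicates(subset=['brand'], keep='last')  # keep the last row per brand value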
Parameters ---------- @@ -6797,7 +6881,7 @@ def _series_round(s, decimals): def corr(self, method='pearson', min_periods=1): """ - Compute pairwise correlation of columns, excluding NA/null values + Compute pairwise correlation of columns, excluding NA/null values. Parameters ---------- @@ -7392,7 +7476,9 @@ def idxmax(self, axis=0, skipna=True): return Series(result, index=self._get_agg_axis(axis)) def _get_agg_axis(self, axis_num): - """ let's be explicit about this """ + """ + Let's be explicit about this. + """ if axis_num == 0: return self.columns elif axis_num == 1: @@ -7582,7 +7668,7 @@ def quantile(self, q=0.5, axis=0, numeric_only=True, def to_timestamp(self, freq=None, how='start', axis=0, copy=True): """ - Cast to DatetimeIndex of timestamps, at *beginning* of period + Cast to DatetimeIndex of timestamps, at *beginning* of period. Parameters ---------- @@ -7618,7 +7704,7 @@ def to_timestamp(self, freq=None, how='start', axis=0, copy=True): def to_period(self, freq=None, axis=0, copy=True): """ Convert DataFrame from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed) + frequency (inferred from index if not passed). Parameters ---------- @@ -7758,6 +7844,7 @@ def isin(self, values): def _arrays_to_mgr(arrays, arr_names, index, columns, dtype=None): """ Segregate Series based on type and coerce into matrices. + Needs to handle a lot of exceptional cases. """ # figure out the index, if necessary @@ -7866,7 +7953,7 @@ def convert(v): def _to_arrays(data, columns, coerce_float=False, dtype=None): """ - Return list of arrays, columns + Return list of arrays, columns. """ if isinstance(data, DataFrame): if columns is not None: @@ -7912,7 +7999,9 @@ def _to_arrays(data, columns, coerce_float=False, dtype=None): def _masked_rec_array_to_mgr(data, index, columns, dtype, copy): - """ extract from a masked rec array and create the manager """ + """ + Extract from a masked rec array and create the manager. 
+ """ # essentially process a record array then fill it fill_value = data.fill_value diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 3a7016ce396769..c58c84b422209b 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -111,6 +111,9 @@ class NDFrame(PandasObject, SelectionMixin): _metadata = [] _is_copy = None + # ---------------------------------------------------------------------- + # Constructors + def __init__(self, data, axes=None, copy=False, dtype=None, fastpath=False): @@ -128,6 +131,25 @@ def __init__(self, data, axes=None, copy=False, dtype=None, object.__setattr__(self, '_data', data) object.__setattr__(self, '_item_cache', {}) + def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): + """ passed a manager and a axes dict """ + for a, axe in axes.items(): + if axe is not None: + mgr = mgr.reindex_axis(axe, + axis=self._get_block_manager_axis(a), + copy=False) + + # make a copy if explicitly requested + if copy: + mgr = mgr.copy() + if dtype is not None: + # avoid further copies if we can + if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: + mgr = mgr.astype(dtype=dtype) + return mgr + + # ---------------------------------------------------------------------- + @property def is_copy(self): warnings.warn("Attribute 'is_copy' is deprecated and will be removed " @@ -140,17 +162,6 @@ def is_copy(self, msg): "in a future version.", FutureWarning, stacklevel=2) self._is_copy = msg - def _repr_data_resource_(self): - """ - Not a real Jupyter special repr method, but we use the same - naming convention. - """ - if config.get_option("display.html.table_schema"): - data = self.head(config.get_option('display.max_rows')) - payload = json.loads(data.to_json(orient='table'), - object_pairs_hook=collections.OrderedDict) - return payload - def _validate_dtype(self, dtype): """ validate the passed dtype """ @@ -165,23 +176,6 @@ def _validate_dtype(self, dtype): return dtype - def _init_mgr(self, mgr, axes=None, dtype=None, copy=False): - """ passed a manager and a axes dict """ - for a, axe in axes.items(): - if axe is not None: - mgr = mgr.reindex_axis(axe, - axis=self._get_block_manager_axis(a), - copy=False) - - # make a copy if explicitly requested - if copy: - mgr = mgr.copy() - if dtype is not None: - # avoid further copies if we can - if len(mgr.blocks) > 1 or mgr.blocks[0].values.dtype != dtype: - mgr = mgr.astype(dtype=dtype) - return mgr - # ---------------------------------------------------------------------- # Construction @@ -192,20 +186,6 @@ def _constructor(self): """ raise AbstractMethodError(self) - def __unicode__(self): - # unicode representation based upon iterating over self - # (since, by definition, `PandasContainers` are iterable) - prepr = '[%s]' % ','.join(map(pprint_thing, self)) - return '%s(%s)' % (self.__class__.__name__, prepr) - - def _dir_additions(self): - """ add the string-like attributes from the info_axis. - If info_axis is a MultiIndex, it's first level values are used. 
- """ - additions = {c for c in self._info_axis.unique(level=0)[:100] - if isinstance(c, string_types) and isidentifier(c)} - return super(NDFrame, self)._dir_additions().union(additions) - @property def _constructor_sliced(self): """Used when a manipulation result has one lower dimension(s) as the @@ -1338,48 +1318,12 @@ def _set_axis_name(self, name, axis=0, inplace=False): return renamed # ---------------------------------------------------------------------- - # Comparisons + # Comparison Methods def _indexed_same(self, other): return all(self._get_axis(a).equals(other._get_axis(a)) for a in self._AXIS_ORDERS) - def __neg__(self): - values = com.values_from_object(self) - if is_bool_dtype(values): - arr = operator.inv(values) - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): - arr = operator.neg(values) - else: - raise TypeError("Unary negative expects numeric dtype, not {}" - .format(values.dtype)) - return self.__array_wrap__(arr) - - def __pos__(self): - values = com.values_from_object(self) - if (is_bool_dtype(values) or is_period_arraylike(values)): - arr = values - elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) - or is_object_dtype(values)): - arr = operator.pos(values) - else: - raise TypeError("Unary plus expects numeric dtype, not {}" - .format(values.dtype)) - return self.__array_wrap__(arr) - - def __invert__(self): - try: - arr = operator.inv(com.values_from_object(self)) - return self.__array_wrap__(arr) - except Exception: - - # inv fails with 0 len - if not np.prod(self.shape): - return self - - raise - def equals(self, other): """ Test whether two objects contain the same elements. @@ -1466,6 +1410,74 @@ def equals(self, other): return False return self._data.equals(other._data) + # ------------------------------------------------------------------------- + # Unary Methods + + def __neg__(self): + values = com.values_from_object(self) + if is_bool_dtype(values): + arr = operator.inv(values) + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): + arr = operator.neg(values) + else: + raise TypeError("Unary negative expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __pos__(self): + values = com.values_from_object(self) + if (is_bool_dtype(values) or is_period_arraylike(values)): + arr = values + elif (is_numeric_dtype(values) or is_timedelta64_dtype(values) + or is_object_dtype(values)): + arr = operator.pos(values) + else: + raise TypeError("Unary plus expects numeric dtype, not {}" + .format(values.dtype)) + return self.__array_wrap__(arr) + + def __invert__(self): + try: + arr = operator.inv(com.values_from_object(self)) + return self.__array_wrap__(arr) + except Exception: + + # inv fails with 0 len + if not np.prod(self.shape): + return self + + raise + + def __nonzero__(self): + raise ValueError("The truth value of a {0} is ambiguous. " + "Use a.empty, a.bool(), a.item(), a.any() or a.all()." + .format(self.__class__.__name__)) + + __bool__ = __nonzero__ + + def bool(self): + """Return the bool of a single element PandasObject. + + This must be a boolean scalar value, either True or False. 
Raise a + ValueError if the PandasObject does not have exactly 1 element, or that + element is not boolean + """ + v = self.squeeze() + if isinstance(v, (bool, np.bool_)): + return bool(v) + elif is_scalar(v): + raise ValueError("bool cannot act on a non-boolean single element " + "{0}".format(self.__class__.__name__)) + + self.__nonzero__() + + def __abs__(self): + return self.abs() + + def __round__(self, decimals=0): + return self.round(decimals) + # ------------------------------------------------------------------------- # Label or Level Combination Helpers # @@ -1858,35 +1870,6 @@ def empty(self): """ return any(len(self._get_axis(a)) == 0 for a in self._AXIS_ORDERS) - def __nonzero__(self): - raise ValueError("The truth value of a {0} is ambiguous. " - "Use a.empty, a.bool(), a.item(), a.any() or a.all()." - .format(self.__class__.__name__)) - - __bool__ = __nonzero__ - - def bool(self): - """Return the bool of a single element PandasObject. - - This must be a boolean scalar value, either True or False. Raise a - ValueError if the PandasObject does not have exactly 1 element, or that - element is not boolean - """ - v = self.squeeze() - if isinstance(v, (bool, np.bool_)): - return bool(v) - elif is_scalar(v): - raise ValueError("bool cannot act on a non-boolean single element " - "{0}".format(self.__class__.__name__)) - - self.__nonzero__() - - def __abs__(self): - return self.abs() - - def __round__(self, decimals=0): - return self.round(decimals) - # ---------------------------------------------------------------------- # Array Interface @@ -1962,7 +1945,13 @@ def __setstate__(self, state): self._item_cache = {} # ---------------------------------------------------------------------- - # IO + # Rendering Methods + + def __unicode__(self): + # unicode representation based upon iterating over self + # (since, by definition, `PandasContainers` are iterable) + prepr = '[%s]' % ','.join(map(pprint_thing, self)) + return '%s(%s)' % (self.__class__.__name__, prepr) def _repr_latex_(self): """ @@ -1974,6 +1963,17 @@ def _repr_latex_(self): else: return None + def _repr_data_resource_(self): + """ + Not a real Jupyter special repr method, but we use the same + naming convention. 
+ """ + if config.get_option("display.html.table_schema"): + data = self.head(config.get_option('display.max_rows')) + payload = json.loads(data.to_json(orient='table'), + object_pairs_hook=collections.OrderedDict) + return payload + # ---------------------------------------------------------------------- # I/O Methods @@ -2079,6 +2079,25 @@ def _repr_latex_(self): >>> df1.to_excel('output1.xlsx', engine='xlsxwriter') # doctest: +SKIP """ + @Appender(_shared_docs["to_excel"] % dict(klass="object")) + def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", + float_format=None, columns=None, header=True, index=True, + index_label=None, startrow=0, startcol=0, engine=None, + merge_cells=True, encoding=None, inf_rep="inf", verbose=True, + freeze_panes=None): + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + from pandas.io.formats.excel import ExcelFormatter + formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns, + header=header, + float_format=float_format, index=index, + index_label=index_label, + merge_cells=merge_cells, + inf_rep=inf_rep) + formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, + startcol=startcol, freeze_panes=freeze_panes, + engine=engine) + def to_json(self, path_or_buf=None, orient=None, date_format=None, double_precision=10, force_ascii=True, date_unit='ms', default_handler=None, lines=False, compression='infer', @@ -2821,6 +2840,148 @@ def to_latex(self, buf=None, columns=None, col_space=None, header=True, if buf is None: return formatter.buf.getvalue() + def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, + columns=None, header=True, index=True, index_label=None, + mode='w', encoding=None, compression='infer', quoting=None, + quotechar='"', line_terminator=None, chunksize=None, + tupleize_cols=None, date_format=None, doublequote=True, + escapechar=None, decimal='.'): + r""" + Write object to a comma-separated values (csv) file. + + .. versionchanged:: 0.24.0 + The order of arguments for Series was changed. + + Parameters + ---------- + path_or_buf : str or file handle, default None + File path or object, if None is provided the result is returned as + a string. + + .. versionchanged:: 0.24.0 + + Was previously named "path" for Series. + + sep : str, default ',' + String of length 1. Field delimiter for the output file. + na_rep : str, default '' + Missing data representation. + float_format : str, default None + Format string for floating point numbers. + columns : sequence, optional + Columns to write. + header : bool or list of str, default True + Write out the column names. If a list of strings is given it is + assumed to be aliases for the column names. + + .. versionchanged:: 0.24.0 + + Previously defaulted to False for Series. + + index : bool, default True + Write row names (index). + index_label : str or sequence, or False, default None + Column label for index column(s) if desired. If None is given, and + `header` and `index` are True, then the index names are used. A + sequence should be given if the object uses MultiIndex. If + False do not print fields for index names. Use index_label=False + for easier importing in R. + mode : str + Python write mode, default 'w'. + encoding : str, optional + A string representing the encoding to use in the output file, + defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. + compression : str, default 'infer' + Compression mode among the following possible values: {'infer', + 'gzip', 'bz2', 'zip', 'xz', None}. 
If 'infer' and `path_or_buf` + is path-like, then detect compression from the following + extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no + compression). + + .. versionchanged:: 0.24.0 + + 'infer' option added and set to default. + + quoting : optional constant from csv module + Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` + then floats are converted to strings and thus csv.QUOTE_NONNUMERIC + will treat them as non-numeric. + quotechar : str, default '\"' + String of length 1. Character used to quote fields. + line_terminator : string, optional + The newline character or character sequence to use in the output + file. Defaults to `os.linesep`, which depends on the OS in which + this method is called ('\n' for linux, '\r\n' for Windows, i.e.). + + .. versionchanged:: 0.24.0 + chunksize : int or None + Rows to write at a time. + tupleize_cols : bool, default False + Write MultiIndex columns as a list of tuples (if True) or in + the new, expanded format, where each MultiIndex column is a row + in the CSV (if False). + + .. deprecated:: 0.21.0 + This argument will be removed and will always write each row + of the multi-index as a separate row in the CSV file. + date_format : str, default None + Format string for datetime objects. + doublequote : bool, default True + Control quoting of `quotechar` inside a field. + escapechar : str, default None + String of length 1. Character used to escape `sep` and `quotechar` + when appropriate. + decimal : str, default '.' + Character recognized as decimal separator. E.g. use ',' for + European data. + + Returns + ------- + None or str + If path_or_buf is None, returns the resulting csv format as a + string. Otherwise returns None. + + See Also + -------- + read_csv : Load a CSV file into a DataFrame. + to_excel : Load an Excel file into a DataFrame. + + Examples + -------- + >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], + ... 'mask': ['red', 'purple'], + ... 'weapon': ['sai', 'bo staff']}) + >>> df.to_csv(index=False) + 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' + """ + + df = self if isinstance(self, ABCDataFrame) else self.to_frame() + + if tupleize_cols is not None: + warnings.warn("The 'tupleize_cols' parameter is deprecated and " + "will be removed in a future version", + FutureWarning, stacklevel=2) + else: + tupleize_cols = False + + from pandas.io.formats.csvs import CSVFormatter + formatter = CSVFormatter(df, path_or_buf, + line_terminator=line_terminator, sep=sep, + encoding=encoding, + compression=compression, quoting=quoting, + na_rep=na_rep, float_format=float_format, + cols=columns, header=header, index=index, + index_label=index_label, mode=mode, + chunksize=chunksize, quotechar=quotechar, + tupleize_cols=tupleize_cols, + date_format=date_format, + doublequote=doublequote, + escapechar=escapechar, decimal=decimal) + formatter.save() + + if path_or_buf is None: + return formatter.path_or_buf.getvalue() + # ---------------------------------------------------------------------- # Fancy Indexing @@ -3259,71 +3420,102 @@ class max_speed def xs(self, key, axis=0, level=None, drop_level=True): """ - Returns a cross-section (row(s) or column(s)) from the - Series/DataFrame. Defaults to cross-section on the rows (axis=0). + Return cross-section from the Series/DataFrame. + + This method takes a `key` argument to select data at a particular + level of a MultiIndex. 
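A minimal sketch of two ``to_csv`` behaviors called out above, returning the CSV as a string when no path is given and inferring compression from the file extension (the file name here is hypothetical):

import pandas as pd

df = pd.DataFrame({'name': ['Raphael', 'Donatello'],
                   'mask': ['red', 'purple']})

csv_text = df.to_csv(index=False)          # no path_or_buf -> returns the CSV as a str
df.to_csv('turtles.csv.gz', index=False)   # compression='infer' picks gzip from '.gz'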
Parameters ---------- - key : object - Some label contained in the index, or partially in a MultiIndex - axis : int, default 0 - Axis to retrieve cross-section on + key : label or tuple of label + Label contained in the index, or partially in a MultiIndex. + axis : {0 or 'index', 1 or 'columns'}, default 0 + Axis to retrieve cross-section on. level : object, defaults to first n levels (n=1 or len(key)) In case of a key partially contained in a MultiIndex, indicate which levels are used. Levels can be referred by label or position. - drop_level : boolean, default True + drop_level : bool, default True If False, returns object with same levels as self. + Returns + ------- + Series or DataFrame + Cross-section from the original Series or DataFrame + corresponding to the selected index levels. + + See Also + -------- + DataFrame.loc : Access a group of rows and columns + by label(s) or a boolean array. + DataFrame.iloc : Purely integer-location based indexing + for selection by position. + + Notes + ----- + `xs` can not be used to set values. + + MultiIndex Slicers is a generic way to get/set values on + any level or levels. + It is a superset of `xs` functionality, see + :ref:`MultiIndex Slicers `. + Examples -------- + >>> d = {'num_legs': [4, 4, 2, 2], + ... 'num_wings': [0, 0, 2, 2], + ... 'class': ['mammal', 'mammal', 'mammal', 'bird'], + ... 'animal': ['cat', 'dog', 'bat', 'penguin'], + ... 'locomotion': ['walks', 'walks', 'flies', 'walks']} + >>> df = pd.DataFrame(data=d) + >>> df = df.set_index(['class', 'animal', 'locomotion']) >>> df - A B C - a 4 5 2 - b 4 0 9 - c 9 7 3 - >>> df.xs('a') - A 4 - B 5 - C 2 - Name: a - >>> df.xs('C', axis=1) - a 2 - b 9 - c 3 - Name: C + num_legs num_wings + class animal locomotion + mammal cat walks 4 0 + dog walks 4 0 + bat flies 2 2 + bird penguin walks 2 2 - >>> df - A B C D - first second third - bar one 1 4 1 8 9 - two 1 7 5 5 0 - baz one 1 6 6 8 0 - three 2 5 3 5 3 - >>> df.xs(('baz', 'three')) - A B C D - third - 2 5 3 5 3 - >>> df.xs('one', level=1) - A B C D - first third - bar 1 4 1 8 9 - baz 1 6 6 8 0 - >>> df.xs(('baz', 2), level=[0, 'third']) - A B C D - second - three 5 3 5 3 + Get values at specified index - Returns - ------- - xs : Series or DataFrame + >>> df.xs('mammal') + num_legs num_wings + animal locomotion + cat walks 4 0 + dog walks 4 0 + bat flies 2 2 - Notes - ----- - xs is only for getting, not setting values. + Get values at several indexes - MultiIndex Slicers is a generic way to get/set values on any level or - levels. It is a superset of xs functionality, see - :ref:`MultiIndex Slicers ` + >>> df.xs(('mammal', 'dog')) + num_legs num_wings + locomotion + walks 4 0 + + Get values at specified index and level + + >>> df.xs('cat', level=1) + num_legs num_wings + class locomotion + mammal walks 4 0 + + Get values at several indexes and levels + + >>> df.xs(('bird', 'walks'), + ... level=[0, 'locomotion']) + num_legs num_wings + animal + penguin 2 2 + + Get values at specified column and axis + + >>> df.xs('num_wings', axis=1) + class animal locomotion + mammal cat walks 0 + dog walks 0 + bat flies 2 + bird penguin walks 2 + Name: num_wings, dtype: int64 """ axis = self._get_axis_number(axis) labels = self._get_axis(axis) @@ -3423,29 +3615,99 @@ def select(self, crit, axis=0): def reindex_like(self, other, method=None, copy=True, limit=None, tolerance=None): - """Return an object with matching indices to myself. + """ + Return an object with matching indices as other object. 
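Since ``xs`` cannot be used to set values (per the Notes above), a hedged sketch of the ``.loc`` forms that cover both reading and assignment, reusing the docstring's index labels:

import pandas as pd

d = {'num_legs': [4, 4, 2, 2],
     'num_wings': [0, 0, 2, 2],
     'class': ['mammal', 'mammal', 'mammal', 'bird'],
     'animal': ['cat', 'dog', 'bat', 'penguin'],
     'locomotion': ['walks', 'walks', 'flies', 'walks']}
df = pd.DataFrame(d).set_index(['class', 'animal', 'locomotion'])

df.xs('mammal')        # read-only cross-section at the first level
df.loc['mammal']       # the same selection via .loc
df.loc[('mammal', 'bat', 'flies'), 'num_wings'] = 2   # .loc also supports assignment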
+ + Conform the object to the same index on all axes. Optional + filling logic, placing NaN in locations having no value + in the previous index. A new object is produced unless the + new index is equivalent to the current one and copy=False. Parameters ---------- - other : Object - method : string or None - copy : boolean, default True + other : Object of the same data type + Its row and column indices are used to define the new indices + of this object. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. + Please note: this is only applicable to DataFrames/Series with a + monotonically increasing/decreasing index. + + * None (default): don't fill gaps + * pad / ffill: propagate last valid observation forward to next + valid + * backfill / bfill: use next valid observation to fill gap + * nearest: use nearest valid observations to fill gap + + copy : bool, default True + Return a new object, even if the passed indexes are the same. limit : int, default None Maximum number of consecutive labels to fill for inexact matches. tolerance : optional - Maximum distance between labels of the other object and this - object for inexact matches. Can be list-like. + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. .. versionadded:: 0.21.0 (list-like tolerance) + Returns + ------- + Series or DataFrame + Same type as caller, but with changed indices on each axis. + + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + Notes ----- - Like calling s.reindex(index=other.index, columns=other.columns, - method=...) + Same as calling + ``.reindex(index=other.index, columns=other.columns,...)``. - Returns - ------- - reindexed : same as input + Examples + -------- + >>> df1 = pd.DataFrame([[24.3, 75.7, 'high'], + ... [31, 87.8, 'high'], + ... [22, 71.6, 'medium'], + ... [35, 95, 'medium']], + ... columns=['temp_celsius', 'temp_fahrenheit', 'windspeed'], + ... index=pd.date_range(start='2014-02-12', + ... end='2014-02-15', freq='D')) + + >>> df1 + temp_celsius temp_fahrenheit windspeed + 2014-02-12 24.3 75.7 high + 2014-02-13 31.0 87.8 high + 2014-02-14 22.0 71.6 medium + 2014-02-15 35.0 95.0 medium + + >>> df2 = pd.DataFrame([[28, 'low'], + ... [30, 'low'], + ... [35.1, 'medium']], + ... columns=['temp_celsius', 'windspeed'], + ... index=pd.DatetimeIndex(['2014-02-12', '2014-02-13', + ... '2014-02-15'])) + + >>> df2 + temp_celsius windspeed + 2014-02-12 28.0 low + 2014-02-13 30.0 low + 2014-02-15 35.1 medium + + >>> df2.reindex_like(df1) + temp_celsius temp_fahrenheit windspeed + 2014-02-12 28.0 NaN low + 2014-02-13 30.0 NaN low + 2014-02-14 NaN NaN NaN + 2014-02-15 35.1 NaN medium """ d = other._construct_axes_dict(axes=self._AXIS_ORDERS, method=method, copy=copy, limit=limit, @@ -3812,36 +4074,36 @@ def reindex(self, *args, **kwargs): Conform %(klass)s to new index with optional filling logic, placing NA/NaN in locations having no value in the previous index. 
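The ``method`` argument of ``reindex_like`` described above can be exercised with a short sketch (illustrative data, assuming ``import pandas as pd``); as the docstring notes, filling requires a monotonic index:

>>> import pandas as pd
>>> a = pd.Series([1.0, 2.0, 3.0], index=[0, 2, 4])
>>> b = pd.Series(0.0, index=range(5))
>>> a.reindex_like(b, method='ffill').tolist()
[1.0, 1.0, 2.0, 2.0, 3.0]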
A new object is produced unless the new index is equivalent to the current one and - copy=False + ``copy=False``. Parameters ---------- %(optional_labels)s - %(axes)s : array-like, optional (should be specified using keywords) - New labels / index to conform to. Preferably an Index object to - avoid duplicating data + %(axes)s : array-like, optional + New labels / index to conform to, should be specified using + keywords. Preferably an Index object to avoid duplicating data %(optional_axis)s - method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'}, optional - method to use for filling holes in reindexed DataFrame. + method : {None, 'backfill'/'bfill', 'pad'/'ffill', 'nearest'} + Method to use for filling holes in reindexed DataFrame. Please note: this is only applicable to DataFrames/Series with a monotonically increasing/decreasing index. - * default: don't fill gaps + * None (default): don't fill gaps * pad / ffill: propagate last valid observation forward to next valid * backfill / bfill: use next valid observation to fill gap * nearest: use nearest valid observations to fill gap - copy : boolean, default True - Return a new object, even if the passed indexes are the same + copy : bool, default True + Return a new object, even if the passed indexes are the same. level : int or name Broadcast across a level, matching Index values on the - passed MultiIndex level + passed MultiIndex level. fill_value : scalar, default np.NaN Value to use for missing values. Defaults to NaN, but can be any - "compatible" value + "compatible" value. limit : int, default None - Maximum number of consecutive elements to forward or backward fill + Maximum number of consecutive elements to forward or backward fill. tolerance : optional Maximum distance between original and new labels for inexact matches. The values of the index at the matching locations most @@ -3855,6 +4117,12 @@ def reindex(self, *args, **kwargs): .. versionadded:: 0.21.0 (list-like tolerance) + See Also + -------- + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex_like : Change to same indices as other DataFrame. + Examples -------- @@ -3946,12 +4214,12 @@ def reindex(self, *args, **kwargs): ... index=date_index) >>> df2 prices - 2010-01-01 100 - 2010-01-02 101 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 Suppose we decide to expand the dataframe to cover a wider date range. 
@@ -3962,12 +4230,12 @@ def reindex(self, *args, **kwargs): 2009-12-29 NaN 2009-12-30 NaN 2009-12-31 NaN - 2010-01-01 100 - 2010-01-02 101 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 2010-01-07 NaN The index entries that did not have a value in the original data frame @@ -3980,15 +4248,15 @@ def reindex(self, *args, **kwargs): >>> df2.reindex(date_index2, method='bfill') prices - 2009-12-29 100 - 2009-12-30 100 - 2009-12-31 100 - 2010-01-01 100 - 2010-01-02 101 + 2009-12-29 100.0 + 2009-12-30 100.0 + 2009-12-31 100.0 + 2010-01-01 100.0 + 2010-01-02 101.0 2010-01-03 NaN - 2010-01-04 100 - 2010-01-05 89 - 2010-01-06 88 + 2010-01-04 100.0 + 2010-01-05 89.0 + 2010-01-06 88.0 2010-01-07 NaN Please note that the ``NaN`` value present in the original dataframe @@ -4002,7 +4270,7 @@ def reindex(self, *args, **kwargs): Returns ------- - reindexed : %(klass)s + %(klass)s with changed index. """ # TODO: Decide if we care about having different examples for different # kinds @@ -4074,11 +4342,10 @@ def _needs_reindex_multi(self, axes, method, level): def _reindex_multi(self, axes, copy, fill_value): return NotImplemented - _shared_docs[ - 'reindex_axis'] = ("""Conform input object to new index with optional - filling logic, placing NA/NaN in locations having no value in the - previous index. A new object is produced unless the new index is - equivalent to the current one and copy=False + _shared_docs['reindex_axis'] = ("""Conform input object to new index + with optional filling logic, placing NA/NaN in locations having + no value in the previous index. A new object is produced unless + the new index is equivalent to the current one and copy=False. Parameters ---------- @@ -4115,17 +4382,20 @@ def _reindex_multi(self, axes, copy, fill_value): .. versionadded:: 0.21.0 (list-like tolerance) - Examples - -------- - >>> df.reindex_axis(['A', 'B', 'C'], axis=1) - See Also -------- - reindex, reindex_like + DataFrame.set_index : Set row labels. + DataFrame.reset_index : Remove row labels or move them to new columns. + DataFrame.reindex : Change to new indices or expand indices. + DataFrame.reindex_like : Change to same indices as other DataFrame. Returns ------- - reindexed : %(klass)s + %(klass)s + + Examples + -------- + >>> df.reindex_axis(['A', 'B', 'C'], axis=1) """) @Appender(_shared_docs['reindex_axis'] % _shared_doc_kwargs) @@ -4779,6 +5049,14 @@ def __setattr__(self, name, value): stacklevel=2) object.__setattr__(self, name, value) + def _dir_additions(self): + """ add the string-like attributes from the info_axis. + If info_axis is a MultiIndex, it's first level values are used. + """ + additions = {c for c in self._info_axis.unique(level=0)[:100] + if isinstance(c, string_types) and isidentifier(c)} + return super(NDFrame, self)._dir_additions().union(additions) + # ---------------------------------------------------------------------- # Getting and setting elements @@ -4917,6 +5195,10 @@ def values(self): """ Return a Numpy representation of the DataFrame. + .. warning:: + + We recommend using :meth:`DataFrame.to_numpy` instead. + Only the values in the DataFrame will be returned, the axes labels will be removed. @@ -4978,6 +5260,7 @@ def values(self): See Also -------- + DataFrame.to_numpy : Recommended alternative to this method. pandas.DataFrame.index : Retrieve the index labels. pandas.DataFrame.columns : Retrieving the column names. 
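A minimal sketch (assuming ``import pandas as pd``, illustrative column names) of the ``_dir_additions`` behaviour added above: string labels from the info axis that are valid identifiers show up in ``dir()``, which is what drives attribute tab completion:

>>> import pandas as pd
>>> df = pd.DataFrame({'price': [1.5], 'qty': [3]})
>>> 'price' in dir(df)
True
>>> 'not a column' in dir(df)
False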
""" @@ -5871,7 +6154,7 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): value to use for each column (columns not in the dict will not be filled). Regular expressions, strings and lists or dicts of such objects are also allowed. - inplace : boolean, default False + inplace : bool, default False If True, in place. Note: this will modify any other views on this object (e.g. a column from a DataFrame). Returns the caller if this is True. @@ -5890,12 +6173,6 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): .. versionchanged:: 0.23.0 Added to DataFrame. - See Also - -------- - %(klass)s.fillna : Fill NA values. - %(klass)s.where : Replace values based on boolean condition. - Series.str.replace : Simple string replacement. - Returns ------- %(klass)s @@ -5919,6 +6196,12 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): * If a ``list`` or an ``ndarray`` is passed to `to_replace` and `value` but they are not the same length. + See Also + -------- + %(klass)s.fillna : Fill NA values. + %(klass)s.where : Replace values based on boolean condition. + Series.str.replace : Simple string replacement. + Notes ----- * Regex substitution is performed under the hood with ``re.sub``. The @@ -6033,7 +6316,7 @@ def bfill(self, axis=None, inplace=False, limit=None, downcast=None): 1 foo new 2 bait xyz - >>> df.replace(regex={r'^ba.$':'new', 'foo':'xyz'}) + >>> df.replace(regex={r'^ba.$': 'new', 'foo': 'xyz'}) A B 0 new abc 1 xyz new @@ -9831,167 +10114,6 @@ def first_valid_index(self): def last_valid_index(self): return self._find_valid_index('last') - def to_csv(self, path_or_buf=None, sep=",", na_rep='', float_format=None, - columns=None, header=True, index=True, index_label=None, - mode='w', encoding=None, compression='infer', quoting=None, - quotechar='"', line_terminator=None, chunksize=None, - tupleize_cols=None, date_format=None, doublequote=True, - escapechar=None, decimal='.'): - r""" - Write object to a comma-separated values (csv) file. - - .. versionchanged:: 0.24.0 - The order of arguments for Series was changed. - - Parameters - ---------- - path_or_buf : str or file handle, default None - File path or object, if None is provided the result is returned as - a string. - - .. versionchanged:: 0.24.0 - - Was previously named "path" for Series. - - sep : str, default ',' - String of length 1. Field delimiter for the output file. - na_rep : str, default '' - Missing data representation. - float_format : str, default None - Format string for floating point numbers. - columns : sequence, optional - Columns to write. - header : bool or list of str, default True - Write out the column names. If a list of strings is given it is - assumed to be aliases for the column names. - - .. versionchanged:: 0.24.0 - - Previously defaulted to False for Series. - - index : bool, default True - Write row names (index). - index_label : str or sequence, or False, default None - Column label for index column(s) if desired. If None is given, and - `header` and `index` are True, then the index names are used. A - sequence should be given if the object uses MultiIndex. If - False do not print fields for index names. Use index_label=False - for easier importing in R. - mode : str - Python write mode, default 'w'. - encoding : str, optional - A string representing the encoding to use in the output file, - defaults to 'ascii' on Python 2 and 'utf-8' on Python 3. 
- compression : str, default 'infer' - Compression mode among the following possible values: {'infer', - 'gzip', 'bz2', 'zip', 'xz', None}. If 'infer' and `path_or_buf` - is path-like, then detect compression from the following - extensions: '.gz', '.bz2', '.zip' or '.xz'. (otherwise no - compression). - - .. versionchanged:: 0.24.0 - - 'infer' option added and set to default. - - quoting : optional constant from csv module - Defaults to csv.QUOTE_MINIMAL. If you have set a `float_format` - then floats are converted to strings and thus csv.QUOTE_NONNUMERIC - will treat them as non-numeric. - quotechar : str, default '\"' - String of length 1. Character used to quote fields. - line_terminator : string, optional - The newline character or character sequence to use in the output - file. Defaults to `os.linesep`, which depends on the OS in which - this method is called ('\n' for linux, '\r\n' for Windows, i.e.). - - .. versionchanged:: 0.24.0 - chunksize : int or None - Rows to write at a time. - tupleize_cols : bool, default False - Write MultiIndex columns as a list of tuples (if True) or in - the new, expanded format, where each MultiIndex column is a row - in the CSV (if False). - - .. deprecated:: 0.21.0 - This argument will be removed and will always write each row - of the multi-index as a separate row in the CSV file. - date_format : str, default None - Format string for datetime objects. - doublequote : bool, default True - Control quoting of `quotechar` inside a field. - escapechar : str, default None - String of length 1. Character used to escape `sep` and `quotechar` - when appropriate. - decimal : str, default '.' - Character recognized as decimal separator. E.g. use ',' for - European data. - - Returns - ------- - None or str - If path_or_buf is None, returns the resulting csv format as a - string. Otherwise returns None. - - See Also - -------- - read_csv : Load a CSV file into a DataFrame. - to_excel : Load an Excel file into a DataFrame. - - Examples - -------- - >>> df = pd.DataFrame({'name': ['Raphael', 'Donatello'], - ... 'mask': ['red', 'purple'], - ... 
'weapon': ['sai', 'bo staff']}) - >>> df.to_csv(index=False) - 'name,mask,weapon\nRaphael,red,sai\nDonatello,purple,bo staff\n' - """ - - df = self if isinstance(self, ABCDataFrame) else self.to_frame() - - if tupleize_cols is not None: - warnings.warn("The 'tupleize_cols' parameter is deprecated and " - "will be removed in a future version", - FutureWarning, stacklevel=2) - else: - tupleize_cols = False - - from pandas.io.formats.csvs import CSVFormatter - formatter = CSVFormatter(df, path_or_buf, - line_terminator=line_terminator, sep=sep, - encoding=encoding, - compression=compression, quoting=quoting, - na_rep=na_rep, float_format=float_format, - cols=columns, header=header, index=index, - index_label=index_label, mode=mode, - chunksize=chunksize, quotechar=quotechar, - tupleize_cols=tupleize_cols, - date_format=date_format, - doublequote=doublequote, - escapechar=escapechar, decimal=decimal) - formatter.save() - - if path_or_buf is None: - return formatter.path_or_buf.getvalue() - - @Appender(_shared_docs["to_excel"] % dict(klass="object")) - def to_excel(self, excel_writer, sheet_name="Sheet1", na_rep="", - float_format=None, columns=None, header=True, index=True, - index_label=None, startrow=0, startcol=0, engine=None, - merge_cells=True, encoding=None, inf_rep="inf", verbose=True, - freeze_panes=None): - df = self if isinstance(self, ABCDataFrame) else self.to_frame() - - from pandas.io.formats.excel import ExcelFormatter - formatter = ExcelFormatter(df, na_rep=na_rep, cols=columns, - header=header, - float_format=float_format, index=index, - index_label=index_label, - merge_cells=merge_cells, - inf_rep=inf_rep) - formatter.write(excel_writer, sheet_name=sheet_name, startrow=startrow, - startcol=startcol, freeze_panes=freeze_panes, - engine=engine) - def _doc_parms(cls): """Return a tuple of the doc parms.""" diff --git a/pandas/core/groupby/base.py b/pandas/core/groupby/base.py index 1bf97690a84ed7..a148f7e0cab87b 100644 --- a/pandas/core/groupby/base.py +++ b/pandas/core/groupby/base.py @@ -12,11 +12,15 @@ class GroupByMixin(object): - """ provide the groupby facilities to the mixed object """ + """ + Provide the groupby facilities to the mixed object. + """ @staticmethod def _dispatch(name, *args, **kwargs): - """ dispatch to apply """ + """ + Dispatch to apply. + """ def outer(self, *args, **kwargs): def f(x): @@ -28,8 +32,7 @@ def f(x): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. 
Parameters ---------- diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index ada9c5d456a77f..a17e2ce7f1ef54 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -826,8 +826,9 @@ def _aggregate_multiple_funcs(self, arg, _level): for name, func in arg: obj = self if name in results: - raise SpecificationError('Function names must be unique, ' - 'found multiple named %s' % name) + raise SpecificationError( + 'Function names must be unique, found multiple named ' + '{}'.format(name)) # reset the cache so that we # only include the named selection @@ -1027,8 +1028,7 @@ def nunique(self, dropna=True): try: sorter = np.lexsort((val, ids)) except TypeError: # catches object dtypes - msg = ('val.dtype must be object, got {dtype}' - .format(dtype=val.dtype)) + msg = 'val.dtype must be object, got {}'.format(val.dtype) assert val.dtype == object, msg val, _ = algorithms.factorize(val, sort=False) sorter = np.lexsort((val, ids)) diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index b68fdf853ab192..253860d83f49e0 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -291,7 +291,7 @@ class providing the base-class of operations. class GroupByPlot(PandasObject): """ - Class implementing the .plot attribute for groupby objects + Class implementing the .plot attribute for groupby objects. """ def __init__(self, groupby): @@ -314,7 +314,7 @@ def f(self): @contextmanager def _group_selection_context(groupby): """ - set / reset the _group_selection_context + Set / reset the _group_selection_context. """ groupby._set_group_selection() yield groupby @@ -377,14 +377,16 @@ def __unicode__(self): def _assure_grouper(self): """ - we create the grouper on instantiation - sub-classes may have a different policy + We create the grouper on instantiation sub-classes may have a + different policy. """ pass @property def groups(self): - """ dict {group name -> group labels} """ + """ + Dict {group name -> group labels}. + """ self._assure_grouper() return self.grouper.groups @@ -395,14 +397,16 @@ def ngroups(self): @property def indices(self): - """ dict {group name -> group indices} """ + """ + Dict {group name -> group indices}. + """ self._assure_grouper() return self.grouper.indices def _get_indices(self, names): """ - safe get multiple indices, translate keys for - datelike to underlying repr + Safe get multiple indices, translate keys for + datelike to underlying repr. """ def get_converter(s): @@ -450,7 +454,9 @@ def get_converter(s): return [self.indices.get(name, []) for name in names] def _get_index(self, name): - """ safe get index, translate keys for datelike to underlying repr """ + """ + Safe get index, translate keys for datelike to underlying repr. + """ return self._get_indices([name])[0] @cache_readonly @@ -465,8 +471,10 @@ def _selected_obj(self): def _reset_group_selection(self): """ - Clear group based selection. Used for methods needing to return info on - each group regardless of whether a group selection was previously set. + Clear group based selection. + + Used for methods needing to return info on each group regardless of + whether a group selection was previously set. """ if self._group_selection is not None: # GH12839 clear cached selection too when changing group selection @@ -475,8 +483,9 @@ def _reset_group_selection(self): def _set_group_selection(self): """ - Create group based selection. Used when selection is not passed - directly but instead via a grouper. 
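For the ``get_group`` docstring touched above, a small usage sketch on illustrative data (assuming ``import pandas as pd``):

>>> import pandas as pd
>>> df = pd.DataFrame({'A': ['x', 'x', 'y'], 'B': [1, 2, 3]})
>>> df.groupby('A').get_group('x')['B'].tolist()
[1, 2]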
+ Create group based selection. + + Used when selection is not passed directly but instead via a grouper. NOTE: this should be paired with a call to _reset_group_selection """ @@ -617,7 +626,7 @@ def curried(x): def get_group(self, name, obj=None): """ - Constructs NDFrame from group with provided name + Constructs NDFrame from group with provided name. Parameters ---------- @@ -643,7 +652,7 @@ def get_group(self, name, obj=None): def __iter__(self): """ - Groupby iterator + Groupby iterator. Returns ------- @@ -743,11 +752,11 @@ def _cumcount_array(self, ascending=True): def _try_cast(self, result, obj, numeric_only=False): """ - try to cast the result to our obj original type, - we may have roundtripped thru object in the mean-time + Try to cast the result to our obj original type, + we may have roundtripped through object in the mean-time. - if numeric_only is True, then only try to cast numerics - and not datetimelikes + If numeric_only is True, then only try to cast numerics + and not datetimelikes. """ if obj.ndim > 1: @@ -945,8 +954,9 @@ def _apply_filter(self, indices, dropna): class GroupBy(_GroupBy): """ - Class for grouping and aggregating relational data. See aggregate, - transform, and apply functions on this object. + Class for grouping and aggregating relational data. + + See aggregate, transform, and apply functions on this object. It's easiest to use obj.groupby(...) to use GroupBy, but you can also do: @@ -1010,7 +1020,9 @@ class GroupBy(_GroupBy): Number of groups """ def _bool_agg(self, val_test, skipna): - """Shared func to call any / all Cython GroupBy implementations""" + """ + Shared func to call any / all Cython GroupBy implementations. + """ def objs_to_bool(vals): try: @@ -1036,7 +1048,7 @@ def result_to_bool(result): @Appender(_doc_template) def any(self, skipna=True): """ - Returns True if any value in the group is truthful, else False + Returns True if any value in the group is truthful, else False. Parameters ---------- @@ -1049,7 +1061,7 @@ def any(self, skipna=True): @Appender(_doc_template) def all(self, skipna=True): """ - Returns True if all values in the group are truthful, else False + Returns True if all values in the group are truthful, else False. Parameters ---------- @@ -1061,7 +1073,9 @@ def all(self, skipna=True): @Substitution(name='groupby') @Appender(_doc_template) def count(self): - """Compute count of group, excluding missing values""" + """ + Compute count of group, excluding missing values. + """ # defined here for API doc raise NotImplementedError @@ -1127,7 +1141,7 @@ def mean(self, *args, **kwargs): @Appender(_doc_template) def median(self, **kwargs): """ - Compute median of groups, excluding missing values + Compute median of groups, excluding missing values. For multiple groupings, the result index will be a MultiIndex """ @@ -1148,9 +1162,9 @@ def f(x): @Appender(_doc_template) def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values + Compute standard deviation of groups, excluding missing values. - For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. Parameters ---------- @@ -1166,9 +1180,9 @@ def std(self, ddof=1, *args, **kwargs): @Appender(_doc_template) def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values + Compute variance of groups, excluding missing values. 
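A short sketch of the per-group ``var``/``std`` behaviour documented above (illustrative data, assuming ``import pandas as pd``); ``ddof=1`` is the default, so a two-element group of 1.0 and 3.0 has variance 2.0:

>>> import pandas as pd
>>> df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'], 'v': [1.0, 3.0, 2.0, 2.0]})
>>> df.groupby('g')['v'].var().tolist()
[2.0, 0.0]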
- For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. Parameters ---------- @@ -1192,9 +1206,9 @@ def var(self, ddof=1, *args, **kwargs): @Appender(_doc_template) def sem(self, ddof=1): """ - Compute standard error of the mean of groups, excluding missing values + Compute standard error of the mean of groups, excluding missing values. - For multiple groupings, the result index will be a MultiIndex + For multiple groupings, the result index will be a MultiIndex. Parameters ---------- @@ -1207,7 +1221,9 @@ def sem(self, ddof=1): @Substitution(name='groupby') @Appender(_doc_template) def size(self): - """Compute group sizes""" + """ + Compute group sizes. + """ result = self.grouper.size() if isinstance(self.obj, Series): @@ -1216,7 +1232,9 @@ def size(self): @classmethod def _add_numeric_operations(cls): - """ add numeric operations to the GroupBy generically """ + """ + Add numeric operations to the GroupBy generically. + """ def groupby_function(name, alias, npfunc, numeric_only=True, _convert=False, @@ -1293,7 +1311,8 @@ def last(x): @Appender(_doc_template) def ohlc(self): """ - Compute sum of values, excluding missing values + Compute sum of values, excluding missing values. + For multiple groupings, the result index will be a MultiIndex """ @@ -1421,9 +1440,7 @@ def resample(self, rule, *args, **kwargs): @Appender(_doc_template) def rolling(self, *args, **kwargs): """ - Return a rolling grouper, providing rolling - functionality per group - + Return a rolling grouper, providing rolling functionality per group. """ from pandas.core.window import RollingGroupby return RollingGroupby(self, *args, **kwargs) @@ -1433,14 +1450,14 @@ def rolling(self, *args, **kwargs): def expanding(self, *args, **kwargs): """ Return an expanding grouper, providing expanding - functionality per group - + functionality per group. """ from pandas.core.window import ExpandingGroupby return ExpandingGroupby(self, *args, **kwargs) def _fill(self, direction, limit=None): - """Shared function for `pad` and `backfill` to call Cython method + """ + Shared function for `pad` and `backfill` to call Cython method. Parameters ---------- @@ -1474,7 +1491,7 @@ def _fill(self, direction, limit=None): @Substitution(name='groupby') def pad(self, limit=None): """ - Forward fill the values + Forward fill the values. Parameters ---------- @@ -1494,7 +1511,7 @@ def pad(self, limit=None): @Substitution(name='groupby') def backfill(self, limit=None): """ - Backward fill the values + Backward fill the values. Parameters ---------- @@ -1627,7 +1644,8 @@ def nth(self, n, dropna=None): # just returns NaN raise ValueError("For a DataFrame groupby, dropna must be " "either None, 'any' or 'all', " - "(was passed %s)." % (dropna),) + "(was passed {dropna}).".format( + dropna=dropna)) # old behaviour, but with all and any support for DataFrames. # modified in GH 7559 to have better perf @@ -1830,7 +1848,9 @@ def rank(self, method='average', ascending=True, na_option='keep', @Substitution(name='groupby') @Appender(_doc_template) def cumprod(self, axis=0, *args, **kwargs): - """Cumulative product for each group""" + """ + Cumulative product for each group. 
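The ``pad``/``backfill`` docstrings above can be illustrated with a minimal sketch (illustrative data, assuming ``import pandas as pd``); filling is applied within each group and never crosses group boundaries:

>>> import pandas as pd
>>> df = pd.DataFrame({'g': ['a', 'a', 'b', 'b'],
...                    'v': [1.0, None, 2.0, None]})
>>> df.groupby('g')['v'].ffill().tolist()
[1.0, 1.0, 2.0, 2.0]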
+ """ nv.validate_groupby_func('cumprod', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: @@ -1841,7 +1861,9 @@ def cumprod(self, axis=0, *args, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cumsum(self, axis=0, *args, **kwargs): - """Cumulative sum for each group""" + """ + Cumulative sum for each group. + """ nv.validate_groupby_func('cumsum', args, kwargs, ['numeric_only', 'skipna']) if axis != 0: @@ -1852,7 +1874,9 @@ def cumsum(self, axis=0, *args, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cummin(self, axis=0, **kwargs): - """Cumulative min for each group""" + """ + Cumulative min for each group. + """ if axis != 0: return self.apply(lambda x: np.minimum.accumulate(x, axis)) @@ -1861,7 +1885,9 @@ def cummin(self, axis=0, **kwargs): @Substitution(name='groupby') @Appender(_doc_template) def cummax(self, axis=0, **kwargs): - """Cumulative max for each group""" + """ + Cumulative max for each group. + """ if axis != 0: return self.apply(lambda x: np.maximum.accumulate(x, axis)) @@ -1873,7 +1899,8 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, result_is_index=False, pre_processing=None, post_processing=None, **kwargs): - """Get result for Cythonized functions + """ + Get result for Cythonized functions. Parameters ---------- @@ -1968,7 +1995,7 @@ def _get_cythonized_result(self, how, grouper, aggregate=False, @Appender(_doc_template) def shift(self, periods=1, freq=None, axis=0): """ - Shift each group by periods observations + Shift each group by periods observations. Parameters ---------- @@ -1991,7 +2018,9 @@ def shift(self, periods=1, freq=None, axis=0): @Appender(_doc_template) def pct_change(self, periods=1, fill_method='pad', limit=None, freq=None, axis=0): - """Calculate pct_change of each value to previous entry in group""" + """ + Calculate pct_change of each value to previous entry in group. + """ if freq is not None or axis != 0: return self.apply(lambda x: x.pct_change(periods=periods, fill_method=fill_method, @@ -2035,7 +2064,7 @@ def head(self, n=5): @Appender(_doc_template) def tail(self, n=5): """ - Returns last n rows of each group + Returns last n rows of each group. Essentially equivalent to ``.apply(lambda x: x.tail(n))``, except ignores as_index flag. 
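A compact sketch of the cumulative and shifting group operations documented above (illustrative data, assuming ``import pandas as pd``); both respect group boundaries:

>>> import pandas as pd
>>> df = pd.DataFrame({'g': ['a', 'a', 'b'], 'v': [1, 2, 3]})
>>> df.groupby('g')['v'].cumsum().tolist()
[1, 3, 3]
>>> df.groupby('g')['v'].shift(1).tolist()
[nan, 1.0, nan]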
@@ -2071,6 +2100,6 @@ def groupby(obj, by, **kwds): from pandas.core.groupby.generic import DataFrameGroupBy klass = DataFrameGroupBy else: # pragma: no cover - raise TypeError('invalid type: %s' % type(obj)) + raise TypeError('invalid type: {}'.format(obj)) return klass(obj, by, **kwds) diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index b49bc5ee5950f9..d8df227d4911a0 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -257,7 +257,7 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, if level is not None: if not isinstance(level, int): if level not in index.names: - raise AssertionError('Level %s not in index' % str(level)) + raise AssertionError('Level {} not in index'.format(level)) level = index.names.index(level) if self.name is None: @@ -317,7 +317,8 @@ def __init__(self, index, grouper=None, obj=None, name=None, level=None, (Series, Index, ExtensionArray, np.ndarray)): if getattr(self.grouper, 'ndim', 1) != 1: t = self.name or str(type(self.grouper)) - raise ValueError("Grouper for '%s' not 1-dimensional" % t) + raise ValueError( + "Grouper for '{}' not 1-dimensional".format(t)) self.grouper = self.index.map(self.grouper) if not (hasattr(self.grouper, "__len__") and len(self.grouper) == len(self.index)): @@ -460,8 +461,8 @@ def _get_grouper(obj, key=None, axis=0, level=None, sort=True, if isinstance(level, compat.string_types): if obj.index.name != level: - raise ValueError('level name %s is not the name of the ' - 'index' % level) + raise ValueError('level name {} is not the name of the ' + 'index'.format(level)) elif level > 0 or level < -1: raise ValueError('level > 0 or level < -1 only valid with ' ' MultiIndex') diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index 125bd9a5e855d9..8455c03953ad15 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -7,7 +7,6 @@ """ import collections -import copy import numpy as np @@ -380,7 +379,8 @@ def get_func(fname): # otherwise find dtype-specific version, falling back to object for dt in [dtype_str, 'object']: - f = getattr(libgroupby, "%s_%s" % (fname, dtype_str), None) + f = getattr(libgroupby, "{fname}_{dtype_str}".format( + fname=fname, dtype_str=dtype_str), None) if f is not None: return f @@ -403,9 +403,11 @@ def wrapper(*args, **kwargs): func = get_func(ftype) if func is None: - raise NotImplementedError("function is not implemented for this" - "dtype: [how->%s,dtype->%s]" % - (how, dtype_str)) + raise NotImplementedError( + "function is not implemented for this dtype: " + "[how->{how},dtype->{dtype_str}]".format(how=how, + dtype_str=dtype_str)) + return func def _cython_operation(self, kind, values, how, axis, min_count=-1, @@ -485,7 +487,8 @@ def _cython_operation(self, kind, values, how, axis, min_count=-1, out_dtype = 'float' else: if is_numeric: - out_dtype = '%s%d' % (values.dtype.kind, values.dtype.itemsize) + out_dtype = '{kind}{itemsize}'.format( + kind=values.dtype.kind, itemsize=values.dtype.itemsize) else: out_dtype = 'object' @@ -675,10 +678,8 @@ def groups(self): # this is mainly for compat # GH 3881 - result = {} - for key, value in zip(self.binlabels, self.bins): - if key is not NaT: - result[key] = value + result = {key: value for key, value in zip(self.binlabels, self.bins) + if key is not NaT} return result @property @@ -739,10 +740,6 @@ def group_info(self): obs_group_ids.astype('int64', copy=False), ngroups) - @cache_readonly - def ngroups(self): - return len(self.result_index) - 
@cache_readonly def result_index(self): if len(self.binlabels) != 0 and isna(self.binlabels[0]): @@ -769,11 +766,6 @@ def agg_series(self, obj, func): grouper = reduction.SeriesBinGrouper(obj, func, self.bins, dummy) return grouper.get_result() - # ---------------------------------------------------------------------- - # cython aggregation - - _cython_functions = copy.deepcopy(BaseGrouper._cython_functions) - def _get_axes(group): if isinstance(group, Series): @@ -853,9 +845,6 @@ def _chop(self, sdata, slice_obj): class FrameSplitter(DataSplitter): - def __init__(self, data, labels, ngroups, axis=0): - super(FrameSplitter, self).__init__(data, labels, ngroups, axis=axis) - def fast_apply(self, f, names): # must return keys::list, values::list, mutated::bool try: diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index 28aefb652adb0c..a5b8e22070923e 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -150,8 +150,9 @@ class InvalidIndexError(Exception): def _new_Index(cls, d): - """ This is called upon unpickling, rather than the default which doesn't - have arguments and breaks __new__ + """ + This is called upon unpickling, rather than the default which doesn't + have arguments and breaks __new__. """ # required for backward compat, because PI can't be instantiated with # ordinals through __new__ GH #13277 @@ -164,7 +165,7 @@ def _new_Index(cls, d): class Index(IndexOpsMixin, PandasObject): """ Immutable ndarray implementing an ordered, sliceable set. The basic object - storing axis labels for all pandas objects + storing axis labels for all pandas objects. Parameters ---------- @@ -243,6 +244,9 @@ def _outer_indexer(self, left, right): str = CachedAccessor("str", StringMethods) + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, dtype=None, copy=False, name=None, fastpath=None, tupleize_cols=True, **kwargs): @@ -490,8 +494,8 @@ def __new__(cls, data=None, dtype=None, copy=False, name=None, @classmethod def _simple_new(cls, values, name=None, dtype=None, **kwargs): """ - we require the we have a dtype compat for the values - if we are passed a non-dtype compat, then coerce using the constructor + We require that we have a dtype compat for the values. If we are passed + a non-dtype compat, then coerce using the constructor. Must be careful not to recurse. """ @@ -517,10 +521,23 @@ def _simple_new(cls, values, name=None, dtype=None, **kwargs): setattr(result, k, v) return result._reset_identity() + @cache_readonly + def _constructor(self): + return type(self) + + # -------------------------------------------------------------------- + # Index Internals Methods + + def _get_attributes_dict(self): + """ + Return an attributes dict for my class. + """ + return {k: getattr(self, k, None) for k in self._attributes} + _index_shared_docs['_shallow_copy'] = """ - create a new Index with the same class as the caller, don't copy the + Create a new Index with the same class as the caller, don't copy the data, use the same object attributes with passed in attributes taking - precedence + precedence. 
*this is an internal non-public method* @@ -550,9 +567,9 @@ def _shallow_copy(self, values=None, **kwargs): def _shallow_copy_with_infer(self, values, **kwargs): """ - create a new Index inferring the class with passed value, don't copy + Create a new Index inferring the class with passed value, don't copy the data, use the same object attributes with passed in attributes - taking precedence + taking precedence. *this is an internal non-public method* @@ -575,11 +592,11 @@ def _shallow_copy_with_infer(self, values, **kwargs): def _deepcopy_if_needed(self, orig, copy=False): """ - .. versionadded:: 0.19.0 - Make a copy of self if data coincides (in memory) with orig. Subclasses should override this if self._base is not an ndarray. + .. versionadded:: 0.19.0 + Parameters ---------- orig : ndarray @@ -607,43 +624,9 @@ def _update_inplace(self, result, **kwargs): # guard when called from IndexOpsMixin raise TypeError("Index can't be updated inplace") - def _sort_levels_monotonic(self): - """ compat with MultiIndex """ - return self - - _index_shared_docs['_get_grouper_for_level'] = """ - Get index grouper corresponding to an index level - - Parameters - ---------- - mapper: Group mapping function or None - Function mapping index values to groups - level : int or None - Index level - - Returns - ------- - grouper : Index - Index of values to group on - labels : ndarray of int or None - Array of locations in level_index - uniques : Index or None - Index of unique values for level - """ - - @Appender(_index_shared_docs['_get_grouper_for_level']) - def _get_grouper_for_level(self, mapper, level=None): - assert level is None or level == 0 - if mapper is None: - grouper = self - else: - grouper = self.map(mapper) - - return grouper, None, None - def is_(self, other): """ - More flexible, faster check like ``is`` but that works through views + More flexible, faster check like ``is`` but that works through views. Note: this is *not* the same as ``Index.identical()``, which checks that metadata is also the same. @@ -662,24 +645,39 @@ def is_(self, other): other, '_id', Ellipsis) and self._id is not None def _reset_identity(self): - """Initializes or resets ``_id`` attribute with new object""" + """ + Initializes or resets ``_id`` attribute with new object. + """ self._id = _Identity() return self + def _cleanup(self): + self._engine.clear_mapping() + + @cache_readonly + def _engine(self): + # property, for now, slow to look up + return self._engine_type(lambda: self._ndarray_values, len(self)) + + # -------------------------------------------------------------------- + # Array-Like Methods + # ndarray compat def __len__(self): """ - return the length of the Index + Return the length of the Index. """ return len(self._data) def __array__(self, dtype=None): - """ the array interface, return my values """ + """ + The array interface, return my values. + """ return self._data.view(np.ndarray) def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc + Gets called after a ufunc. """ if is_bool_dtype(result): return result @@ -690,102 +688,141 @@ def __array_wrap__(self, result, context=None): @cache_readonly def dtype(self): - """ return the dtype object of the underlying data """ + """ + Return the dtype object of the underlying data. + """ return self._data.dtype @cache_readonly def dtype_str(self): - """ return the dtype str of the underlying data """ + """ + Return the dtype str of the underlying data. 
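As a quick check of the ``dtype``/``dtype_str`` accessors documented above (assuming ``import pandas as pd``):

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3])
>>> idx.dtype
dtype('int64')
>>> idx.dtype_str
'int64'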
+ """ return str(self.dtype) - @property - def values(self): - """ return the underlying data as an ndarray """ - return self._data.view(np.ndarray) - - @property - def _values(self): - # type: () -> Union[ExtensionArray, Index, np.ndarray] - # TODO(EA): remove index types as they become extension arrays - """The best array representation. + def ravel(self, order='C'): + """ + Return an ndarray of the flattened values of the underlying data. - This is an ndarray, ExtensionArray, or Index subclass. This differs - from ``_ndarray_values``, which always returns an ndarray. + See Also + -------- + numpy.ndarray.ravel + """ + return self._ndarray_values.ravel(order=order) - Both ``_values`` and ``_ndarray_values`` are consistent between - ``Series`` and ``Index``. + def view(self, cls=None): - It may differ from the public '.values' method. + # we need to see if we are subclassing an + # index type here + if cls is not None and not hasattr(cls, '_typ'): + result = self._data.view(cls) + else: + result = self._shallow_copy() + if isinstance(result, Index): + result._id = self._id + return result - index | values | _values | _ndarray_values | - ----------------- | --------------- | ------------- | --------------- | - Index | ndarray | ndarray | ndarray | - CategoricalIndex | Categorical | Categorical | ndarray[int] | - DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | - DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | - PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | - IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | + _index_shared_docs['astype'] = """ + Create an Index with values cast to dtypes. The class of a new Index + is determined by dtype. When conversion is impossible, a ValueError + exception is raised. - See Also - -------- - values - _ndarray_values - """ - return self.values + Parameters + ---------- + dtype : numpy dtype or pandas type + copy : bool, default True + By default, astype always returns a newly allocated object. + If copy is set to False and internal requirements on dtype are + satisfied, the original data is used to create a new Index + or the original Index is returned. - def get_values(self): + .. versionadded:: 0.19.0 """ - Return `Index` data as an `numpy.ndarray`. - Returns - ------- - numpy.ndarray - A one-dimensional numpy array of the `Index` values. + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + if is_dtype_equal(self.dtype, dtype): + return self.copy() if copy else self - See Also - -------- - Index.values : The attribute that get_values wraps. + elif is_categorical_dtype(dtype): + from .category import CategoricalIndex + return CategoricalIndex(self.values, name=self.name, dtype=dtype, + copy=copy) - Examples - -------- - Getting the `Index` values of a `DataFrame`: + elif is_extension_array_dtype(dtype): + return Index(np.asarray(self), dtype=dtype, copy=copy) - >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], - ... 
index=['a', 'b', 'c'], columns=['A', 'B', 'C']) - >>> df - A B C - a 1 2 3 - b 4 5 6 - c 7 8 9 - >>> df.index.get_values() - array(['a', 'b', 'c'], dtype=object) + try: + if is_datetime64tz_dtype(dtype): + from pandas import DatetimeIndex + return DatetimeIndex(self.values, name=self.name, dtype=dtype, + copy=copy) + return Index(self.values.astype(dtype, copy=copy), name=self.name, + dtype=dtype) + except (TypeError, ValueError): + msg = 'Cannot cast {name} to dtype {dtype}' + raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - Standalone `Index` values: + _index_shared_docs['take'] = """ + Return a new %(klass)s of the values selected by the indices. - >>> idx = pd.Index(['1', '2', '3']) - >>> idx.get_values() - array(['1', '2', '3'], dtype=object) + For internal compatibility with numpy arrays. - `MultiIndex` arrays also have only one dimension: + Parameters + ---------- + indices : list + Indices to be taken + axis : int, optional + The axis over which to select values, always 0. + allow_fill : bool, default True + fill_value : bool, default None + If allow_fill=True and fill_value is not None, indices specified by + -1 is regarded as NA. If Index doesn't hold NA, raise ValueError - >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], - ... names=('number', 'letter')) - >>> midx.get_values() - array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) - >>> midx.get_values().ndim - 1 + See Also + -------- + numpy.ndarray.take """ - return self.values - @Appender(IndexOpsMixin.memory_usage.__doc__) - def memory_usage(self, deep=False): - result = super(Index, self).memory_usage(deep=deep) + @Appender(_index_shared_docs['take'] % _index_doc_kwargs) + def take(self, indices, axis=0, allow_fill=True, + fill_value=None, **kwargs): + if kwargs: + nv.validate_take(tuple(), kwargs) + indices = ensure_platform_int(indices) + if self._can_hold_na: + taken = self._assert_take_fillable(self.values, indices, + allow_fill=allow_fill, + fill_value=fill_value, + na_value=self._na_value) + else: + if allow_fill and fill_value is not None: + msg = 'Unable to fill values because {0} cannot contain NA' + raise ValueError(msg.format(self.__class__.__name__)) + taken = self.values.take(indices) + return self._shallow_copy(taken) - # include our engine hashtable - result += self._engine.sizeof(deep=deep) - return result + def _assert_take_fillable(self, values, indices, allow_fill=True, + fill_value=None, na_value=np.nan): + """ + Internal method to handle NA filling of take. + """ + indices = ensure_platform_int(indices) + + # only fill if we are passing a non-None fill_value + if allow_fill and fill_value is not None: + if (indices < -1).any(): + msg = ('When allow_fill=True and fill_value is not None, ' + 'all indices must be >= -1') + raise ValueError(msg) + taken = algos.take(values, + indices, + allow_fill=allow_fill, + fill_value=na_value) + else: + taken = values.take(indices) + return taken - # ops compat def repeat(self, repeats, *args, **kwargs): """ Repeat elements of an Index. @@ -824,169 +861,22 @@ def repeat(self, repeats, *args, **kwargs): nv.validate_repeat(args, kwargs) return self._shallow_copy(self._values.repeat(repeats)) - _index_shared_docs['where'] = """ - .. versionadded:: 0.19.0 + # -------------------------------------------------------------------- + # Copying Methods - Return an Index of same shape as self and whose corresponding - entries are from self where cond is True and otherwise are from - other. 
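A minimal sketch of ``Index.take`` and ``Index.repeat`` as documented above (illustrative data, assuming ``import pandas as pd``):

>>> import pandas as pd
>>> idx = pd.Index(['a', 'b', 'c'])
>>> idx.take([2, 0])
Index(['c', 'a'], dtype='object')
>>> idx.repeat(2)
Index(['a', 'a', 'b', 'b', 'c', 'c'], dtype='object')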
+ _index_shared_docs['copy'] = """ + Make a copy of this object. Name and dtype sets those attributes on + the new object. Parameters ---------- - cond : boolean array-like with the same length as self - other : scalar, or array-like - """ + name : string, optional + deep : boolean, default False + dtype : numpy dtype or pandas type - @Appender(_index_shared_docs['where']) - def where(self, cond, other=None): - if other is None: - other = self._na_value - - dtype = self.dtype - values = self.values - - if is_bool(other) or is_bool_dtype(other): - - # bools force casting - values = values.astype(object) - dtype = None - - values = np.where(cond, values, other) - - if self._is_numeric_dtype and np.any(isna(values)): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return self._shallow_copy_with_infer(values, dtype=dtype) - - def ravel(self, order='C'): - """ - return an ndarray of the flattened values of the underlying data - - See Also - -------- - numpy.ndarray.ravel - """ - return self._ndarray_values.ravel(order=order) - - # construction helpers - @classmethod - def _try_convert_to_int_index(cls, data, copy, name, dtype): - """ - Attempt to convert an array of data into an integer index. - - Parameters - ---------- - data : The data to convert. - copy : Whether to copy the data or not. - name : The name of the index returned. - - Returns - ------- - int_index : data converted to either an Int64Index or a - UInt64Index - - Raises - ------ - ValueError if the conversion was not successful. - """ - - from .numeric import Int64Index, UInt64Index - if not is_unsigned_integer_dtype(dtype): - # skip int64 conversion attempt if uint-like dtype is passed, as - # this could return Int64Index when UInt64Index is what's desrired - try: - res = data.astype('i8', copy=False) - if (res == data).all(): - return Int64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - # Conversion to int64 failed (possibly due to overflow) or was skipped, - # so let's try now with uint64. - try: - res = data.astype('u8', copy=False) - if (res == data).all(): - return UInt64Index(res, copy=copy, name=name) - except (OverflowError, TypeError, ValueError): - pass - - raise ValueError - - @classmethod - def _scalar_data_error(cls, data): - raise TypeError('{0}(...) must be called with a collection of some ' - 'kind, {1} was passed'.format(cls.__name__, - repr(data))) - - @classmethod - def _string_data_error(cls, data): - raise TypeError('String dtype not supported, you may need ' - 'to explicitly cast to a numeric type') - - @classmethod - def _coerce_to_ndarray(cls, data): - """coerces data to ndarray, raises on scalar data. Converts other - iterables to list first and then to array. Does not touch ndarrays. 
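The ``where`` semantics described above, in a short sketch (assuming ``import pandas as pd``); entries failing the condition are replaced by the NA value, so an integer index comes back as float:

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3])
>>> idx.where(idx > 1).tolist()
[nan, 2.0, 3.0]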
- """ - - if not isinstance(data, (np.ndarray, Index)): - if data is None or is_scalar(data): - cls._scalar_data_error(data) - - # other iterable of some kind - if not isinstance(data, (ABCSeries, list, tuple)): - data = list(data) - data = np.asarray(data) - return data - - def _get_attributes_dict(self): - """ return an attributes dict for my class """ - return {k: getattr(self, k, None) for k in self._attributes} - - def view(self, cls=None): - - # we need to see if we are subclassing an - # index type here - if cls is not None and not hasattr(cls, '_typ'): - result = self._data.view(cls) - else: - result = self._shallow_copy() - if isinstance(result, Index): - result._id = self._id - return result - - def _coerce_scalar_to_index(self, item): - """ - we need to coerce a scalar to a compat for our index type - - Parameters - ---------- - item : scalar item to coerce - """ - dtype = self.dtype - - if self._is_numeric_dtype and isna(item): - # We can't coerce to the numeric dtype of "self" (unless - # it's float) if there are NaN values in our output. - dtype = None - - return Index([item], dtype=dtype, **self._get_attributes_dict()) - - _index_shared_docs['copy'] = """ - Make a copy of this object. Name and dtype sets those attributes on - the new object. - - Parameters - ---------- - name : string, optional - deep : boolean, default False - dtype : numpy dtype or pandas type - - Returns - ------- - copy : Index + Returns + ------- + copy : Index Notes ----- @@ -1023,24 +913,8 @@ def __deepcopy__(self, memo=None): memo = {} return self.copy(deep=True) - def _validate_names(self, name=None, names=None, deep=False): - """ - Handles the quirks of having a singular 'name' parameter for general - Index and plural 'names' parameter for MultiIndex. - """ - from copy import deepcopy - if names is not None and name is not None: - raise TypeError("Can only provide one of `names` and `name`") - elif names is None and name is None: - return deepcopy(self.names) if deep else self.names - elif names is not None: - if not is_list_like(names): - raise TypeError("Must pass list-like as `names`.") - return names - else: - if not is_list_like(name): - return [name] - return name + # -------------------------------------------------------------------- + # Rendering Methods def __unicode__(self): """ @@ -1078,13 +952,13 @@ def _format_space(self): @property def _formatter_func(self): """ - Return the formatter function + Return the formatter function. """ return default_pprint def _format_data(self, name=None): """ - Return the formatted data as a unicode string + Return the formatted data as a unicode string. """ # do we want to justify (only do so for non-objects) @@ -1097,60 +971,188 @@ def _format_data(self, name=None): def _format_attrs(self): """ - Return a list of tuples of the (attr,formatted_value) + Return a list of tuples of the (attr,formatted_value). """ return format_object_attrs(self) - def to_flat_index(self): + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.values + + def format(self, name=False, formatter=None, **kwargs): """ - Identity method. + Render a string representation of the Index. + """ + header = [] + if name: + header.append(pprint_thing(self.name, + escape_chars=('\t', '\r', '\n')) if + self.name is not None else '') - .. versionadded:: 0.24.0 + if formatter is not None: + return header + list(self.map(formatter)) - This is implemented for compatability with subclass implementations - when chaining. 
+ return self._format_with_header(header, **kwargs) - Returns - ------- - pd.Index - Caller. + def _format_with_header(self, header, na_rep='NaN', **kwargs): + values = self.values - See Also - -------- - MultiIndex.to_flat_index : Subclass implementation. - """ - return self + from pandas.io.formats.format import format_array - def to_series(self, index=None, name=None): + if is_categorical_dtype(values.dtype): + values = np.array(values) + + elif is_object_dtype(values.dtype): + values = lib.maybe_convert_objects(values, safe=1) + + if is_object_dtype(values.dtype): + result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) + for x in values] + + # could have nans + mask = isna(values) + if mask.any(): + result = np.array(result) + result[mask] = na_rep + result = result.tolist() + + else: + result = _trim_front(format_array(values, None, justify='left')) + return header + result + + def to_native_types(self, slicer=None, **kwargs): """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index + Format specified values of `self` and return them. Parameters ---------- - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. If None, defaults to name of original - index + slicer : int, array-like + An indexer into `self` that specifies which values + are used in the formatting process. + kwargs : dict + Options for specifying how the values should be formatted. + These options include the following: - Returns - ------- - Series : dtype will be based on the type of the Index values. + 1) na_rep : str + The value that serves as a placeholder for NULL values + 2) quoting : bool or None + Whether or not there are quoted values in `self` + 3) date_format : str + The format used to represent date-like values """ - from pandas import Series + values = self + if slicer is not None: + values = values[slicer] + return values._format_native_types(**kwargs) - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name + def _format_native_types(self, na_rep='', quoting=None, **kwargs): + """ + Actually format specific types of the index. + """ + mask = isna(self) + if not self.is_object() and not quoting: + values = np.asarray(self).astype(str) + else: + values = np.array(self, dtype=object, copy=True) - return Series(self.values.copy(), index=index, name=name) + values[mask] = na_rep + return values - def to_frame(self, index=True, name=None): + def _summary(self, name=None): """ - Create a DataFrame with a column containing the Index. + Return a summarized representation. + + Parameters + ---------- + name : str + name to use in the summary representation + + Returns + ------- + String with a summarized representation of the index + """ + if len(self) > 0: + head = self[0] + if (hasattr(head, 'format') and + not isinstance(head, compat.string_types)): + head = head.format() + tail = self[-1] + if (hasattr(tail, 'format') and + not isinstance(tail, compat.string_types)): + tail = tail.format() + index_summary = ', %s to %s' % (pprint_thing(head), + pprint_thing(tail)) + else: + index_summary = '' + + if name is None: + name = type(self).__name__ + return '%s: %s entries%s' % (name, len(self), index_summary) + + def summary(self, name=None): + """ + Return a summarized representation. + + .. 
deprecated:: 0.23.0 + """ + warnings.warn("'summary' is deprecated and will be removed in a " + "future version.", FutureWarning, stacklevel=2) + return self._summary(name) + + # -------------------------------------------------------------------- + # Conversion Methods + + def to_flat_index(self): + """ + Identity method. + + .. versionadded:: 0.24.0 + + This is implemented for compatability with subclass implementations + when chaining. + + Returns + ------- + pd.Index + Caller. + + See Also + -------- + MultiIndex.to_flat_index : Subclass implementation. + """ + return self + + def to_series(self, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index. + + Parameters + ---------- + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series : dtype will be based on the type of the Index values. + """ + + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + return Series(self.values.copy(), index=index, name=name) + + def to_frame(self, index=True, name=None): + """ + Create a DataFrame with a column containing the Index. .. versionadded:: 0.24.0 @@ -1209,77 +1211,27 @@ def to_frame(self, index=True, name=None): result.index = self return result - _index_shared_docs['astype'] = """ - Create an Index with values cast to dtypes. The class of a new Index - is determined by dtype. When conversion is impossible, a ValueError - exception is raised. - - Parameters - ---------- - dtype : numpy dtype or pandas type - copy : bool, default True - By default, astype always returns a newly allocated object. - If copy is set to False and internal requirements on dtype are - satisfied, the original data is used to create a new Index - or the original Index is returned. + # -------------------------------------------------------------------- + # Name-Centric Methods - .. versionadded:: 0.19.0 + def _validate_names(self, name=None, names=None, deep=False): """ - - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - if is_dtype_equal(self.dtype, dtype): - return self.copy() if copy else self - - elif is_categorical_dtype(dtype): - from .category import CategoricalIndex - return CategoricalIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - - elif is_extension_array_dtype(dtype): - return Index(np.asarray(self), dtype=dtype, copy=copy) - - try: - if is_datetime64tz_dtype(dtype): - from pandas import DatetimeIndex - return DatetimeIndex(self.values, name=self.name, dtype=dtype, - copy=copy) - return Index(self.values.astype(dtype, copy=copy), name=self.name, - dtype=dtype) - except (TypeError, ValueError): - msg = 'Cannot cast {name} to dtype {dtype}' - raise TypeError(msg.format(name=type(self).__name__, dtype=dtype)) - - def _to_safe_for_reshape(self): - """ convert to object if we are a categorical """ - return self - - def _assert_can_do_setop(self, other): - if not is_list_like(other): - raise TypeError('Input must be Index or array-like') - return True - - def _convert_can_do_setop(self, other): - if not isinstance(other, Index): - other = Index(other, name=self.name) - result_name = self.name + Handles the quirks of having a singular 'name' parameter for general + Index and plural 'names' parameter for MultiIndex. 
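A small sketch of ``Index.to_series`` as documented above (illustrative data, assuming ``import pandas as pd``); both the values and the index of the result come from the original Index:

>>> import pandas as pd
>>> idx = pd.Index(['Ant', 'Bear'], name='animal')
>>> s = idx.to_series()
>>> s.tolist()
['Ant', 'Bear']
>>> list(s.index)
['Ant', 'Bear']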
+ """ + from copy import deepcopy + if names is not None and name is not None: + raise TypeError("Can only provide one of `names` and `name`") + elif names is None and name is None: + return deepcopy(self.names) if deep else self.names + elif names is not None: + if not is_list_like(names): + raise TypeError("Must pass list-like as `names`.") + return names else: - result_name = get_op_result_name(self, other) - return other, result_name - - def _convert_for_op(self, value): - """ Convert value to be insertable to ndarray """ - return value - - def _assert_can_do_op(self, value): - """ Check value is valid for scalar op """ - if not is_scalar(value): - msg = "'value' must be a scalar, passed: {0}" - raise TypeError(msg.format(type(value).__name__)) - - @property - def nlevels(self): - return 1 + if not is_list_like(name): + return [name] + return name def _get_names(self): return FrozenList((self.name, )) @@ -1438,77 +1390,213 @@ def rename(self, name, inplace=False): """ return self.set_names([name], inplace=inplace) + # -------------------------------------------------------------------- + # Level-Centric Methods + @property - def _has_complex_internals(self): - # to disable groupby tricks in MultiIndex - return False + def nlevels(self): + return 1 - def _summary(self, name=None): + def _sort_levels_monotonic(self): """ - Return a summarized representation - - Parameters - ---------- - name : str - name to use in the summary representation + Compat with MultiIndex. + """ + return self - Returns - ------- - String with a summarized representation of the index + def _validate_index_level(self, level): """ - if len(self) > 0: - head = self[0] - if (hasattr(head, 'format') and - not isinstance(head, compat.string_types)): - head = head.format() - tail = self[-1] - if (hasattr(tail, 'format') and - not isinstance(tail, compat.string_types)): - tail = tail.format() - index_summary = ', %s to %s' % (pprint_thing(head), - pprint_thing(tail)) - else: - index_summary = '' + Validate index level. - if name is None: - name = type(self).__name__ - return '%s: %s entries%s' % (name, len(self), index_summary) + For single-level Index getting level number is a no-op, but some + verification must be done like in MultiIndex. - def summary(self, name=None): """ - Return a summarized representation - .. deprecated:: 0.23.0 + if isinstance(level, int): + if level < 0 and level != -1: + raise IndexError("Too many levels: Index has only 1 level," + " %d is not a valid level number" % (level, )) + elif level > 0: + raise IndexError("Too many levels:" + " Index has only 1 level, not %d" % + (level + 1)) + elif level != self.name: + raise KeyError('Level %s must be same as name (%s)' % + (level, self.name)) + + def _get_level_number(self, level): + self._validate_index_level(level) + return 0 + + def sortlevel(self, level=None, ascending=True, sort_remaining=None): """ - warnings.warn("'summary' is deprecated and will be removed in a " - "future version.", FutureWarning, stacklevel=2) - return self._summary(name) + For internal compatibility with with the Index API. - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.values + Sort the Index. 
This is for compat with MultiIndex - _na_value = np.nan - """The expected NA value to use with this index.""" + Parameters + ---------- + ascending : boolean, default True + False to sort in descending order - # introspection - @property - def is_monotonic(self): - """ alias for is_monotonic_increasing (deprecated) """ - return self.is_monotonic_increasing + level, sort_remaining are compat parameters - @property - def is_monotonic_increasing(self): + Returns + ------- + sorted_index : Index """ - return if the index is monotonic increasing (only equal or - increasing) values. + return self.sort_values(return_indexer=True, ascending=ascending) - Examples - -------- - >>> Index([1, 2, 3]).is_monotonic_increasing - True - >>> Index([1, 2, 2]).is_monotonic_increasing - True - >>> Index([1, 3, 2]).is_monotonic_increasing + def _get_level_values(self, level): + """ + Return an Index of values for requested level. + + This is primarily useful to get an individual level of values from a + MultiIndex, but is provided on Index as well for compatability. + + Parameters + ---------- + level : int or str + It is either the integer position or the name of the level. + + Returns + ------- + values : Index + Calling object, as there is only one level in the Index. + + See Also + -------- + MultiIndex.get_level_values : Get values for a level of a MultiIndex. + + Notes + ----- + For Index, level should be 0, since there are no multiple levels. + + Examples + -------- + + >>> idx = pd.Index(list('abc')) + >>> idx + Index(['a', 'b', 'c'], dtype='object') + + Get level values by supplying `level` as integer: + + >>> idx.get_level_values(0) + Index(['a', 'b', 'c'], dtype='object') + """ + self._validate_index_level(level) + return self + + get_level_values = _get_level_values + + def droplevel(self, level=0): + """ + Return index with requested level(s) removed. + + If resulting index has only 1 level left, the result will be + of Index type, not MultiIndex. + + .. versionadded:: 0.23.1 (support for non-MultiIndex) + + Parameters + ---------- + level : int, str, or list-like, default 0 + If a string is given, must be the name of a level + If list-like, elements must be names or indexes of levels. 
+ + Returns + ------- + index : Index or MultiIndex + """ + if not isinstance(level, (tuple, list)): + level = [level] + + levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + + if len(level) == 0: + return self + if len(level) >= self.nlevels: + raise ValueError("Cannot remove {} levels from an index with {} " + "levels: at least one level must be " + "left.".format(len(level), self.nlevels)) + # The two checks above guarantee that here self is a MultiIndex + + new_levels = list(self.levels) + new_labels = list(self.labels) + new_names = list(self.names) + + for i in levnums: + new_levels.pop(i) + new_labels.pop(i) + new_names.pop(i) + + if len(new_levels) == 1: + + # set nan if needed + mask = new_labels[0] == -1 + result = new_levels[0].take(new_labels[0]) + if mask.any(): + result = result.putmask(mask, np.nan) + + result.name = new_names[0] + return result + else: + from .multi import MultiIndex + return MultiIndex(levels=new_levels, labels=new_labels, + names=new_names, verify_integrity=False) + + _index_shared_docs['_get_grouper_for_level'] = """ + Get index grouper corresponding to an index level + + Parameters + ---------- + mapper: Group mapping function or None + Function mapping index values to groups + level : int or None + Index level + + Returns + ------- + grouper : Index + Index of values to group on + labels : ndarray of int or None + Array of locations in level_index + uniques : Index or None + Index of unique values for level + """ + + @Appender(_index_shared_docs['_get_grouper_for_level']) + def _get_grouper_for_level(self, mapper, level=None): + assert level is None or level == 0 + if mapper is None: + grouper = self + else: + grouper = self.map(mapper) + + return grouper, None, None + + # -------------------------------------------------------------------- + # Introspection Methods + + @property + def is_monotonic(self): + """ + Alias for is_monotonic_increasing. + """ + return self.is_monotonic_increasing + + @property + def is_monotonic_increasing(self): + """ + Return if the index is monotonic increasing (only equal or + increasing) values. + + Examples + -------- + >>> Index([1, 2, 3]).is_monotonic_increasing + True + >>> Index([1, 2, 2]).is_monotonic_increasing + True + >>> Index([1, 3, 2]).is_monotonic_increasing False """ return self._engine.is_monotonic_increasing @@ -1516,7 +1604,7 @@ def is_monotonic_increasing(self): @property def is_monotonic_decreasing(self): """ - return if the index is monotonic decreasing (only equal or + Return if the index is monotonic decreasing (only equal or decreasing) values. Examples @@ -1532,8 +1620,9 @@ def is_monotonic_decreasing(self): @property def _is_strictly_monotonic_increasing(self): - """return if the index is strictly monotonic increasing - (only increasing) values + """ + Return if the index is strictly monotonic increasing + (only increasing) values. Examples -------- @@ -1548,8 +1637,9 @@ def _is_strictly_monotonic_increasing(self): @property def _is_strictly_monotonic_decreasing(self): - """return if the index is strictly monotonic decreasing - (only decreasing) values + """ + Return if the index is strictly monotonic decreasing + (only decreasing) values. Examples -------- @@ -1567,7 +1657,9 @@ def is_lexsorted_for_tuple(self, tup): @cache_readonly def is_unique(self): - """ return if the index has unique values """ + """ + Return if the index has unique values. 
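An illustrative sketch of the droplevel paths above (hypothetical data; reprs may vary by version):

>>> import pandas as pd
>>> mi = pd.MultiIndex.from_arrays([[1, 1, 2], ['a', 'b', 'c']], names=['x', 'y'])
>>> mi.droplevel('x')            # only one level left -> plain Index
Index(['a', 'b', 'c'], dtype='object', name='y')
>>> pd.Index([1, 2, 3]).droplevel([])   # empty level list is a no-op
Int64Index([1, 2, 3], dtype='int64')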
+ """ return self._engine.is_unique @property @@ -1634,232 +1726,385 @@ def is_mixed(self): def holds_integer(self): return self.inferred_type in ['integer', 'mixed-integer'] - _index_shared_docs['_convert_scalar_indexer'] = """ - Convert a scalar indexer. - - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + @cache_readonly + def inferred_type(self): + """ + Return a string of the type inferred from the values. + """ + return lib.infer_dtype(self) - @Appender(_index_shared_docs['_convert_scalar_indexer']) - def _convert_scalar_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + @cache_readonly + def is_all_dates(self): + if self._data is None: + return False + return is_datetime_array(ensure_object(self.values)) - if kind == 'iloc': - return self._validate_indexer('positional', key, kind) + # -------------------------------------------------------------------- + # Pickle Methods - if len(self) and not isinstance(self, ABCMultiIndex,): + def __reduce__(self): + d = dict(data=self._data) + d.update(self._get_attributes_dict()) + return _new_Index, (self.__class__, d), None - # we can raise here if we are definitive that this - # is positional indexing (eg. .ix on with a float) - # or label indexing if we are using a type able - # to be represented in the index + def __setstate__(self, state): + """ + Necessary for making this object picklable. + """ - if kind in ['getitem', 'ix'] and is_float(key): - if not self.is_floating(): - return self._invalid_indexer('label', key) + if isinstance(state, dict): + self._data = state.pop('data') + for k, v in compat.iteritems(state): + setattr(self, k, v) - elif kind in ['loc'] and is_float(key): + elif isinstance(state, tuple): - # we want to raise KeyError on string/mixed here - # technically we *could* raise a TypeError - # on anything but mixed though - if self.inferred_type not in ['floating', - 'mixed-integer-float', - 'string', - 'unicode', - 'mixed']: - return self._invalid_indexer('label', key) + if len(state) == 2: + nd_state, own_state = state + data = np.empty(nd_state[1], dtype=nd_state[2]) + np.ndarray.__setstate__(data, nd_state) + self.name = own_state[0] - elif kind in ['loc'] and is_integer(key): - if not self.holds_integer(): - return self._invalid_indexer('label', key) + else: # pragma: no cover + data = np.empty(state) + np.ndarray.__setstate__(data, state) - return key + self._data = data + self._reset_identity() + else: + raise Exception("invalid pickle state") - _index_shared_docs['_convert_slice_indexer'] = """ - Convert a slice indexer. + _unpickle_compat = __setstate__ - By definition, these are labels unless 'iloc' is passed in. - Floats are not allowed as the start, step, or stop of the slice. + # -------------------------------------------------------------------- + # Null Handling Methods - Parameters - ---------- - key : label of the slice bound - kind : {'ix', 'loc', 'getitem', 'iloc'} or None - """ + _na_value = np.nan + """The expected NA value to use with this index.""" - @Appender(_index_shared_docs['_convert_slice_indexer']) - def _convert_slice_indexer(self, key, kind=None): - assert kind in ['ix', 'loc', 'getitem', 'iloc', None] + @cache_readonly + def _isnan(self): + """ + Return if each value is NaN. 
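The pickle support above can be checked roughly like this (illustrative sketch):

>>> import pickle
>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3], name='n')
>>> restored = pickle.loads(pickle.dumps(idx))
>>> restored.equals(idx) and restored.name == idx.name
True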
+ """ + if self._can_hold_na: + return isna(self) + else: + # shouldn't reach to this condition by checking hasnans beforehand + values = np.empty(len(self), dtype=np.bool_) + values.fill(False) + return values - # if we are not a slice, then we are done - if not isinstance(key, slice): - return key + @cache_readonly + def _nan_idxs(self): + if self._can_hold_na: + w, = self._isnan.nonzero() + return w + else: + return np.array([], dtype=np.int64) - # validate iloc - if kind == 'iloc': - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + @cache_readonly + def hasnans(self): + """ + Return if I have any nans; enables various perf speedups. + """ + if self._can_hold_na: + return bool(self._isnan.any()) + else: + return False - # potentially cast the bounds to integers - start, stop, step = key.start, key.stop, key.step + def isna(self): + """ + Detect missing values. - # figure out if this is a positional indexer - def is_int(v): - return v is None or is_integer(v) + Return a boolean same-sized object indicating if the values are NA. + NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get + mapped to ``True`` values. + Everything else get mapped to ``False`` values. Characters such as + empty strings `''` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). - is_null_slicer = start is None and stop is None - is_index_slice = is_int(start) and is_int(stop) - is_positional = is_index_slice and not self.is_integer() + .. versionadded:: 0.20.0 - if kind == 'getitem': - """ - called from the getitem slicers, validate that we are in fact - integers - """ - if self.is_integer() or is_index_slice: - return slice(self._validate_indexer('slice', key.start, kind), - self._validate_indexer('slice', key.stop, kind), - self._validate_indexer('slice', key.step, kind)) + Returns + ------- + numpy.ndarray + A boolean array of whether my values are NA - # convert the slice to an indexer here + See Also + -------- + pandas.Index.notna : Boolean inverse of isna. + pandas.Index.dropna : Omit entries with missing values. + pandas.isna : Top-level isna. + Series.isna : Detect missing values in Series object. - # if we are mixed and have integers - try: - if is_positional and self.is_mixed(): - # Validate start & stop - if start is not None: - self.get_loc(start) - if stop is not None: - self.get_loc(stop) - is_positional = False - except KeyError: - if self.inferred_type == 'mixed-integer-float': - raise + Examples + -------- + Show which entries in a pandas.Index are NA. The result is an + array. - if is_null_slicer: - indexer = key - elif is_positional: - indexer = key - else: - try: - indexer = self.slice_indexer(start, stop, step, kind=kind) - except Exception: - if is_index_slice: - if self.is_integer(): - raise - else: - indexer = key - else: - raise + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.isna() + array([False, False, True], dtype=bool) - return indexer + Empty strings are not considered NA values. None is considered an NA + value. - def _convert_listlike_indexer(self, keyarr, kind=None): + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.isna() + array([False, False, False, True], dtype=bool) + + For datetimes, `NaT` (Not a Time) is considered as an NA value. 
+ + >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), + ... pd.Timestamp(''), None, pd.NaT]) + >>> idx + DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], + dtype='datetime64[ns]', freq=None) + >>> idx.isna() + array([False, True, True, True], dtype=bool) + """ + return self._isnan + isnull = isna + + def notna(self): + """ + Detect existing (non-missing) values. + + Return a boolean same-sized object indicating if the values are not NA. + Non-missing values get mapped to ``True``. Characters such as empty + strings ``''`` or :attr:`numpy.inf` are not considered NA values + (unless you set ``pandas.options.mode.use_inf_as_na = True``). + NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` + values. + + .. versionadded:: 0.20.0 + + Returns + ------- + numpy.ndarray + Boolean array to indicate which entries are not NA. + + See Also + -------- + Index.notnull : Alias of notna. + Index.isna: Inverse of notna. + pandas.notna : Top-level notna. + + Examples + -------- + Show which entries in an Index are not NA. The result is an + array. + + >>> idx = pd.Index([5.2, 6.0, np.NaN]) + >>> idx + Float64Index([5.2, 6.0, nan], dtype='float64') + >>> idx.notna() + array([ True, True, False]) + + Empty strings are not considered NA values. None is considered a NA + value. + + >>> idx = pd.Index(['black', '', 'red', None]) + >>> idx + Index(['black', '', 'red', None], dtype='object') + >>> idx.notna() + array([ True, True, True, False]) """ + return ~self.isna() + notnull = notna + + _index_shared_docs['fillna'] = """ + Fill NA/NaN values with the specified value + Parameters ---------- - keyarr : list-like - Indexer to convert. + value : scalar + Scalar value to use to fill holes (e.g. 0). + This value cannot be a list-likes. + downcast : dict, default is None + a dict of item->dtype of what to downcast if possible, + or the string 'infer' which will try to downcast to an appropriate + equal type (e.g. float64 to int64 if possible) Returns ------- - tuple (indexer, keyarr) - indexer is an ndarray or None if cannot convert - keyarr are tuple-safe keys + filled : %(klass)s """ - if isinstance(keyarr, Index): - keyarr = self._convert_index_indexer(keyarr) - else: - keyarr = self._convert_arr_indexer(keyarr) - indexer = self._convert_list_indexer(keyarr, kind=kind) - return indexer, keyarr + @Appender(_index_shared_docs['fillna']) + def fillna(self, value=None, downcast=None): + self._assert_can_do_op(value) + if self.hasnans: + result = self.putmask(self._isnan, value) + if downcast is None: + # no need to care metadata other than name + # because it can't have freq if + return Index(result, name=self.name) + return self._shallow_copy() - _index_shared_docs['_convert_arr_indexer'] = """ - Convert an array-like indexer to the appropriate dtype. + _index_shared_docs['dropna'] = """ + Return Index without NA/NaN values Parameters ---------- - keyarr : array-like - Indexer to convert. + how : {'any', 'all'}, default 'any' + If the Index is a MultiIndex, drop the value when any or all levels + are NaN. 
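A short sketch of the NA-handling methods collected above (made-up data; reprs are approximate and version-dependent):

>>> import numpy as np
>>> import pandas as pd
>>> idx = pd.Index([1.0, np.nan, 3.0])
>>> idx.hasnans
True
>>> idx.fillna(0)
Float64Index([1.0, 0.0, 3.0], dtype='float64')
>>> idx.dropna()
Float64Index([1.0, 3.0], dtype='float64')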
Returns ------- - converted_keyarr : array-like - """ + valid : Index + """ - @Appender(_index_shared_docs['_convert_arr_indexer']) - def _convert_arr_indexer(self, keyarr): - keyarr = com.asarray_tuplesafe(keyarr) - return keyarr + @Appender(_index_shared_docs['dropna']) + def dropna(self, how='any'): + if how not in ('any', 'all'): + raise ValueError("invalid how option: {0}".format(how)) - _index_shared_docs['_convert_index_indexer'] = """ - Convert an Index indexer to the appropriate dtype. + if self.hasnans: + return self._shallow_copy(self.values[~self._isnan]) + return self._shallow_copy() + + # -------------------------------------------------------------------- + # Uniqueness Methods + + _index_shared_docs['index_unique'] = ( + """ + Return unique values in the index. Uniques are returned in order + of appearance, this does NOT sort. Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. + level : int or str, optional, default None + Only return values from specified level (for MultiIndex) + + .. versionadded:: 0.23.0 Returns ------- - converted_keyarr : Index (or sub-class) - """ + Index without duplicates - @Appender(_index_shared_docs['_convert_index_indexer']) - def _convert_index_indexer(self, keyarr): - return keyarr + See Also + -------- + unique + Series.unique + """) - _index_shared_docs['_convert_list_indexer'] = """ - Convert a list-like indexer to the appropriate dtype. + @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + result = super(Index, self).unique() + return self._shallow_copy(result) + + def drop_duplicates(self, keep='first'): + """ + Return Index with duplicate values removed. Parameters ---------- - keyarr : Index (or sub-class) - Indexer to convert. - kind : iloc, ix, loc, optional + keep : {'first', 'last', ``False``}, default 'first' + - 'first' : Drop duplicates except for the first occurrence. + - 'last' : Drop duplicates except for the last occurrence. + - ``False`` : Drop all duplicates. Returns ------- - positional indexer or None - """ + deduplicated : Index - @Appender(_index_shared_docs['_convert_list_indexer']) - def _convert_list_indexer(self, keyarr, kind=None): - if (kind in [None, 'iloc', 'ix'] and - is_integer_dtype(keyarr) and not self.is_floating() and - not isinstance(keyarr, ABCPeriodIndex)): + See Also + -------- + Series.drop_duplicates : Equivalent method on Series. + DataFrame.drop_duplicates : Equivalent method on DataFrame. + Index.duplicated : Related method on Index, indicating duplicate + Index values. - if self.inferred_type == 'mixed-integer': - indexer = self.get_indexer(keyarr) - if (indexer >= 0).all(): - return indexer - # missing values are flagged as -1 by get_indexer and negative - # indices are already converted to positive indices in the - # above if-statement, so the negative flags are changed to - # values outside the range of indices so as to trigger an - # IndexError in maybe_convert_indices - indexer[indexer < 0] = len(self) - from pandas.core.indexing import maybe_convert_indices - return maybe_convert_indices(indexer, len(self)) + Examples + -------- + Generate an pandas.Index with duplicate values. - elif not self.inferred_type == 'integer': - keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) - return keyarr + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - return None + The `keep` parameter controls which duplicate values are removed. 
+ The value 'first' keeps the first occurrence for each + set of duplicated entries. The default value of keep is 'first'. - def _invalid_indexer(self, form, key): - """ consistent invalid indexer message """ - raise TypeError("cannot do {form} indexing on {klass} with these " - "indexers [{key}] of {kind}".format( - form=form, klass=type(self), key=key, - kind=type(key))) + >>> idx.drop_duplicates(keep='first') + Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') + + The value 'last' keeps the last occurrence for each set of duplicated + entries. + + >>> idx.drop_duplicates(keep='last') + Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') + + The value ``False`` discards all sets of duplicated entries. + + >>> idx.drop_duplicates(keep=False) + Index(['cow', 'beetle', 'hippo'], dtype='object') + """ + return super(Index, self).drop_duplicates(keep=keep) + + def duplicated(self, keep='first'): + """ + Indicate duplicate index values. + + Duplicated values are indicated as ``True`` values in the resulting + array. Either all duplicates, all except the first, or all except the + last occurrence of duplicates can be indicated. + + Parameters + ---------- + keep : {'first', 'last', False}, default 'first' + The value or values in a set of duplicates to mark as missing. + + - 'first' : Mark duplicates as ``True`` except for the first + occurrence. + - 'last' : Mark duplicates as ``True`` except for the last + occurrence. + - ``False`` : Mark all duplicates as ``True``. + + Examples + -------- + By default, for each set of duplicated values, the first occurrence is + set to False and all others to True: + + >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) + >>> idx.duplicated() + array([False, False, True, False, True]) + + which is equivalent to + + >>> idx.duplicated(keep='first') + array([False, False, True, False, True]) + + By using 'last', the last occurrence of each set of duplicated values + is set on False and all others on True: + + >>> idx.duplicated(keep='last') + array([ True, False, True, False, False]) + + By setting keep on ``False``, all duplicates are True: + + >>> idx.duplicated(keep=False) + array([ True, False, True, False, True]) + + Returns + ------- + numpy.ndarray + + See Also + -------- + pandas.Series.duplicated : Equivalent method on pandas.Series. + pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. + pandas.Index.drop_duplicates : Remove duplicate values from Index. + """ + return super(Index, self).duplicated(keep=keep) def get_duplicates(self): """ @@ -1920,91 +2165,65 @@ def get_duplicates(self): return self[self.duplicated()].unique() - def _cleanup(self): - self._engine.clear_mapping() - - @cache_readonly - def _constructor(self): - return type(self) - - @cache_readonly - def _engine(self): - # property, for now, slow to look up - return self._engine_type(lambda: self._ndarray_values, len(self)) - - def _validate_index_level(self, level): + def _get_unique_index(self, dropna=False): """ - Validate index level. + Returns an index containing unique values. - For single-level Index getting level number is a no-op, but some - verification must be done like in MultiIndex. + Parameters + ---------- + dropna : bool + If True, NaN values are dropped. 
+ Returns + ------- + uniques : index """ - if isinstance(level, int): - if level < 0 and level != -1: - raise IndexError("Too many levels: Index has only 1 level," - " %d is not a valid level number" % (level, )) - elif level > 0: - raise IndexError("Too many levels:" - " Index has only 1 level, not %d" % - (level + 1)) - elif level != self.name: - raise KeyError('Level %s must be same as name (%s)' % - (level, self.name)) + if self.is_unique and not dropna: + return self - def _get_level_number(self, level): - self._validate_index_level(level) - return 0 + values = self.values - @cache_readonly - def inferred_type(self): - """ return a string of the type inferred from the values """ - return lib.infer_dtype(self) + if not self.is_unique: + values = self.unique() - def _is_memory_usage_qualified(self): - """ return a boolean if we need a qualified .info display """ - return self.is_object() + if dropna: + try: + if self.hasnans: + values = values[~isna(values)] + except NotImplementedError: + pass - def is_type_compatible(self, kind): - return kind == self.inferred_type + return self._shallow_copy(values) - @cache_readonly - def is_all_dates(self): - if self._data is None: - return False - return is_datetime_array(ensure_object(self.values)) + # -------------------------------------------------------------------- + # Arithmetic & Logical Methods - def __reduce__(self): - d = dict(data=self._data) - d.update(self._get_attributes_dict()) - return _new_Index, (self.__class__, d), None + def __add__(self, other): + if isinstance(other, (ABCSeries, ABCDataFrame)): + return NotImplemented + return Index(np.array(self) + other) - def __setstate__(self, state): - """Necessary for making this object picklable""" + def __radd__(self, other): + return Index(other + np.array(self)) - if isinstance(state, dict): - self._data = state.pop('data') - for k, v in compat.iteritems(state): - setattr(self, k, v) + def __iadd__(self, other): + # alias for __add__ + return self + other - elif isinstance(state, tuple): + def __sub__(self, other): + return Index(np.array(self) - other) - if len(state) == 2: - nd_state, own_state = state - data = np.empty(nd_state[1], dtype=nd_state[2]) - np.ndarray.__setstate__(data, nd_state) - self.name = own_state[0] + def __rsub__(self, other): + return Index(other - np.array(self)) - else: # pragma: no cover - data = np.empty(state) - np.ndarray.__setstate__(data, state) + def __and__(self, other): + return self.intersection(other) - self._data = data - self._reset_identity() - else: - raise Exception("invalid pickle state") + def __or__(self, other): + return self.union(other) - _unpickle_compat = __setstate__ + def __xor__(self, other): + return self.symmetric_difference(other) def __nonzero__(self): raise ValueError("The truth value of a {0} is ambiguous. " @@ -2013,2201 +2232,2319 @@ def __nonzero__(self): __bool__ = __nonzero__ - _index_shared_docs['__contains__'] = """ - return a boolean if this key is IN the index - - Parameters - ---------- - key : object + # -------------------------------------------------------------------- + # Set Operation Methods - Returns - ------- - boolean + def _get_reconciled_name_object(self, other): """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. 
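The arithmetic and logical dunders above behave roughly as follows (illustrative):

>>> import pandas as pd
>>> idx = pd.Index([1, 2, 3])
>>> idx + 1                      # element-wise, returns a new Index
Int64Index([2, 3, 4], dtype='int64')
>>> idx & pd.Index([2, 3, 4])    # __and__ delegates to intersection
Int64Index([2, 3], dtype='int64')
>>> idx ^ pd.Index([2, 3, 4])    # __xor__ delegates to symmetric_difference
Int64Index([1, 4], dtype='int64')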
+ """ + name = get_op_result_name(self, other) + if self.name != name: + return self._shallow_copy(name=name) + return self - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) - def __contains__(self, key): - hash(key) - try: - return key in self._engine - except (OverflowError, TypeError, ValueError): - return False - - _index_shared_docs['contains'] = """ - return a boolean if this key is IN the index + def union(self, other): + """ + Form the union of two Index objects and sorts if possible. Parameters ---------- - key : object + other : Index or array-like Returns ------- - boolean - """ + union : Index - @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) - def contains(self, key): - hash(key) - try: - return key in self._engine - except (TypeError, ValueError): - return False + Examples + -------- - def __hash__(self): - raise TypeError("unhashable type: %r" % type(self).__name__) + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - def __setitem__(self, key, value): - raise TypeError("Index does not support mutable operations") + if len(other) == 0 or self.equals(other): + return self._get_reconciled_name_object(other) - def __getitem__(self, key): - """ - Override numpy.ndarray's __getitem__ method to work as desired. + if len(self) == 0: + return other._get_reconciled_name_object(self) - This function adds lists and Series as valid boolean indexers - (ndarrays only supports ndarray with dtype=bool). + # TODO: is_dtype_union_equal is a hack around + # 1. buggy set ops with duplicates (GH #13432) + # 2. CategoricalIndex lacking setops (GH #10186) + # Once those are fixed, this workaround can be removed + if not is_dtype_union_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.union(other) - If resulting ndim != 1, plain ndarray is returned instead of - corresponding `Index` subclass. + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self) or is_datetime64tz_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other) or is_datetime64tz_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - """ - # There's no custom logic to be implemented in __getslice__, so it's - # not overloaded intentionally. - getitem = self._data.__getitem__ - promote = self._shallow_copy + if self.is_monotonic and other.is_monotonic: + try: + result = self._outer_indexer(lvals, rvals)[0] + except TypeError: + # incomparable objects + result = list(lvals) - if is_scalar(key): - key = com.cast_scalar_indexer(key) - return getitem(key) + # worth making this faster? a very unusual case + value_set = set(lvals) + result.extend([x for x in rvals if x not in value_set]) + else: + indexer = self.get_indexer(other) + indexer, = (indexer == -1).nonzero() - if isinstance(key, slice): - # This case is separated from the conditional above to avoid - # pessimization of basic indexing. 
- return promote(getitem(key)) + if len(indexer) > 0: + other_diff = algos.take_nd(rvals, indexer, + allow_fill=False) + result = _concat._concat_compat((lvals, other_diff)) - if com.is_bool_indexer(key): - key = np.asarray(key, dtype=bool) + try: + lvals[0] < other_diff[0] + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) + else: + types = frozenset((self.inferred_type, + other.inferred_type)) + if not types & _unsortable_types: + result.sort() - key = com.values_from_object(key) - result = getitem(key) - if not is_scalar(result): - return promote(result) - else: - return result + else: + result = lvals - def _can_hold_identifiers_and_holds_name(self, name): - """ - Faster check for ``name in self`` when we know `name` is a Python - identifier (e.g. in NDFrame.__getattr__, which hits this to support - . key lookup). For indexes that can't hold identifiers (everything - but object & categorical) we just return False. + try: + result = np.sort(result) + except TypeError as e: + warnings.warn("%s, sort order is undefined for " + "incomparable objects" % e, RuntimeWarning, + stacklevel=3) - https://github.com/pandas-dev/pandas/issues/19764 - """ - if self.is_object() or self.is_categorical(): - return name in self - return False + # for subclasses + return self._wrap_setop_result(other, result) - def append(self, other): + def _wrap_setop_result(self, other, result): + return self._constructor(result, name=get_op_result_name(self, other)) + + def intersection(self, other): """ - Append a collection of Index options together + Form the intersection of two Index objects. + + This returns a new Index with elements common to the index and `other`, + preserving the order of the calling index. 
Parameters ---------- - other : Index or list/tuple of indices + other : Index or array-like Returns ------- - appended : Index - """ + intersection : Index - to_concat = [self] + Examples + -------- - if isinstance(other, (list, tuple)): - to_concat = to_concat + list(other) - else: - to_concat.append(other) + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + """ + self._assert_can_do_setop(other) + other = ensure_index(other) - for obj in to_concat: - if not isinstance(obj, Index): - raise TypeError('all inputs must be Index') + if self.equals(other): + return self._get_reconciled_name_object(other) - names = {obj.name for obj in to_concat} - name = None if len(names) > 1 else self.name + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.intersection(other) - return self._concat(to_concat, name) + # TODO(EA): setops-refactor, clean all this up + if is_period_dtype(self): + lvals = self._ndarray_values + else: + lvals = self._values + if is_period_dtype(other): + rvals = other._ndarray_values + else: + rvals = other._values - def _concat(self, to_concat, name): + if self.is_monotonic and other.is_monotonic: + try: + result = self._inner_indexer(lvals, rvals)[0] + return self._wrap_setop_result(other, result) + except TypeError: + pass - typs = _concat.get_dtype_kinds(to_concat) + try: + indexer = Index(rvals).get_indexer(lvals) + indexer = indexer.take((indexer != -1).nonzero()[0]) + except Exception: + # duplicates + indexer = algos.unique1d( + Index(rvals).get_indexer_non_unique(lvals)[0]) + indexer = indexer[indexer != -1] - if len(typs) == 1: - return self._concat_same_dtype(to_concat, name=name) - return _concat._concat_index_asobject(to_concat, name=name) + taken = other.take(indexer) + if self.name != other.name: + taken.name = None + return taken - def _concat_same_dtype(self, to_concat, name): - """ - Concatenate to_concat which has the same class + def difference(self, other, sort=True): """ - # must be overridden in specific classes - return _concat._concat_index_asobject(to_concat, name) - - _index_shared_docs['take'] = """ - return a new %(klass)s of the values selected by the indices + Return a new Index with elements from the index that are not in + `other`. - For internal compatibility with numpy arrays. + This is the set difference of two Index objects. Parameters ---------- - indices : list - Indices to be taken - axis : int, optional - The axis over which to select values, always 0. - allow_fill : bool, default True - fill_value : bool, default None - If allow_fill=True and fill_value is not None, indices specified by - -1 is regarded as NA. If Index doesn't hold NA, raise ValueError + other : Index or array-like + sort : bool, default True + Sort the resulting index if possible - See Also + .. 
versionadded:: 0.24.0 + + Returns + ------- + difference : Index + + Examples -------- - numpy.ndarray.take + + >>> idx1 = pd.Index([2, 1, 3, 4]) + >>> idx2 = pd.Index([3, 4, 5, 6]) + >>> idx1.difference(idx2) + Int64Index([1, 2], dtype='int64') + >>> idx1.difference(idx2, sort=False) + Int64Index([2, 1], dtype='int64') """ + self._assert_can_do_setop(other) - @Appender(_index_shared_docs['take'] % _index_doc_kwargs) - def take(self, indices, axis=0, allow_fill=True, - fill_value=None, **kwargs): - if kwargs: - nv.validate_take(tuple(), kwargs) - indices = ensure_platform_int(indices) - if self._can_hold_na: - taken = self._assert_take_fillable(self.values, indices, - allow_fill=allow_fill, - fill_value=fill_value, - na_value=self._na_value) - else: - if allow_fill and fill_value is not None: - msg = 'Unable to fill values because {0} cannot contain NA' - raise ValueError(msg.format(self.__class__.__name__)) - taken = self.values.take(indices) - return self._shallow_copy(taken) + if self.equals(other): + # pass an empty np.ndarray with the appropriate dtype + return self._shallow_copy(self._data[:0]) - def _assert_take_fillable(self, values, indices, allow_fill=True, - fill_value=None, na_value=np.nan): - """ Internal method to handle NA filling of take """ - indices = ensure_platform_int(indices) + other, result_name = self._convert_can_do_setop(other) - # only fill if we are passing a non-None fill_value - if allow_fill and fill_value is not None: - if (indices < -1).any(): - msg = ('When allow_fill=True and fill_value is not None, ' - 'all indices must be >= -1') - raise ValueError(msg) - taken = algos.take(values, - indices, - allow_fill=allow_fill, - fill_value=na_value) - else: - taken = values.take(indices) - return taken + this = self._get_unique_index() - @cache_readonly - def _isnan(self): - """ return if each value is nan""" - if self._can_hold_na: - return isna(self) - else: - # shouldn't reach to this condition by checking hasnans beforehand - values = np.empty(len(self), dtype=np.bool_) - values.fill(False) - return values + indexer = this.get_indexer(other) + indexer = indexer.take((indexer != -1).nonzero()[0]) - @cache_readonly - def _nan_idxs(self): - if self._can_hold_na: - w, = self._isnan.nonzero() - return w - else: - return np.array([], dtype=np.int64) + label_diff = np.setdiff1d(np.arange(this.size), indexer, + assume_unique=True) + the_diff = this.values.take(label_diff) + if sort: + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - @cache_readonly - def hasnans(self): - """ - Return if I have any nans; enables various perf speedups. - """ - if self._can_hold_na: - return bool(self._isnan.any()) - else: - return False + return this._shallow_copy(the_diff, name=result_name, freq=None) - def isna(self): + def symmetric_difference(self, other, result_name=None): """ - Detect missing values. + Compute the symmetric difference of two Index objects. - Return a boolean same-sized object indicating if the values are NA. - NA values, such as ``None``, :attr:`numpy.NaN` or :attr:`pd.NaT`, get - mapped to ``True`` values. - Everything else get mapped to ``False`` values. Characters such as - empty strings `''` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). + It's sorted if sorting is possible. - .. 
versionadded:: 0.20.0 + Parameters + ---------- + other : Index or array-like + result_name : str Returns ------- - numpy.ndarray - A boolean array of whether my values are NA + symmetric_difference : Index - See Also - -------- - pandas.Index.notna : Boolean inverse of isna. - pandas.Index.dropna : Omit entries with missing values. - pandas.isna : Top-level isna. - Series.isna : Detect missing values in Series object. + Notes + ----- + ``symmetric_difference`` contains elements that appear in either + ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by + ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates + dropped. Examples -------- - Show which entries in a pandas.Index are NA. The result is an - array. + >>> idx1 = pd.Index([1, 2, 3, 4]) + >>> idx2 = pd.Index([2, 3, 4, 5]) + >>> idx1.symmetric_difference(idx2) + Int64Index([1, 5], dtype='int64') - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.isna() - array([False, False, True], dtype=bool) + You can also use the ``^`` operator: - Empty strings are not considered NA values. None is considered an NA - value. + >>> idx1 ^ idx2 + Int64Index([1, 5], dtype='int64') + """ + self._assert_can_do_setop(other) + other, result_name_update = self._convert_can_do_setop(other) + if result_name is None: + result_name = result_name_update - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.isna() - array([False, False, False, True], dtype=bool) + this = self._get_unique_index() + other = other._get_unique_index() + indexer = this.get_indexer(other) - For datetimes, `NaT` (Not a Time) is considered as an NA value. + # {this} minus {other} + common_indexer = indexer.take((indexer != -1).nonzero()[0]) + left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, + assume_unique=True) + left_diff = this.values.take(left_indexer) - >>> idx = pd.DatetimeIndex([pd.Timestamp('1940-04-25'), - ... pd.Timestamp(''), None, pd.NaT]) - >>> idx - DatetimeIndex(['1940-04-25', 'NaT', 'NaT', 'NaT'], - dtype='datetime64[ns]', freq=None) - >>> idx.isna() - array([False, True, True, True], dtype=bool) - """ - return self._isnan - isnull = isna + # {other} minus {this} + right_indexer = (indexer == -1).nonzero()[0] + right_diff = other.values.take(right_indexer) - def notna(self): - """ - Detect existing (non-missing) values. + the_diff = _concat._concat_compat([left_diff, right_diff]) + try: + the_diff = sorting.safe_sort(the_diff) + except TypeError: + pass - Return a boolean same-sized object indicating if the values are not NA. - Non-missing values get mapped to ``True``. Characters such as empty - strings ``''`` or :attr:`numpy.inf` are not considered NA values - (unless you set ``pandas.options.mode.use_inf_as_na = True``). - NA values, such as None or :attr:`numpy.NaN`, get mapped to ``False`` - values. + attribs = self._get_attributes_dict() + attribs['name'] = result_name + if 'freq' in attribs: + attribs['freq'] = None + return self._shallow_copy_with_infer(the_diff, **attribs) - .. versionadded:: 0.20.0 + def _assert_can_do_setop(self, other): + if not is_list_like(other): + raise TypeError('Input must be Index or array-like') + return True - Returns - ------- - numpy.ndarray - Boolean array to indicate which entries are not NA. 
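Name handling across the set operations above, sketched with hypothetical inputs (reprs may vary by version):

>>> import pandas as pd
>>> a = pd.Index([1, 2, 3], name='x')
>>> b = pd.Index([2, 3, 4], name='y')
>>> a.union(b)                   # differing names -> result name is dropped
Int64Index([1, 2, 3, 4], dtype='int64')
>>> a.difference(b, sort=False)  # new keyword; keeps the calling index's order
Int64Index([1], dtype='int64')
>>> a.symmetric_difference(b, result_name='z')
Int64Index([1, 4], dtype='int64', name='z')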
+ def _convert_can_do_setop(self, other): + if not isinstance(other, Index): + other = Index(other, name=self.name) + result_name = self.name + else: + result_name = get_op_result_name(self, other) + return other, result_name - See Also - -------- - Index.notnull : Alias of notna. - Index.isna: Inverse of notna. - pandas.notna : Top-level notna. + # -------------------------------------------------------------------- + # Indexing Methods - Examples - -------- - Show which entries in an Index are not NA. The result is an - array. + _index_shared_docs['get_loc'] = """ + Get integer location, slice or boolean mask for requested label. - >>> idx = pd.Index([5.2, 6.0, np.NaN]) - >>> idx - Float64Index([5.2, 6.0, nan], dtype='float64') - >>> idx.notna() - array([ True, True, False]) + Parameters + ---------- + key : label + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + tolerance : optional + Maximum distance from index value for inexact matches. The value of + the index at the matching location most satisfy the equation + ``abs(index[loc] - key) <= tolerance``. - Empty strings are not considered NA values. None is considered a NA - value. + Tolerance may be a scalar + value, which applies the same tolerance to all values, or + list-like, which applies variable tolerance per element. List-like + includes list, tuple, array, Series, and must be the same size as + the index and its dtype must exactly match the index's type. - >>> idx = pd.Index(['black', '', 'red', None]) - >>> idx - Index(['black', '', 'red', None], dtype='object') - >>> idx.notna() - array([ True, True, True, False]) - """ - return ~self.isna() - notnull = notna + .. 
versionadded:: 0.21.0 (list-like tolerance) - def putmask(self, mask, value): - """ - return a new Index of the values set with the mask + Returns + ------- + loc : int if unique index, slice if monotonic index, else mask - See Also - -------- - numpy.ndarray.putmask - """ - values = self.values.copy() - try: - np.putmask(values, mask, self._convert_for_op(value)) - return self._shallow_copy(values) - except (ValueError, TypeError) as err: - if is_object_dtype(self): - raise err + Examples + --------- + >>> unique_index = pd.Index(list('abc')) + >>> unique_index.get_loc('b') + 1 - # coerces to object - return self.astype(object).putmask(mask, value) + >>> monotonic_index = pd.Index(list('abbc')) + >>> monotonic_index.get_loc('b') + slice(1, 3, None) - def format(self, name=False, formatter=None, **kwargs): - """ - Render a string representation of the Index + >>> non_monotonic_index = pd.Index(list('abcb')) + >>> non_monotonic_index.get_loc('b') + array([False, True, False, True], dtype=bool) """ - header = [] - if name: - header.append(pprint_thing(self.name, - escape_chars=('\t', '\r', '\n')) if - self.name is not None else '') - - if formatter is not None: - return header + list(self.map(formatter)) - return self._format_with_header(header, **kwargs) + @Appender(_index_shared_docs['get_loc']) + def get_loc(self, key, method=None, tolerance=None): + if method is None: + if tolerance is not None: + raise ValueError('tolerance argument only valid if using pad, ' + 'backfill or nearest lookups') + try: + return self._engine.get_loc(key) + except KeyError: + return self._engine.get_loc(self._maybe_cast_indexer(key)) + indexer = self.get_indexer([key], method=method, tolerance=tolerance) + if indexer.ndim > 1 or indexer.size > 1: + raise TypeError('get_loc requires scalar valued input') + loc = indexer.item() + if loc == -1: + raise KeyError(key) + return loc - def _format_with_header(self, header, na_rep='NaN', **kwargs): - values = self.values - - from pandas.io.formats.format import format_array + _index_shared_docs['get_indexer'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. - if is_categorical_dtype(values.dtype): - values = np.array(values) + Parameters + ---------- + target : %(target_klass)s + method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional + * default: exact matches only. + * pad / ffill: find the PREVIOUS index value if no exact match. + * backfill / bfill: use NEXT index value if no exact match + * nearest: use the NEAREST index value if no exact match. Tied + distances are broken by preferring the larger index value. + limit : int, optional + Maximum number of consecutive labels in ``target`` to match for + inexact matches. + tolerance : optional + Maximum distance between original and new labels for inexact + matches. The values of the index at the matching locations most + satisfy the equation ``abs(index[indexer] - target) <= tolerance``. - elif is_object_dtype(values.dtype): - values = lib.maybe_convert_objects(values, safe=1) + Tolerance may be a scalar value, which applies the same tolerance + to all values, or list-like, which applies variable tolerance per + element. List-like includes list, tuple, array, Series, and must be + the same size as the index and its dtype must exactly match the + index's type. 
- if is_object_dtype(values.dtype): - result = [pprint_thing(x, escape_chars=('\t', '\r', '\n')) - for x in values] + .. versionadded:: 0.21.0 (list-like tolerance) - # could have nans - mask = isna(values) - if mask.any(): - result = np.array(result) - result[mask] = na_rep - result = result.tolist() + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. - else: - result = _trim_front(format_array(values, None, justify='left')) - return header + result + Examples + -------- + >>> index = pd.Index(['c', 'a', 'b']) + >>> index.get_indexer(['a', 'b', 'x']) + array([ 1, 2, -1]) - def to_native_types(self, slicer=None, **kwargs): + Notice that the return value is an array of locations in ``index`` + and ``x`` is marked by -1, as it is not in ``index``. """ - Format specified values of `self` and return them. - Parameters - ---------- - slicer : int, array-like - An indexer into `self` that specifies which values - are used in the formatting process. - kwargs : dict - Options for specifying how the values should be formatted. - These options include the following: + @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) + def get_indexer(self, target, method=None, limit=None, tolerance=None): + method = missing.clean_reindex_fill_method(method) + target = ensure_index(target) + if tolerance is not None: + tolerance = self._convert_tolerance(tolerance, target) - 1) na_rep : str - The value that serves as a placeholder for NULL values - 2) quoting : bool or None - Whether or not there are quoted values in `self` - 3) date_format : str - The format used to represent date-like values - """ + # Treat boolean labels passed to a numeric index as not found. Without + # this fix False and True would be treated as 0 and 1 respectively. 
+ # (GH #16877) + if target.is_boolean() and self.is_numeric(): + return ensure_platform_int(np.repeat(-1, target.size)) - values = self - if slicer is not None: - values = values[slicer] - return values._format_native_types(**kwargs) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer(ptarget, method=method, limit=limit, + tolerance=tolerance) - def _format_native_types(self, na_rep='', quoting=None, **kwargs): - """ actually format my specific types """ - mask = isna(self) - if not self.is_object() and not quoting: - values = np.asarray(self).astype(str) + if not is_dtype_equal(self.dtype, target.dtype): + this = self.astype(object) + target = target.astype(object) + return this.get_indexer(target, method=method, limit=limit, + tolerance=tolerance) + + if not self.is_unique: + raise InvalidIndexError('Reindexing only valid with uniquely' + ' valued Index objects') + + if method == 'pad' or method == 'backfill': + indexer = self._get_fill_indexer(target, method, limit, tolerance) + elif method == 'nearest': + indexer = self._get_nearest_indexer(target, limit, tolerance) else: - values = np.array(self, dtype=object, copy=True) + if tolerance is not None: + raise ValueError('tolerance argument only valid if doing pad, ' + 'backfill or nearest reindexing') + if limit is not None: + raise ValueError('limit argument only valid if doing pad, ' + 'backfill or nearest reindexing') - values[mask] = na_rep - return values + indexer = self._engine.get_indexer(target._ndarray_values) - def equals(self, other): + return ensure_platform_int(indexer) + + def _convert_tolerance(self, tolerance, target): + # override this method on subclasses + tolerance = np.asarray(tolerance) + if target.size != tolerance.size and tolerance.size > 1: + raise ValueError('list-like tolerance size must match ' + 'target index size') + return tolerance + + def _get_fill_indexer(self, target, method, limit=None, tolerance=None): + if self.is_monotonic_increasing and target.is_monotonic_increasing: + method = (self._engine.get_pad_indexer if method == 'pad' else + self._engine.get_backfill_indexer) + indexer = method(target._ndarray_values, limit) + else: + indexer = self._get_fill_indexer_searchsorted(target, method, + limit) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target._ndarray_values, + indexer, + tolerance) + return indexer + + def _get_fill_indexer_searchsorted(self, target, method, limit=None): """ - Determines if two Index objects contain the same elements. + Fallback pad/backfill get_indexer that works for monotonic decreasing + indexes and non-monotonic targets. 
""" - if self.is_(other): - return True - - if not isinstance(other, Index): - return False + if limit is not None: + raise ValueError('limit argument for %r method only well-defined ' + 'if index and target are monotonic' % method) - if is_object_dtype(self) and not is_object_dtype(other): - # if other is not object, use other's logic for coercion - return other.equals(self) + side = 'left' if method == 'pad' else 'right' - try: - return array_equivalent(com.values_from_object(self), - com.values_from_object(other)) - except Exception: - return False + # find exact matches first (this simplifies the algorithm) + indexer = self.get_indexer(target) + nonexact = (indexer == -1) + indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], + side) + if side == 'left': + # searchsorted returns "indices into a sorted array such that, + # if the corresponding elements in v were inserted before the + # indices, the order of a would be preserved". + # Thus, we need to subtract 1 to find values to the left. + indexer[nonexact] -= 1 + # This also mapped not found values (values of 0 from + # np.searchsorted) to -1, which conveniently is also our + # sentinel for missing values + else: + # Mark indices to the right of the largest value as not found + indexer[indexer == len(self)] = -1 + return indexer - def identical(self, other): - """Similar to equals, but check that other comparable attributes are - also equal + def _get_nearest_indexer(self, target, limit, tolerance): """ - return (self.equals(other) and - all((getattr(self, c, None) == getattr(other, c, None) - for c in self._comparables)) and - type(self) == type(other)) - - def asof(self, label): + Get the indexer for the nearest index labels; requires an index with + values that can be subtracted from each other (e.g., not strings or + tuples). """ - Return the label from the index, or, if not present, the previous one. + left_indexer = self.get_indexer(target, 'pad', limit=limit) + right_indexer = self.get_indexer(target, 'backfill', limit=limit) - Assuming that the index is sorted, return the passed index label if it - is in the index, or return the previous index label if the passed one - is not in the index. + target = np.asarray(target) + left_distances = abs(self.values[left_indexer] - target) + right_distances = abs(self.values[right_indexer] - target) - Parameters - ---------- - label : object - The label up to which the method returns the latest index label. + op = operator.lt if self.is_monotonic_increasing else operator.le + indexer = np.where(op(left_distances, right_distances) | + (right_indexer == -1), left_indexer, right_indexer) + if tolerance is not None: + indexer = self._filter_indexer_tolerance(target, indexer, + tolerance) + return indexer - Returns - ------- - object - The passed label if it is in the index. The previous label if the - passed label is not in the sorted index or `NaN` if there is no - such label. + def _filter_indexer_tolerance(self, target, indexer, tolerance): + distance = abs(self.values[indexer] - target) + indexer = np.where(distance <= tolerance, indexer, -1) + return indexer - See Also - -------- - Series.asof : Return the latest value in a Series up to the - passed index. - merge_asof : Perform an asof merge (similar to left join but it - matches on nearest key rather than equal key). - Index.get_loc : An `asof` is a thin wrapper around `get_loc` - with method='pad'. 
+ # -------------------------------------------------------------------- + # Indexer Conversion Methods - Examples - -------- - `Index.asof` returns the latest index label up to the passed label. + _index_shared_docs['_convert_scalar_indexer'] = """ + Convert a scalar indexer. - >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) - >>> idx.asof('2014-01-01') - '2013-12-31' + Parameters + ---------- + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - If the label is in the index, the method returns the passed label. + @Appender(_index_shared_docs['_convert_scalar_indexer']) + def _convert_scalar_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - >>> idx.asof('2014-01-02') - '2014-01-02' + if kind == 'iloc': + return self._validate_indexer('positional', key, kind) - If all of the labels in the index are later than the passed label, - NaN is returned. - - >>> idx.asof('1999-01-02') - nan + if len(self) and not isinstance(self, ABCMultiIndex,): - If the index is not sorted, an error is raised. + # we can raise here if we are definitive that this + # is positional indexing (eg. .ix on with a float) + # or label indexing if we are using a type able + # to be represented in the index - >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', - ... '2014-01-03']) - >>> idx_not_sorted.asof('2013-12-31') - Traceback (most recent call last): - ValueError: index must be monotonic increasing or decreasing - """ - try: - loc = self.get_loc(label, method='pad') - except KeyError: - return self._na_value - else: - if isinstance(loc, slice): - loc = loc.indices(len(self))[-1] - return self[loc] + if kind in ['getitem', 'ix'] and is_float(key): + if not self.is_floating(): + return self._invalid_indexer('label', key) - def asof_locs(self, where, mask): - """ - where : array of timestamps - mask : array of booleans where data is not NA - """ - locs = self.values[mask].searchsorted(where.values, side='right') + elif kind in ['loc'] and is_float(key): - locs = np.where(locs > 0, locs - 1, 0) - result = np.arange(len(self))[mask].take(locs) + # we want to raise KeyError on string/mixed here + # technically we *could* raise a TypeError + # on anything but mixed though + if self.inferred_type not in ['floating', + 'mixed-integer-float', + 'string', + 'unicode', + 'mixed']: + return self._invalid_indexer('label', key) - first = mask.argmax() - result[(locs == 0) & (where.values < self.values[first])] = -1 + elif kind in ['loc'] and is_integer(key): + if not self.holds_integer(): + return self._invalid_indexer('label', key) - return result + return key - def sort_values(self, return_indexer=False, ascending=True): - """ - Return a sorted copy of the index. + _index_shared_docs['_convert_slice_indexer'] = """ + Convert a slice indexer. - Return a sorted copy of the index, and optionally return the indices - that sorted the index itself. + By definition, these are labels unless 'iloc' is passed in. + Floats are not allowed as the start, step, or stop of the slice. Parameters ---------- - return_indexer : bool, default False - Should the indices that would sort the index be returned. - ascending : bool, default True - Should the index values be sorted in an ascending order. + key : label of the slice bound + kind : {'ix', 'loc', 'getitem', 'iloc'} or None + """ - Returns - ------- - sorted_index : pandas.Index - Sorted copy of the index. - indexer : numpy.ndarray, optional - The indices that the index itself was sorted by. 
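To illustrate the scalar-indexer conversion rules above (a float key is rejected as a label on a non-float index, while an integer key on an integer index is treated as a label), a rough doctest-style sketch; the exception text follows the `_invalid_indexer` template and the exact class names may vary by index type and pandas version:

    >>> s = pd.Series([10, 20, 30], index=[1, 2, 3])
    >>> s[2]                 # integer key on an Int64Index: label lookup
    20
    >>> s[1.5]               # float key on a non-float index is rejected
    Traceback (most recent call last):
    ...
    TypeError: cannot do label indexing on <class 'pandas.core.indexes.numeric.Int64Index'> with these indexers [1.5] of <class 'float'>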
+ @Appender(_index_shared_docs['_convert_slice_indexer']) + def _convert_slice_indexer(self, key, kind=None): + assert kind in ['ix', 'loc', 'getitem', 'iloc', None] - See Also - -------- - pandas.Series.sort_values : Sort values of a Series. - pandas.DataFrame.sort_values : Sort values in a DataFrame. + # if we are not a slice, then we are done + if not isinstance(key, slice): + return key - Examples - -------- - >>> idx = pd.Index([10, 100, 1, 1000]) - >>> idx - Int64Index([10, 100, 1, 1000], dtype='int64') + # validate iloc + if kind == 'iloc': + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - Sort values in ascending order (default behavior). + # potentially cast the bounds to integers + start, stop, step = key.start, key.stop, key.step - >>> idx.sort_values() - Int64Index([1, 10, 100, 1000], dtype='int64') + # figure out if this is a positional indexer + def is_int(v): + return v is None or is_integer(v) - Sort values in descending order, and also get the indices `idx` was - sorted by. + is_null_slicer = start is None and stop is None + is_index_slice = is_int(start) and is_int(stop) + is_positional = is_index_slice and not self.is_integer() - >>> idx.sort_values(ascending=False, return_indexer=True) - (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) - """ - _as = self.argsort() - if not ascending: - _as = _as[::-1] + if kind == 'getitem': + """ + called from the getitem slicers, validate that we are in fact + integers + """ + if self.is_integer() or is_index_slice: + return slice(self._validate_indexer('slice', key.start, kind), + self._validate_indexer('slice', key.stop, kind), + self._validate_indexer('slice', key.step, kind)) - sorted_index = self.take(_as) + # convert the slice to an indexer here - if return_indexer: - return sorted_index, _as + # if we are mixed and have integers + try: + if is_positional and self.is_mixed(): + # Validate start & stop + if start is not None: + self.get_loc(start) + if stop is not None: + self.get_loc(stop) + is_positional = False + except KeyError: + if self.inferred_type == 'mixed-integer-float': + raise + + if is_null_slicer: + indexer = key + elif is_positional: + indexer = key else: - return sorted_index + try: + indexer = self.slice_indexer(start, stop, step, kind=kind) + except Exception: + if is_index_slice: + if self.is_integer(): + raise + else: + indexer = key + else: + raise - def sort(self, *args, **kwargs): - raise TypeError("cannot sort an Index object in-place, use " - "sort_values instead") + return indexer - def sortlevel(self, level=None, ascending=True, sort_remaining=None): + def _convert_listlike_indexer(self, keyarr, kind=None): """ - - For internal compatibility with with the Index API - - Sort the Index. This is for compat with MultiIndex - Parameters ---------- - ascending : boolean, default True - False to sort in descending order - - level, sort_remaining are compat parameters + keyarr : list-like + Indexer to convert. Returns ------- - sorted_index : Index + tuple (indexer, keyarr) + indexer is an ndarray or None if cannot convert + keyarr are tuple-safe keys """ - return self.sort_values(return_indexer=True, ascending=ascending) + if isinstance(keyarr, Index): + keyarr = self._convert_index_indexer(keyarr) + else: + keyarr = self._convert_arr_indexer(keyarr) - def shift(self, periods=1, freq=None): - """ - Shift index by desired number of time frequency increments. 
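The slice-conversion rules above amount to the familiar user-facing contract: label slices (e.g. via ``.loc``) include both endpoints, while positional slices (``.iloc``) exclude the stop. A small sketch, assuming a plain object-dtype string index:

    >>> s = pd.Series([10, 20, 30], index=['a', 'b', 'c'])
    >>> s.loc['a':'b']       # label slice, both endpoints included
    a    10
    b    20
    dtype: int64
    >>> s.iloc[0:2]          # positional slice, stop excluded
    a    10
    b    20
    dtype: int64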
+ indexer = self._convert_list_indexer(keyarr, kind=kind) + return indexer, keyarr - This method is for shifting the values of datetime-like indexes - by a specified time increment a given number of times. + _index_shared_docs['_convert_arr_indexer'] = """ + Convert an array-like indexer to the appropriate dtype. Parameters ---------- - periods : int, default 1 - Number of periods (or increments) to shift by, - can be positive or negative. - freq : pandas.DateOffset, pandas.Timedelta or string, optional - Frequency increment to shift by. - If None, the index is shifted by its own `freq` attribute. - Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. + keyarr : array-like + Indexer to convert. Returns ------- - pandas.Index - shifted index - - See Also - -------- - Series.shift : Shift values of Series. - - Examples - -------- - Put the first 5 month starts of 2011 into an index. - - >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') - >>> month_starts - DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', - '2011-05-01'], - dtype='datetime64[ns]', freq='MS') - - Shift the index by 10 days. - - >>> month_starts.shift(10, freq='D') - DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', - '2011-05-11'], - dtype='datetime64[ns]', freq=None) - - The default value of `freq` is the `freq` attribute of the index, - which is 'MS' (month start) in this example. - - >>> month_starts.shift(10) - DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', - '2012-03-01'], - dtype='datetime64[ns]', freq='MS') + converted_keyarr : array-like + """ - Notes - ----- - This method is only implemented for datetime-like index classes, - i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. - """ - raise NotImplementedError("Not supported for type %s" % - type(self).__name__) + @Appender(_index_shared_docs['_convert_arr_indexer']) + def _convert_arr_indexer(self, keyarr): + keyarr = com.asarray_tuplesafe(keyarr) + return keyarr - def argsort(self, *args, **kwargs): - """ - Return the integer indices that would sort the index. + _index_shared_docs['_convert_index_indexer'] = """ + Convert an Index indexer to the appropriate dtype. Parameters ---------- - *args - Passed to `numpy.ndarray.argsort`. - **kwargs - Passed to `numpy.ndarray.argsort`. + keyarr : Index (or sub-class) + Indexer to convert. Returns ------- - numpy.ndarray - Integer indices that would sort the index if used as - an indexer. + converted_keyarr : Index (or sub-class) + """ - See Also - -------- - numpy.argsort : Similar method for NumPy arrays. - Index.sort_values : Return sorted copy of Index. + @Appender(_index_shared_docs['_convert_index_indexer']) + def _convert_index_indexer(self, keyarr): + return keyarr - Examples - -------- - >>> idx = pd.Index(['b', 'a', 'd', 'c']) - >>> idx - Index(['b', 'a', 'd', 'c'], dtype='object') + _index_shared_docs['_convert_list_indexer'] = """ + Convert a list-like indexer to the appropriate dtype. - >>> order = idx.argsort() - >>> order - array([1, 0, 3, 2]) + Parameters + ---------- + keyarr : Index (or sub-class) + Indexer to convert. 
+ kind : iloc, ix, loc, optional - >>> idx[order] - Index(['a', 'b', 'c', 'd'], dtype='object') - """ - result = self.asi8 - if result is None: - result = np.array(self) - return result.argsort(*args, **kwargs) + Returns + ------- + positional indexer or None + """ - def __add__(self, other): - if isinstance(other, (ABCSeries, ABCDataFrame)): - return NotImplemented - return Index(np.array(self) + other) + @Appender(_index_shared_docs['_convert_list_indexer']) + def _convert_list_indexer(self, keyarr, kind=None): + if (kind in [None, 'iloc', 'ix'] and + is_integer_dtype(keyarr) and not self.is_floating() and + not isinstance(keyarr, ABCPeriodIndex)): - def __radd__(self, other): - return Index(other + np.array(self)) + if self.inferred_type == 'mixed-integer': + indexer = self.get_indexer(keyarr) + if (indexer >= 0).all(): + return indexer + # missing values are flagged as -1 by get_indexer and negative + # indices are already converted to positive indices in the + # above if-statement, so the negative flags are changed to + # values outside the range of indices so as to trigger an + # IndexError in maybe_convert_indices + indexer[indexer < 0] = len(self) + from pandas.core.indexing import maybe_convert_indices + return maybe_convert_indices(indexer, len(self)) - def __iadd__(self, other): - # alias for __add__ - return self + other + elif not self.inferred_type == 'integer': + keyarr = np.where(keyarr < 0, len(self) + keyarr, keyarr) + return keyarr - def __sub__(self, other): - return Index(np.array(self) - other) + return None - def __rsub__(self, other): - return Index(other - np.array(self)) + def _invalid_indexer(self, form, key): + """ + Consistent invalid indexer message. + """ + raise TypeError("cannot do {form} indexing on {klass} with these " + "indexers [{key}] of {kind}".format( + form=form, klass=type(self), key=key, + kind=type(key))) - def __and__(self, other): - return self.intersection(other) + # -------------------------------------------------------------------- + # Reindex Methods - def __or__(self, other): - return self.union(other) + def _can_reindex(self, indexer): + """ + Check if we are allowing reindexing with this particular indexer. - def __xor__(self, other): - return self.symmetric_difference(other) + Parameters + ---------- + indexer : an integer indexer - def _get_reconciled_name_object(self, other): - """ - If the result of a set operation will be self, - return self, unless the name changes, in which - case make a shallow copy of self. + Raises + ------ + ValueError if its a duplicate axis """ - name = get_op_result_name(self, other) - if self.name != name: - return self._shallow_copy(name=name) - return self - def union(self, other): + # trying to reindex on an axis with duplicates + if not self.is_unique and len(indexer): + raise ValueError("cannot reindex from a duplicate axis") + + def reindex(self, target, method=None, level=None, limit=None, + tolerance=None): """ - Form the union of two Index objects and sorts if possible. + Create index with target's values (move/add/delete values + as necessary). 
Parameters ---------- - other : Index or array-like + target : an iterable Returns ------- - union : Index - - Examples - -------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.union(idx2) - Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') """ - self._assert_can_do_setop(other) - other = ensure_index(other) - - if len(other) == 0 or self.equals(other): - return self._get_reconciled_name_object(other) - - if len(self) == 0: - return other._get_reconciled_name_object(self) + # GH6552: preserve names when reindexing to non-named target + # (i.e. neither Index nor Series). + preserve_names = not hasattr(target, 'name') - # TODO: is_dtype_union_equal is a hack around - # 1. buggy set ops with duplicates (GH #13432) - # 2. CategoricalIndex lacking setops (GH #10186) - # Once those are fixed, this workaround can be removed - if not is_dtype_union_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.union(other) + # GH7774: preserve dtype/tz if target is empty and not an Index. + target = _ensure_has_len(target) # target may be an iterator - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self) or is_datetime64tz_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other) or is_datetime64tz_dtype(other): - rvals = other._ndarray_values + if not isinstance(target, Index) and len(target) == 0: + attrs = self._get_attributes_dict() + attrs.pop('freq', None) # don't preserve freq + values = self._data[:0] # appropriately-dtyped empty array + target = self._simple_new(values, dtype=self.dtype, **attrs) else: - rvals = other._values - - if self.is_monotonic and other.is_monotonic: - try: - result = self._outer_indexer(lvals, rvals)[0] - except TypeError: - # incomparable objects - result = list(lvals) + target = ensure_index(target) - # worth making this faster? 
a very unusual case - value_set = set(lvals) - result.extend([x for x in rvals if x not in value_set]) + if level is not None: + if method is not None: + raise TypeError('Fill method not supported if level passed') + _, indexer, _ = self._join_level(target, level, how='right', + return_indexers=True) else: - indexer = self.get_indexer(other) - indexer, = (indexer == -1).nonzero() - - if len(indexer) > 0: - other_diff = algos.take_nd(rvals, indexer, - allow_fill=False) - result = _concat._concat_compat((lvals, other_diff)) - - try: - lvals[0] < other_diff[0] - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) - else: - types = frozenset((self.inferred_type, - other.inferred_type)) - if not types & _unsortable_types: - result.sort() - + if self.equals(target): + indexer = None else: - result = lvals - try: - result = np.sort(result) - except TypeError as e: - warnings.warn("%s, sort order is undefined for " - "incomparable objects" % e, RuntimeWarning, - stacklevel=3) + if self.is_unique: + indexer = self.get_indexer(target, method=method, + limit=limit, + tolerance=tolerance) + else: + if method is not None or limit is not None: + raise ValueError("cannot reindex a non-unique index " + "with a method or limit") + indexer, missing = self.get_indexer_non_unique(target) - # for subclasses - return self._wrap_setop_result(other, result) + if preserve_names and target.nlevels == 1 and target.name != self.name: + target = target.copy() + target.name = self.name - def _wrap_setop_result(self, other, result): - return self._constructor(result, name=get_op_result_name(self, other)) + return target, indexer - def intersection(self, other): + def _reindex_non_unique(self, target): """ - Form the intersection of two Index objects. - - This returns a new Index with elements common to the index and `other`, - preserving the order of the calling index. + Create a new index with target's values (move/add/delete values as + necessary) use with non-unique Index and a possibly non-unique target. 
Parameters ---------- - other : Index or array-like + target : an iterable Returns ------- - intersection : Index - - Examples - -------- + new_index : pd.Index + Resulting index + indexer : np.ndarray or None + Indices of output values in original index - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.intersection(idx2) - Int64Index([3, 4], dtype='int64') """ - self._assert_can_do_setop(other) - other = ensure_index(other) - if self.equals(other): - return self._get_reconciled_name_object(other) + target = ensure_index(target) + indexer, missing = self.get_indexer_non_unique(target) + check = indexer != -1 + new_labels = self.take(indexer[check]) + new_indexer = None - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.intersection(other) + if len(missing): + length = np.arange(len(indexer)) - # TODO(EA): setops-refactor, clean all this up - if is_period_dtype(self): - lvals = self._ndarray_values - else: - lvals = self._values - if is_period_dtype(other): - rvals = other._ndarray_values - else: - rvals = other._values + missing = ensure_platform_int(missing) + missing_labels = target.take(missing) + missing_indexer = ensure_int64(length[~check]) + cur_labels = self.take(indexer[check]).values + cur_indexer = ensure_int64(length[check]) - if self.is_monotonic and other.is_monotonic: - try: - result = self._inner_indexer(lvals, rvals)[0] - return self._wrap_setop_result(other, result) - except TypeError: - pass + new_labels = np.empty(tuple([len(indexer)]), dtype=object) + new_labels[cur_indexer] = cur_labels + new_labels[missing_indexer] = missing_labels - try: - indexer = Index(rvals).get_indexer(lvals) - indexer = indexer.take((indexer != -1).nonzero()[0]) - except Exception: - # duplicates - indexer = algos.unique1d( - Index(rvals).get_indexer_non_unique(lvals)[0]) - indexer = indexer[indexer != -1] + # a unique indexer + if target.is_unique: - taken = other.take(indexer) - if self.name != other.name: - taken.name = None - return taken + # see GH5553, make sure we use the right indexer + new_indexer = np.arange(len(indexer)) + new_indexer[cur_indexer] = np.arange(len(cur_labels)) + new_indexer[missing_indexer] = -1 - def difference(self, other, sort=True): - """ - Return a new Index with elements from the index that are not in - `other`. + # we have a non_unique selector, need to use the original + # indexer here + else: - This is the set difference of two Index objects. + # need to retake to have the same size as the indexer + indexer[~check] = -1 + + # reset the new indexer to account for the new size + new_indexer = np.arange(len(self.take(indexer))) + new_indexer[~check] = -1 + + new_index = self._shallow_copy_with_infer(new_labels, freq=None) + return new_index, indexer, new_indexer + + # -------------------------------------------------------------------- + # Join Methods + + _index_shared_docs['join'] = """ + Compute join_index and indexers to conform data + structures to the new index. Parameters ---------- - other : Index or array-like - sort : bool, default True - Sort the resulting index if possible + other : Index + how : {'left', 'right', 'inner', 'outer'} + level : int or level name, default None + return_indexers : boolean, default False + sort : boolean, default False + Sort the join keys lexicographically in the result Index. If False, + the order of the join keys depends on the join type (how keyword) - .. versionadded:: 0.24.0 + .. 
versionadded:: 0.20.0 Returns ------- - difference : Index + join_index, (left_indexer, right_indexer) + """ - Examples - -------- + @Appender(_index_shared_docs['join']) + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + from .multi import MultiIndex + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) - >>> idx1 = pd.Index([2, 1, 3, 4]) - >>> idx2 = pd.Index([3, 4, 5, 6]) - >>> idx1.difference(idx2) - Int64Index([1, 2], dtype='int64') - >>> idx1.difference(idx2, sort=False) - Int64Index([2, 1], dtype='int64') - """ - self._assert_can_do_setop(other) + # try to figure out the join level + # GH3662 + if level is None and (self_is_mi or other_is_mi): - if self.equals(other): - # pass an empty np.ndarray with the appropriate dtype - return self._shallow_copy(self._data[:0]) + # have the same levels/names so a simple join + if self.names == other.names: + pass + else: + return self._join_multi(other, how=how, + return_indexers=return_indexers) - other, result_name = self._convert_can_do_setop(other) + # join on the level + if level is not None and (self_is_mi or other_is_mi): + return self._join_level(other, level, how=how, + return_indexers=return_indexers) - this = self._get_unique_index() + other = ensure_index(other) - indexer = this.get_indexer(other) - indexer = indexer.take((indexer != -1).nonzero()[0]) + if len(other) == 0 and how in ('left', 'outer'): + join_index = self._shallow_copy() + if return_indexers: + rindexer = np.repeat(-1, len(join_index)) + return join_index, None, rindexer + else: + return join_index - label_diff = np.setdiff1d(np.arange(this.size), indexer, - assume_unique=True) - the_diff = this.values.take(label_diff) - if sort: + if len(self) == 0 and how in ('right', 'outer'): + join_index = other._shallow_copy() + if return_indexers: + lindexer = np.repeat(-1, len(join_index)) + return join_index, lindexer, None + else: + return join_index + + if self._join_precedence < other._join_precedence: + how = {'right': 'left', 'left': 'right'}.get(how, how) + result = other.join(self, how=how, level=level, + return_indexers=return_indexers) + if return_indexers: + x, y, z = result + result = x, z, y + return result + + if not is_dtype_equal(self.dtype, other.dtype): + this = self.astype('O') + other = other.astype('O') + return this.join(other, how=how, return_indexers=return_indexers) + + _validate_join_method(how) + + if not self.is_unique and not other.is_unique: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif not self.is_unique or not other.is_unique: + if self.is_monotonic and other.is_monotonic: + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) + else: + return self._join_non_unique(other, how=how, + return_indexers=return_indexers) + elif self.is_monotonic and other.is_monotonic: try: - the_diff = sorting.safe_sort(the_diff) + return self._join_monotonic(other, how=how, + return_indexers=return_indexers) except TypeError: pass - return this._shallow_copy(the_diff, name=result_name, freq=None) + if how == 'left': + join_index = self + elif how == 'right': + join_index = other + elif how == 'inner': + join_index = self.intersection(other) + elif how == 'outer': + join_index = self.union(other) - def symmetric_difference(self, other, result_name=None): - """ - Compute the symmetric difference of two Index objects. - It's sorted if sorting is possible. 
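As a usage sketch of the join dispatch above: two unique, monotonic Int64Indexes take the monotonic fast path, and ``return_indexers=True`` yields the positional indexers into each side (reprs assume 64-bit platform integers):

    >>> left = pd.Index([1, 2, 3])
    >>> right = pd.Index([2, 3, 4])
    >>> joined, lidx, ridx = left.join(right, how='inner', return_indexers=True)
    >>> joined
    Int64Index([2, 3], dtype='int64')
    >>> lidx
    array([1, 2])
    >>> ridx
    array([0, 1])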
+ if sort: + join_index = join_index.sort_values() - Parameters - ---------- - other : Index or array-like - result_name : str + if return_indexers: + if join_index is self: + lindexer = None + else: + lindexer = self.get_indexer(join_index) + if join_index is other: + rindexer = None + else: + rindexer = other.get_indexer(join_index) + return join_index, lindexer, rindexer + else: + return join_index - Returns - ------- - symmetric_difference : Index + def _join_multi(self, other, how, return_indexers=True): + from .multi import MultiIndex + from pandas.core.reshape.merge import _restore_dropped_levels_multijoin - Notes - ----- - ``symmetric_difference`` contains elements that appear in either - ``idx1`` or ``idx2`` but not both. Equivalent to the Index created by - ``idx1.difference(idx2) | idx2.difference(idx1)`` with duplicates - dropped. + # figure out join names + self_names = set(com._not_none(*self.names)) + other_names = set(com._not_none(*other.names)) + overlap = self_names & other_names - Examples - -------- - >>> idx1 = pd.Index([1, 2, 3, 4]) - >>> idx2 = pd.Index([2, 3, 4, 5]) - >>> idx1.symmetric_difference(idx2) - Int64Index([1, 5], dtype='int64') + # need at least 1 in common + if not overlap: + raise ValueError("cannot join with no overlapping index names") - You can also use the ``^`` operator: + self_is_mi = isinstance(self, MultiIndex) + other_is_mi = isinstance(other, MultiIndex) - >>> idx1 ^ idx2 - Int64Index([1, 5], dtype='int64') - """ - self._assert_can_do_setop(other) - other, result_name_update = self._convert_can_do_setop(other) - if result_name is None: - result_name = result_name_update + if self_is_mi and other_is_mi: - this = self._get_unique_index() - other = other._get_unique_index() - indexer = this.get_indexer(other) + # Drop the non-matching levels from left and right respectively + ldrop_names = list(self_names - overlap) + rdrop_names = list(other_names - overlap) - # {this} minus {other} - common_indexer = indexer.take((indexer != -1).nonzero()[0]) - left_indexer = np.setdiff1d(np.arange(this.size), common_indexer, - assume_unique=True) - left_diff = this.values.take(left_indexer) + self_jnlevels = self.droplevel(ldrop_names) + other_jnlevels = other.droplevel(rdrop_names) - # {other} minus {this} - right_indexer = (indexer == -1).nonzero()[0] - right_diff = other.values.take(right_indexer) + # Join left and right + # Join on same leveled multi-index frames is supported + join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, + return_indexers=True) - the_diff = _concat._concat_compat([left_diff, right_diff]) - try: - the_diff = sorting.safe_sort(the_diff) - except TypeError: - pass + # Restore the dropped levels + # Returned index level order is + # common levels, ldrop_names, rdrop_names + dropped_names = ldrop_names + rdrop_names - attribs = self._get_attributes_dict() - attribs['name'] = result_name - if 'freq' in attribs: - attribs['freq'] = None - return self._shallow_copy_with_infer(the_diff, **attribs) - - def _get_unique_index(self, dropna=False): - """ - Returns an index containing unique values. + levels, labels, names = ( + _restore_dropped_levels_multijoin(self, other, + dropped_names, + join_idx, + lidx, ridx)) - Parameters - ---------- - dropna : bool - If True, NaN values are dropped. 
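The overlap check in ``_join_multi`` above surfaces to users as a ValueError when two MultiIndexes share no level names; a minimal sketch with hypothetical level names:

    >>> left = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['a', 'b'])
    >>> right = pd.MultiIndex.from_arrays([[1, 2], [3, 4]], names=['c', 'd'])
    >>> left.join(right, how='inner')
    Traceback (most recent call last):
    ...
    ValueError: cannot join with no overlapping index names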
+ # Re-create the multi-index + multi_join_idx = MultiIndex(levels=levels, labels=labels, + names=names, verify_integrity=False) - Returns - ------- - uniques : index - """ - if self.is_unique and not dropna: - return self + multi_join_idx = multi_join_idx.remove_unused_levels() - values = self.values + return multi_join_idx, lidx, ridx - if not self.is_unique: - values = self.unique() + jl = list(overlap)[0] - if dropna: - try: - if self.hasnans: - values = values[~isna(values)] - except NotImplementedError: - pass + # Case where only one index is multi + # make the indices into mi's that match + flip_order = False + if self_is_mi: + self, other = other, self + flip_order = True + # flip if join method is right or left + how = {'right': 'left', 'left': 'right'}.get(how, how) - return self._shallow_copy(values) + level = other.names.index(jl) + result = self._join_level(other, level, how=how, + return_indexers=return_indexers) - _index_shared_docs['get_loc'] = """ - Get integer location, slice or boolean mask for requested label. + if flip_order: + if isinstance(result, tuple): + return result[0], result[2], result[1] + return result - Parameters - ---------- - key : label - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. Tied - distances are broken by preferring the larger index value. - tolerance : optional - Maximum distance from index value for inexact matches. The value of - the index at the matching location most satisfy the equation - ``abs(index[loc] - key) <= tolerance``. + def _join_non_unique(self, other, how='left', return_indexers=False): + from pandas.core.reshape.merge import _get_join_indexers - Tolerance may be a scalar - value, which applies the same tolerance to all values, or - list-like, which applies variable tolerance per element. List-like - includes list, tuple, array, Series, and must be the same size as - the index and its dtype must exactly match the index's type. + left_idx, right_idx = _get_join_indexers([self._ndarray_values], + [other._ndarray_values], + how=how, + sort=True) - .. versionadded:: 0.21.0 (list-like tolerance) + left_idx = ensure_platform_int(left_idx) + right_idx = ensure_platform_int(right_idx) - Returns - ------- - loc : int if unique index, slice if monotonic index, else mask + join_index = np.asarray(self._ndarray_values.take(left_idx)) + mask = left_idx == -1 + np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) - Examples - --------- - >>> unique_index = pd.Index(list('abc')) - >>> unique_index.get_loc('b') - 1 + join_index = self._wrap_joined_index(join_index, other) - >>> monotonic_index = pd.Index(list('abbc')) - >>> monotonic_index.get_loc('b') - slice(1, 3, None) + if return_indexers: + return join_index, left_idx, right_idx + else: + return join_index - >>> non_monotonic_index = pd.Index(list('abcb')) - >>> non_monotonic_index.get_loc('b') - array([False, True, False, True], dtype=bool) + def _join_level(self, other, level, how='left', return_indexers=False, + keep_order=True): """ + The join method *only* affects the level of the resulting + MultiIndex. Otherwise it just exactly aligns the Index data to the + labels of the level in the MultiIndex. 
- @Appender(_index_shared_docs['get_loc']) - def get_loc(self, key, method=None, tolerance=None): - if method is None: - if tolerance is not None: - raise ValueError('tolerance argument only valid if using pad, ' - 'backfill or nearest lookups') - try: - return self._engine.get_loc(key) - except KeyError: - return self._engine.get_loc(self._maybe_cast_indexer(key)) - indexer = self.get_indexer([key], method=method, tolerance=tolerance) - if indexer.ndim > 1 or indexer.size > 1: - raise TypeError('get_loc requires scalar valued input') - loc = indexer.item() - if loc == -1: - raise KeyError(key) - return loc - - def get_value(self, series, key): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing + If ```keep_order == True```, the order of the data indexed by the + MultiIndex will not be changed; otherwise, it will tie out + with `other`. """ + from .multi import MultiIndex - # if we have something that is Index-like, then - # use this, e.g. DatetimeIndex - s = getattr(series, '_values', None) - if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): - # GH 20882, 21257 - # Unify Index and ExtensionArray treatment - # First try to convert the key to a location - # If that fails, raise a KeyError if an integer - # index, otherwise, see if key is an integer, and - # try that - try: - iloc = self.get_loc(key) - return s[iloc] - except KeyError: - if (len(self) > 0 and - (self.holds_integer() or self.is_boolean())): - raise - elif is_integer(key): - return s[key] + def _get_leaf_sorter(labels): + """ + Returns sorter for the inner most level while preserving the + order of higher levels. + """ + if labels[0].size == 0: + return np.empty(0, dtype='int64') - s = com.values_from_object(series) - k = com.values_from_object(key) + if len(labels) == 1: + lab = ensure_int64(labels[0]) + sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) + return sorter - k = self._convert_scalar_indexer(k, kind='getitem') - try: - return self._engine.get_value(s, k, - tz=getattr(series.dtype, 'tz', None)) - except KeyError as e1: - if len(self) > 0 and (self.holds_integer() or self.is_boolean()): - raise + # find indexers of beginning of each set of + # same-key labels w.r.t all but last level + tic = labels[0][:-1] != labels[0][1:] + for lab in labels[1:-1]: + tic |= lab[:-1] != lab[1:] - try: - return libindex.get_value_box(s, key) - except IndexError: - raise - except TypeError: - # generator/iterator-like - if is_iterator(key): - raise InvalidIndexError(key) - else: - raise e1 - except Exception: # pragma: no cover - raise e1 - except TypeError: - # python 3 - if is_scalar(key): # pragma: no cover - raise IndexError(key) - raise InvalidIndexError(key) + starts = np.hstack(([True], tic, [True])).nonzero()[0] + lab = ensure_int64(labels[-1]) + return lib.get_level_sorter(lab, ensure_int64(starts)) - def set_value(self, arr, key, value): - """ - Fast lookup of value from 1-dimensional ndarray. Only use this if you - know what you're doing - """ - self._engine.set_value(com.values_from_object(arr), - com.values_from_object(key), value) + if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): + raise TypeError('Join on level between two MultiIndex objects ' + 'is ambiguous') - def _get_level_values(self, level): - """ - Return an Index of values for requested level. + left, right = self, other - This is primarily useful to get an individual level of values from a - MultiIndex, but is provided on Index as well for compatability. 
+ flip_order = not isinstance(self, MultiIndex) + if flip_order: + left, right = right, left + how = {'right': 'left', 'left': 'right'}.get(how, how) - Parameters - ---------- - level : int or str - It is either the integer position or the name of the level. + level = left._get_level_number(level) + old_level = left.levels[level] - Returns - ------- - values : Index - Calling object, as there is only one level in the Index. + if not right.is_unique: + raise NotImplementedError('Index._join_level on non-unique index ' + 'is not implemented') - See Also - -------- - MultiIndex.get_level_values : Get values for a level of a MultiIndex. + new_level, left_lev_indexer, right_lev_indexer = \ + old_level.join(right, how=how, return_indexers=True) - Notes - ----- - For Index, level should be 0, since there are no multiple levels. + if left_lev_indexer is None: + if keep_order or len(left) == 0: + left_indexer = None + join_index = left + else: # sort the leaves + left_indexer = _get_leaf_sorter(left.labels[:level + 1]) + join_index = left[left_indexer] - Examples - -------- + else: + left_lev_indexer = ensure_int64(left_lev_indexer) + rev_indexer = lib.get_reverse_indexer(left_lev_indexer, + len(old_level)) - >>> idx = pd.Index(list('abc')) - >>> idx - Index(['a', 'b', 'c'], dtype='object') + new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], + allow_fill=False) - Get level values by supplying `level` as integer: + new_labels = list(left.labels) + new_labels[level] = new_lev_labels - >>> idx.get_level_values(0) - Index(['a', 'b', 'c'], dtype='object') - """ - self._validate_index_level(level) - return self + new_levels = list(left.levels) + new_levels[level] = new_level - get_level_values = _get_level_values + if keep_order: # just drop missing values. o.w. keep order + left_indexer = np.arange(len(left), dtype=np.intp) + mask = new_lev_labels != -1 + if not mask.all(): + new_labels = [lab[mask] for lab in new_labels] + left_indexer = left_indexer[mask] - def droplevel(self, level=0): - """ - Return index with requested level(s) removed. If resulting index has - only 1 level left, the result will be of Index type, not MultiIndex. + else: # tie out the order with other + if level == 0: # outer most level, take the fast route + ngroups = 1 + new_lev_labels.max() + left_indexer, counts = libalgos.groupsort_indexer( + new_lev_labels, ngroups) - .. versionadded:: 0.23.1 (support for non-MultiIndex) + # missing values are placed first; drop them! + left_indexer = left_indexer[counts[0]:] + new_labels = [lab[left_indexer] for lab in new_labels] - Parameters - ---------- - level : int, str, or list-like, default 0 - If a string is given, must be the name of a level - If list-like, elements must be names or indexes of levels. + else: # sort the leaves + mask = new_lev_labels != -1 + mask_all = mask.all() + if not mask_all: + new_labels = [lab[mask] for lab in new_labels] - Returns - ------- - index : Index or MultiIndex - """ - if not isinstance(level, (tuple, list)): - level = [level] + left_indexer = _get_leaf_sorter(new_labels[:level + 1]) + new_labels = [lab[left_indexer] for lab in new_labels] - levnums = sorted(self._get_level_number(lev) for lev in level)[::-1] + # left_indexers are w.r.t masked frame. + # reverse to original frame! 
+ if not mask_all: + left_indexer = mask.nonzero()[0][left_indexer] - if len(level) == 0: - return self - if len(level) >= self.nlevels: - raise ValueError("Cannot remove {} levels from an index with {} " - "levels: at least one level must be " - "left.".format(len(level), self.nlevels)) - # The two checks above guarantee that here self is a MultiIndex + join_index = MultiIndex(levels=new_levels, labels=new_labels, + names=left.names, verify_integrity=False) - new_levels = list(self.levels) - new_labels = list(self.labels) - new_names = list(self.names) + if right_lev_indexer is not None: + right_indexer = algos.take_nd(right_lev_indexer, + join_index.labels[level], + allow_fill=False) + else: + right_indexer = join_index.labels[level] - for i in levnums: - new_levels.pop(i) - new_labels.pop(i) - new_names.pop(i) + if flip_order: + left_indexer, right_indexer = right_indexer, left_indexer - if len(new_levels) == 1: + if return_indexers: + left_indexer = (None if left_indexer is None + else ensure_platform_int(left_indexer)) + right_indexer = (None if right_indexer is None + else ensure_platform_int(right_indexer)) + return join_index, left_indexer, right_indexer + else: + return join_index - # set nan if needed - mask = new_labels[0] == -1 - result = new_levels[0].take(new_labels[0]) - if mask.any(): - result = result.putmask(mask, np.nan) + def _join_monotonic(self, other, how='left', return_indexers=False): + if self.equals(other): + ret_index = other if how == 'right' else self + if return_indexers: + return ret_index, None, None + else: + return ret_index - result.name = new_names[0] - return result + sv = self._ndarray_values + ov = other._ndarray_values + + if self.is_unique and other.is_unique: + # We can perform much better than the general case + if how == 'left': + join_index = self + lidx = None + ridx = self._left_indexer_unique(sv, ov) + elif how == 'right': + join_index = other + lidx = self._left_indexer_unique(ov, sv) + ridx = None + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) else: - from .multi import MultiIndex - return MultiIndex(levels=new_levels, labels=new_labels, - names=new_names, verify_integrity=False) + if how == 'left': + join_index, lidx, ridx = self._left_indexer(sv, ov) + elif how == 'right': + join_index, ridx, lidx = self._left_indexer(ov, sv) + elif how == 'inner': + join_index, lidx, ridx = self._inner_indexer(sv, ov) + elif how == 'outer': + join_index, lidx, ridx = self._outer_indexer(sv, ov) + join_index = self._wrap_joined_index(join_index, other) - _index_shared_docs['get_indexer'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + if return_indexers: + lidx = None if lidx is None else ensure_platform_int(lidx) + ridx = None if ridx is None else ensure_platform_int(ridx) + return join_index, lidx, ridx + else: + return join_index - Parameters - ---------- - target : %(target_klass)s - method : {None, 'pad'/'ffill', 'backfill'/'bfill', 'nearest'}, optional - * default: exact matches only. - * pad / ffill: find the PREVIOUS index value if no exact match. - * backfill / bfill: use NEXT index value if no exact match - * nearest: use the NEAREST index value if no exact match. 
Tied - distances are broken by preferring the larger index value. - limit : int, optional - Maximum number of consecutive labels in ``target`` to match for - inexact matches. - tolerance : optional - Maximum distance between original and new labels for inexact - matches. The values of the index at the matching locations most - satisfy the equation ``abs(index[indexer] - target) <= tolerance``. + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + return Index(joined, name=name) - Tolerance may be a scalar value, which applies the same tolerance - to all values, or list-like, which applies variable tolerance per - element. List-like includes list, tuple, array, Series, and must be - the same size as the index and its dtype must exactly match the - index's type. + # -------------------------------------------------------------------- + # Uncategorized Methods - .. versionadded:: 0.21.0 (list-like tolerance) + @property + def values(self): + """ + Return an array representing the data in the Index. + + .. warning:: + + We recommend using :attr:`Index.array` or + :meth:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. Returns ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. + array: numpy.ndarray or ExtensionArray - Examples + See Also -------- - >>> index = pd.Index(['c', 'a', 'b']) - >>> index.get_indexer(['a', 'b', 'x']) - array([ 1, 2, -1]) + Index.array : Reference to the underlying data. + Index.to_numpy : A NumPy array representing the underlying data. - Notice that the return value is an array of locations in ``index`` - and ``x`` is marked by -1, as it is not in ``index``. + Return the underlying data as an ndarray. """ + return self._data.view(np.ndarray) - @Appender(_index_shared_docs['get_indexer'] % _index_doc_kwargs) - def get_indexer(self, target, method=None, limit=None, tolerance=None): - method = missing.clean_reindex_fill_method(method) - target = ensure_index(target) - if tolerance is not None: - tolerance = self._convert_tolerance(tolerance, target) + @property + def _values(self): + # type: () -> Union[ExtensionArray, Index, np.ndarray] + # TODO(EA): remove index types as they become extension arrays + """ + The best array representation. - # Treat boolean labels passed to a numeric index as not found. Without - # this fix False and True would be treated as 0 and 1 respectively. - # (GH #16877) - if target.is_boolean() and self.is_numeric(): - return ensure_platform_int(np.repeat(-1, target.size)) + This is an ndarray, ExtensionArray, or Index subclass. This differs + from ``_ndarray_values``, which always returns an ndarray. - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer(ptarget, method=method, limit=limit, - tolerance=tolerance) + Both ``_values`` and ``_ndarray_values`` are consistent between + ``Series`` and ``Index``. - if not is_dtype_equal(self.dtype, target.dtype): - this = self.astype(object) - target = target.astype(object) - return this.get_indexer(target, method=method, limit=limit, - tolerance=tolerance) + It may differ from the public '.values' method. 
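For a concrete sense of the ``values`` / ``_values`` / ``_ndarray_values`` distinction documented here, CategoricalIndex is the clearest case: the public ``values`` is the Categorical itself, while ``_ndarray_values`` is the underlying integer codes. A rough sketch; ``_ndarray_values`` is private and shown only for illustration, and reprs are as of this generation of pandas:

    >>> ci = pd.CategoricalIndex(['a', 'a', 'b'])
    >>> ci.values
    [a, a, b]
    Categories (2, object): [a, b]
    >>> ci._ndarray_values
    array([0, 0, 1], dtype=int8)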
- if not self.is_unique: - raise InvalidIndexError('Reindexing only valid with uniquely' - ' valued Index objects') + index | values | _values | _ndarray_values | + ----------------- | --------------- | ------------- | --------------- | + Index | ndarray | ndarray | ndarray | + CategoricalIndex | Categorical | Categorical | ndarray[int] | + DatetimeIndex | ndarray[M8ns] | ndarray[M8ns] | ndarray[M8ns] | + DatetimeIndex[tz] | ndarray[M8ns] | DTI[tz] | ndarray[M8ns] | + PeriodIndex | ndarray[object] | PeriodArray | ndarray[int] | + IntervalIndex | IntervalArray | IntervalArray | ndarray[object] | - if method == 'pad' or method == 'backfill': - indexer = self._get_fill_indexer(target, method, limit, tolerance) - elif method == 'nearest': - indexer = self._get_nearest_indexer(target, limit, tolerance) - else: - if tolerance is not None: - raise ValueError('tolerance argument only valid if doing pad, ' - 'backfill or nearest reindexing') - if limit is not None: - raise ValueError('limit argument only valid if doing pad, ' - 'backfill or nearest reindexing') + See Also + -------- + values + _ndarray_values + """ + return self._data - indexer = self._engine.get_indexer(target._ndarray_values) + def get_values(self): + """ + Return `Index` data as an `numpy.ndarray`. - return ensure_platform_int(indexer) + Returns + ------- + numpy.ndarray + A one-dimensional numpy array of the `Index` values. - def _convert_tolerance(self, tolerance, target): - # override this method on subclasses - tolerance = np.asarray(tolerance) - if target.size != tolerance.size and tolerance.size > 1: - raise ValueError('list-like tolerance size must match ' - 'target index size') - return tolerance + See Also + -------- + Index.values : The attribute that get_values wraps. - def _get_fill_indexer(self, target, method, limit=None, tolerance=None): - if self.is_monotonic_increasing and target.is_monotonic_increasing: - method = (self._engine.get_pad_indexer if method == 'pad' else - self._engine.get_backfill_indexer) - indexer = method(target._ndarray_values, limit) - else: - indexer = self._get_fill_indexer_searchsorted(target, method, - limit) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target._ndarray_values, - indexer, - tolerance) - return indexer + Examples + -------- + Getting the `Index` values of a `DataFrame`: - def _get_fill_indexer_searchsorted(self, target, method, limit=None): - """ - Fallback pad/backfill get_indexer that works for monotonic decreasing - indexes and non-monotonic targets - """ - if limit is not None: - raise ValueError('limit argument for %r method only well-defined ' - 'if index and target are monotonic' % method) + >>> df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]], + ... index=['a', 'b', 'c'], columns=['A', 'B', 'C']) + >>> df + A B C + a 1 2 3 + b 4 5 6 + c 7 8 9 + >>> df.index.get_values() + array(['a', 'b', 'c'], dtype=object) - side = 'left' if method == 'pad' else 'right' + Standalone `Index` values: - # find exact matches first (this simplifies the algorithm) - indexer = self.get_indexer(target) - nonexact = (indexer == -1) - indexer[nonexact] = self._searchsorted_monotonic(target[nonexact], - side) - if side == 'left': - # searchsorted returns "indices into a sorted array such that, - # if the corresponding elements in v were inserted before the - # indices, the order of a would be preserved". - # Thus, we need to subtract 1 to find values to the left. 
- indexer[nonexact] -= 1 - # This also mapped not found values (values of 0 from - # np.searchsorted) to -1, which conveniently is also our - # sentinel for missing values - else: - # Mark indices to the right of the largest value as not found - indexer[indexer == len(self)] = -1 - return indexer + >>> idx = pd.Index(['1', '2', '3']) + >>> idx.get_values() + array(['1', '2', '3'], dtype=object) - def _get_nearest_indexer(self, target, limit, tolerance): - """ - Get the indexer for the nearest index labels; requires an index with - values that can be subtracted from each other (e.g., not strings or - tuples). + `MultiIndex` arrays also have only one dimension: + + >>> midx = pd.MultiIndex.from_arrays([[1, 2, 3], ['a', 'b', 'c']], + ... names=('number', 'letter')) + >>> midx.get_values() + array([(1, 'a'), (2, 'b'), (3, 'c')], dtype=object) + >>> midx.get_values().ndim + 1 """ - left_indexer = self.get_indexer(target, 'pad', limit=limit) - right_indexer = self.get_indexer(target, 'backfill', limit=limit) + return self.values - target = np.asarray(target) - left_distances = abs(self.values[left_indexer] - target) - right_distances = abs(self.values[right_indexer] - target) + @Appender(IndexOpsMixin.memory_usage.__doc__) + def memory_usage(self, deep=False): + result = super(Index, self).memory_usage(deep=deep) - op = operator.lt if self.is_monotonic_increasing else operator.le - indexer = np.where(op(left_distances, right_distances) | - (right_indexer == -1), left_indexer, right_indexer) - if tolerance is not None: - indexer = self._filter_indexer_tolerance(target, indexer, - tolerance) - return indexer + # include our engine hashtable + result += self._engine.sizeof(deep=deep) + return result - def _filter_indexer_tolerance(self, target, indexer, tolerance): - distance = abs(self.values[indexer] - target) - indexer = np.where(distance <= tolerance, indexer, -1) - return indexer + _index_shared_docs['where'] = """ + Return an Index of same shape as self and whose corresponding + entries are from self where cond is True and otherwise are from + other. - _index_shared_docs['get_indexer_non_unique'] = """ - Compute indexer and mask for new index given the current index. The - indexer should be then used as an input to ndarray.take to align the - current data to the new index. + .. versionadded:: 0.19.0 Parameters ---------- - target : %(target_klass)s - - Returns - ------- - indexer : ndarray of int - Integers from 0 to n - 1 indicating that the index at these - positions matches the corresponding target values. Missing values - in the target are marked by -1. - missing : ndarray of int - An indexer into the target of the values not found. 
- These correspond to the -1 in the indexer array + cond : boolean array-like with the same length as self + other : scalar, or array-like """ - @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) - def get_indexer_non_unique(self, target): - target = ensure_index(target) - if is_categorical(target): - target = target.astype(target.dtype.categories.dtype) - pself, ptarget = self._maybe_promote(target) - if pself is not self or ptarget is not target: - return pself.get_indexer_non_unique(ptarget) + @Appender(_index_shared_docs['where']) + def where(self, cond, other=None): + if other is None: + other = self._na_value - if self.is_all_dates: - self = Index(self.asi8) - tgt_values = target.asi8 - else: - tgt_values = target._ndarray_values + dtype = self.dtype + values = self.values - indexer, missing = self._engine.get_indexer_non_unique(tgt_values) - return ensure_platform_int(indexer), missing + if is_bool(other) or is_bool_dtype(other): - def get_indexer_for(self, target, **kwargs): - """ - guaranteed return of an indexer even when non-unique - This dispatches to get_indexer or get_indexer_nonunique as appropriate - """ - if self.is_unique: - return self.get_indexer(target, **kwargs) - indexer, _ = self.get_indexer_non_unique(target, **kwargs) - return indexer + # bools force casting + values = values.astype(object) + dtype = None - def _maybe_promote(self, other): - # A hack, but it works - from pandas import DatetimeIndex - if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): - return DatetimeIndex(self), other - elif self.inferred_type == 'boolean': - if not is_object_dtype(self.dtype): - return self.astype('object'), other.astype('object') - return self, other + values = np.where(cond, values, other) - def groupby(self, values): + if self._is_numeric_dtype and np.any(isna(values)): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None + + return self._shallow_copy_with_infer(values, dtype=dtype) + + # construction helpers + @classmethod + def _try_convert_to_int_index(cls, data, copy, name, dtype): """ - Group the index labels by a given array of values. + Attempt to convert an array of data into an integer index. Parameters ---------- - values : array - Values used to determine the groups. + data : The data to convert. + copy : Whether to copy the data or not. + name : The name of the index returned. Returns ------- - groups : dict - {group name -> group labels} + int_index : data converted to either an Int64Index or a + UInt64Index + + Raises + ------ + ValueError if the conversion was not successful. """ - # TODO: if we are a MultiIndex, we can do better - # that converting to tuples - from .multi import MultiIndex - if isinstance(values, MultiIndex): - values = values.values - values = ensure_categorical(values) - result = values._reverse_indexer() + from .numeric import Int64Index, UInt64Index + if not is_unsigned_integer_dtype(dtype): + # skip int64 conversion attempt if uint-like dtype is passed, as + # this could return Int64Index when UInt64Index is what's desrired + try: + res = data.astype('i8', copy=False) + if (res == data).all(): + return Int64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - # map to the label - result = {k: self.take(v) for k, v in compat.iteritems(result)} + # Conversion to int64 failed (possibly due to overflow) or was skipped, + # so let's try now with uint64. 
+ try: + res = data.astype('u8', copy=False) + if (res == data).all(): + return UInt64Index(res, copy=copy, name=name) + except (OverflowError, TypeError, ValueError): + pass - return result + raise ValueError - def map(self, mapper, na_action=None): - """ - Map values using input correspondence (a dict, Series, or function). + @classmethod + def _scalar_data_error(cls, data): + raise TypeError('{0}(...) must be called with a collection of some ' + 'kind, {1} was passed'.format(cls.__name__, + repr(data))) - Parameters - ---------- - mapper : function, dict, or Series - Mapping correspondence. - na_action : {None, 'ignore'} - If 'ignore', propagate NA values, without passing them to the - mapping correspondence. + @classmethod + def _string_data_error(cls, data): + raise TypeError('String dtype not supported, you may need ' + 'to explicitly cast to a numeric type') - Returns - ------- - applied : Union[Index, MultiIndex], inferred - The output of the mapping function applied to the index. - If the function returns a tuple with more than one element - a MultiIndex will be returned. + @classmethod + def _coerce_to_ndarray(cls, data): """ + Coerces data to ndarray. - from .multi import MultiIndex - new_values = super(Index, self)._map_values( - mapper, na_action=na_action) + Converts other iterables to list first and then to array. + Does not touch ndarrays. - attributes = self._get_attributes_dict() + Raises + ------ + TypeError + When the data passed in is a scalar. + """ - # we can return a MultiIndex - if new_values.size and isinstance(new_values[0], tuple): - if isinstance(self, MultiIndex): - names = self.names - elif attributes.get('name'): - names = [attributes.get('name')] * len(new_values[0]) - else: - names = None - return MultiIndex.from_tuples(new_values, - names=names) + if not isinstance(data, (np.ndarray, Index)): + if data is None or is_scalar(data): + cls._scalar_data_error(data) - attributes['copy'] = False - if not new_values.size: - # empty - attributes['dtype'] = self.dtype - - return Index(new_values, **attributes) + # other iterable of some kind + if not isinstance(data, (ABCSeries, list, tuple)): + data = list(data) + data = np.asarray(data) + return data - def isin(self, values, level=None): + def _coerce_scalar_to_index(self, item): """ - Return a boolean array where the index values are in `values`. - - Compute boolean array of whether each index value is found in the - passed set of values. The length of the returned boolean array matches - the length of the index. + We need to coerce a scalar to a compat for our index type. Parameters ---------- - values : set or list-like - Sought values. + item : scalar item to coerce + """ + dtype = self.dtype - .. versionadded:: 0.18.1 + if self._is_numeric_dtype and isna(item): + # We can't coerce to the numeric dtype of "self" (unless + # it's float) if there are NaN values in our output. + dtype = None - Support for values as a set. + return Index([item], dtype=dtype, **self._get_attributes_dict()) - level : str or int, optional - Name or position of the index level to use (if the index is a - `MultiIndex`). + def _to_safe_for_reshape(self): + """ + Convert to object if we are a categorical. + """ + return self - Returns - ------- - is_contained : ndarray - NumPy array of boolean values. + def _convert_for_op(self, value): + """ + Convert value to be insertable to ndarray. + """ + return value - See Also - -------- - Series.isin : Same for Series. - DataFrame.isin : Same method for DataFrames. 
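The int64-then-uint64 fallback in ``_try_convert_to_int_index`` above is one of the paths by which plain Python integers too large for int64 still land in an unsigned integer index; a small sketch of the intended outcome (the exact construction path depends on how NumPy infers the input dtype):

    >>> pd.Index([1, 2, 3])
    Int64Index([1, 2, 3], dtype='int64')
    >>> pd.Index([1, 2, 2**63])
    UInt64Index([1, 2, 9223372036854775808], dtype='uint64')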
+ def _assert_can_do_op(self, value): + """ + Check value is valid for scalar op. + """ + if not is_scalar(value): + msg = "'value' must be a scalar, passed: {0}" + raise TypeError(msg.format(type(value).__name__)) - Notes - ----- - In the case of `MultiIndex` you must either specify `values` as a - list-like object containing tuples that are the same length as the - number of levels, or specify `level`. Otherwise it will raise a - ``ValueError``. + @property + def _has_complex_internals(self): + # to disable groupby tricks in MultiIndex + return False - If `level` is specified: + def _is_memory_usage_qualified(self): + """ + Return a boolean if we need a qualified .info display. + """ + return self.is_object() - - if it is the name of one *and only one* index level, use that level; - - otherwise it should be a number indicating level position. + def is_type_compatible(self, kind): + return kind == self.inferred_type - Examples - -------- - >>> idx = pd.Index([1,2,3]) - >>> idx - Int64Index([1, 2, 3], dtype='int64') + _index_shared_docs['__contains__'] = """ + Return a boolean if this key is IN the index. - Check whether each index value in a list of values. - >>> idx.isin([1, 4]) - array([ True, False, False]) + Parameters + ---------- + key : object - >>> midx = pd.MultiIndex.from_arrays([[1,2,3], - ... ['red', 'blue', 'green']], - ... names=('number', 'color')) - >>> midx - MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], - labels=[[0, 1, 2], [2, 0, 1]], - names=['number', 'color']) + Returns + ------- + boolean + """ - Check whether the strings in the 'color' level of the MultiIndex - are in a list of colors. + @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) + def __contains__(self, key): + hash(key) + try: + return key in self._engine + except (OverflowError, TypeError, ValueError): + return False - >>> midx.isin(['red', 'orange', 'yellow'], level='color') - array([ True, False, False]) + _index_shared_docs['contains'] = """ + Return a boolean if this key is IN the index. - To check across the levels of a MultiIndex, pass a list of tuples: + Parameters + ---------- + key : object - >>> midx.isin([(1, 'red'), (3, 'red')]) - array([ True, False, False]) + Returns + ------- + boolean + """ - For a DatetimeIndex, string values in `values` are converted to - Timestamps. + @Appender(_index_shared_docs['contains'] % _index_doc_kwargs) + def contains(self, key): + hash(key) + try: + return key in self._engine + except (TypeError, ValueError): + return False - >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] - >>> dti = pd.to_datetime(dates) - >>> dti - DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], - dtype='datetime64[ns]', freq=None) + def __hash__(self): + raise TypeError("unhashable type: %r" % type(self).__name__) - >>> dti.isin(['2000-03-11']) - array([ True, False, False]) + def __setitem__(self, key, value): + raise TypeError("Index does not support mutable operations") + + def __getitem__(self, key): """ - if level is not None: - self._validate_index_level(level) - return algos.isin(self, values) + Override numpy.ndarray's __getitem__ method to work as desired. + + This function adds lists and Series as valid boolean indexers + (ndarrays only supports ndarray with dtype=bool). + + If resulting ndim != 1, plain ndarray is returned instead of + corresponding `Index` subclass. 
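The `__contains__`/`contains` pair defined above deliberately swallows `TypeError`/`ValueError`/`OverflowError` from the engine, so membership checks on hashable but mismatched keys return False instead of raising (unhashable keys still raise from `hash(key)`). For example:

```python
import pandas as pd

idx = pd.Index([1, 2, 3])

print(2 in idx)          # True  -- found via the engine
print('a' in idx)        # False -- wrong-typed key is swallowed, not raised
print(idx.contains(2))   # True  -- method form (later pandas deprecates it)
# idx.__contains__([])   # would raise TypeError: lists are unhashable
```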
- def _can_reindex(self, indexer): """ - *this is an internal non-public method* + # There's no custom logic to be implemented in __getslice__, so it's + # not overloaded intentionally. + getitem = self._data.__getitem__ + promote = self._shallow_copy - Check if we are allowing reindexing with this particular indexer + if is_scalar(key): + key = com.cast_scalar_indexer(key) + return getitem(key) - Parameters - ---------- - indexer : an integer indexer + if isinstance(key, slice): + # This case is separated from the conditional above to avoid + # pessimization of basic indexing. + return promote(getitem(key)) - Raises - ------ - ValueError if its a duplicate axis + if com.is_bool_indexer(key): + key = np.asarray(key, dtype=bool) + + key = com.values_from_object(key) + result = getitem(key) + if not is_scalar(result): + return promote(result) + else: + return result + + def _can_hold_identifiers_and_holds_name(self, name): """ + Faster check for ``name in self`` when we know `name` is a Python + identifier (e.g. in NDFrame.__getattr__, which hits this to support + . key lookup). For indexes that can't hold identifiers (everything + but object & categorical) we just return False. - # trying to reindex on an axis with duplicates - if not self.is_unique and len(indexer): - raise ValueError("cannot reindex from a duplicate axis") + https://github.com/pandas-dev/pandas/issues/19764 + """ + if self.is_object() or self.is_categorical(): + return name in self + return False - def reindex(self, target, method=None, level=None, limit=None, - tolerance=None): + def append(self, other): """ - Create index with target's values (move/add/delete values as necessary) + Append a collection of Index options together. Parameters ---------- - target : an iterable + other : Index or list/tuple of indices Returns ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index - + appended : Index """ - # GH6552: preserve names when reindexing to non-named target - # (i.e. neither Index nor Series). - preserve_names = not hasattr(target, 'name') - - # GH7774: preserve dtype/tz if target is empty and not an Index. 
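To illustrate the `__getitem__` branches above (names here are just for the example): scalar keys return a plain element, while slices, boolean masks, and integer arrays are re-wrapped as an Index via `_shallow_copy`. Reprs vary by version:

```python
import numpy as np
import pandas as pd

idx = pd.Index(['a', 'b', 'c', 'd'])

print(idx[1])                            # 'b' -- scalar key, plain element
print(idx[1:3])                          # Index(['b', 'c'], dtype='object')
print(idx[[True, False, True, False]])   # Index(['a', 'c'], dtype='object')
print(idx[np.array([3, 0])])             # Index(['d', 'a'], dtype='object')
```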
- target = _ensure_has_len(target) # target may be an iterator - if not isinstance(target, Index) and len(target) == 0: - attrs = self._get_attributes_dict() - attrs.pop('freq', None) # don't preserve freq - values = self._data[:0] # appropriately-dtyped empty array - target = self._simple_new(values, dtype=self.dtype, **attrs) - else: - target = ensure_index(target) + to_concat = [self] - if level is not None: - if method is not None: - raise TypeError('Fill method not supported if level passed') - _, indexer, _ = self._join_level(target, level, how='right', - return_indexers=True) + if isinstance(other, (list, tuple)): + to_concat = to_concat + list(other) else: - if self.equals(target): - indexer = None - else: + to_concat.append(other) - if self.is_unique: - indexer = self.get_indexer(target, method=method, - limit=limit, - tolerance=tolerance) - else: - if method is not None or limit is not None: - raise ValueError("cannot reindex a non-unique index " - "with a method or limit") - indexer, missing = self.get_indexer_non_unique(target) + for obj in to_concat: + if not isinstance(obj, Index): + raise TypeError('all inputs must be Index') - if preserve_names and target.nlevels == 1 and target.name != self.name: - target = target.copy() - target.name = self.name + names = {obj.name for obj in to_concat} + name = None if len(names) > 1 else self.name - return target, indexer + return self._concat(to_concat, name) - def _reindex_non_unique(self, target): - """ - *this is an internal non-public method* + def _concat(self, to_concat, name): - Create a new index with target's values (move/add/delete values as - necessary) use with non-unique Index and a possibly non-unique target + typs = _concat.get_dtype_kinds(to_concat) - Parameters - ---------- - target : an iterable - - Returns - ------- - new_index : pd.Index - Resulting index - indexer : np.ndarray or None - Indices of output values in original index + if len(typs) == 1: + return self._concat_same_dtype(to_concat, name=name) + return _concat._concat_index_asobject(to_concat, name=name) + def _concat_same_dtype(self, to_concat, name): """ + Concatenate to_concat which has the same class. + """ + # must be overridden in specific classes + return _concat._concat_index_asobject(to_concat, name) - target = ensure_index(target) - indexer, missing = self.get_indexer_non_unique(target) - check = indexer != -1 - new_labels = self.take(indexer[check]) - new_indexer = None - - if len(missing): - length = np.arange(len(indexer)) - - missing = ensure_platform_int(missing) - missing_labels = target.take(missing) - missing_indexer = ensure_int64(length[~check]) - cur_labels = self.take(indexer[check]).values - cur_indexer = ensure_int64(length[check]) + def putmask(self, mask, value): + """ + Return a new Index of the values set with the mask. 
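A quick usage sketch of `append` as documented above: several indexes can be passed as a list or tuple, differing names collapse to `None` on the result, and mixed dtypes fall back to object (variable names are illustrative; reprs vary by version):

```python
import pandas as pd

a = pd.Index([1, 2], name='x')
b = pd.Index([3, 4], name='x')
c = pd.Index(['five'], name='y')

print(a.append(b))       # Int64Index([1, 2, 3, 4], dtype='int64', name='x')
print(a.append([b, c]))  # names differ and dtypes mix -> object dtype, name=None
# Index([1, 2, 3, 4, 'five'], dtype='object')
```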
- new_labels = np.empty(tuple([len(indexer)]), dtype=object) - new_labels[cur_indexer] = cur_labels - new_labels[missing_indexer] = missing_labels + See Also + -------- + numpy.ndarray.putmask + """ + values = self.values.copy() + try: + np.putmask(values, mask, self._convert_for_op(value)) + return self._shallow_copy(values) + except (ValueError, TypeError) as err: + if is_object_dtype(self): + raise err - # a unique indexer - if target.is_unique: + # coerces to object + return self.astype(object).putmask(mask, value) - # see GH5553, make sure we use the right indexer - new_indexer = np.arange(len(indexer)) - new_indexer[cur_indexer] = np.arange(len(cur_labels)) - new_indexer[missing_indexer] = -1 + def equals(self, other): + """ + Determines if two Index objects contain the same elements. + """ + if self.is_(other): + return True - # we have a non_unique selector, need to use the original - # indexer here - else: + if not isinstance(other, Index): + return False - # need to retake to have the same size as the indexer - indexer[~check] = -1 + if is_object_dtype(self) and not is_object_dtype(other): + # if other is not object, use other's logic for coercion + return other.equals(self) - # reset the new indexer to account for the new size - new_indexer = np.arange(len(self.take(indexer))) - new_indexer[~check] = -1 + try: + return array_equivalent(com.values_from_object(self), + com.values_from_object(other)) + except Exception: + return False - new_index = self._shallow_copy_with_infer(new_labels, freq=None) - return new_index, indexer, new_indexer + def identical(self, other): + """ + Similar to equals, but check that other comparable attributes are + also equal. + """ + return (self.equals(other) and + all((getattr(self, c, None) == getattr(other, c, None) + for c in self._comparables)) and + type(self) == type(other)) - _index_shared_docs['join'] = """ - *this is an internal non-public method* + def asof(self, label): + """ + Return the label from the index, or, if not present, the previous one. - Compute join_index and indexers to conform data - structures to the new index. + Assuming that the index is sorted, return the passed index label if it + is in the index, or return the previous index label if the passed one + is not in the index. Parameters ---------- - other : Index - how : {'left', 'right', 'inner', 'outer'} - level : int or level name, default None - return_indexers : boolean, default False - sort : boolean, default False - Sort the join keys lexicographically in the result Index. If False, - the order of the join keys depends on the join type (how keyword) - - .. versionadded:: 0.20.0 + label : object + The label up to which the method returns the latest index label. Returns ------- - join_index, (left_indexer, right_indexer) - """ + object + The passed label if it is in the index. The previous label if the + passed label is not in the sorted index or `NaN` if there is no + such label. - @Appender(_index_shared_docs['join']) - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - from .multi import MultiIndex - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + See Also + -------- + Series.asof : Return the latest value in a Series up to the + passed index. + merge_asof : Perform an asof merge (similar to left join but it + matches on nearest key rather than equal key). + Index.get_loc : An `asof` is a thin wrapper around `get_loc` + with method='pad'. 
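The difference between `equals` and `identical` above is worth spelling out: `equals` compares only the elements (dtype and metadata are ignored), while `identical` additionally requires matching attributes such as `name` and the same Index subclass. For example:

```python
import pandas as pd

left = pd.Index([1, 2, 3], name='a')
right = pd.Index([1.0, 2.0, 3.0], name='b')

print(left.equals(right))      # True  -- same elements; dtype/name ignored
print(left.identical(right))   # False -- name and index type differ
print(left.identical(pd.Index([1, 2, 3], name='a')))  # True
```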
- # try to figure out the join level - # GH3662 - if level is None and (self_is_mi or other_is_mi): + Examples + -------- + `Index.asof` returns the latest index label up to the passed label. - # have the same levels/names so a simple join - if self.names == other.names: - pass - else: - return self._join_multi(other, how=how, - return_indexers=return_indexers) + >>> idx = pd.Index(['2013-12-31', '2014-01-02', '2014-01-03']) + >>> idx.asof('2014-01-01') + '2013-12-31' - # join on the level - if level is not None and (self_is_mi or other_is_mi): - return self._join_level(other, level, how=how, - return_indexers=return_indexers) + If the label is in the index, the method returns the passed label. - other = ensure_index(other) + >>> idx.asof('2014-01-02') + '2014-01-02' - if len(other) == 0 and how in ('left', 'outer'): - join_index = self._shallow_copy() - if return_indexers: - rindexer = np.repeat(-1, len(join_index)) - return join_index, None, rindexer - else: - return join_index + If all of the labels in the index are later than the passed label, + NaN is returned. - if len(self) == 0 and how in ('right', 'outer'): - join_index = other._shallow_copy() - if return_indexers: - lindexer = np.repeat(-1, len(join_index)) - return join_index, lindexer, None - else: - return join_index + >>> idx.asof('1999-01-02') + nan - if self._join_precedence < other._join_precedence: - how = {'right': 'left', 'left': 'right'}.get(how, how) - result = other.join(self, how=how, level=level, - return_indexers=return_indexers) - if return_indexers: - x, y, z = result - result = x, z, y - return result + If the index is not sorted, an error is raised. - if not is_dtype_equal(self.dtype, other.dtype): - this = self.astype('O') - other = other.astype('O') - return this.join(other, how=how, return_indexers=return_indexers) + >>> idx_not_sorted = pd.Index(['2013-12-31', '2015-01-02', + ... '2014-01-03']) + >>> idx_not_sorted.asof('2013-12-31') + Traceback (most recent call last): + ValueError: index must be monotonic increasing or decreasing + """ + try: + loc = self.get_loc(label, method='pad') + except KeyError: + return self._na_value + else: + if isinstance(loc, slice): + loc = loc.indices(len(self))[-1] + return self[loc] - _validate_join_method(how) + def asof_locs(self, where, mask): + """ + Finds the locations (indices) of the labels from the index for + every entry in the `where` argument. - if not self.is_unique and not other.is_unique: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif not self.is_unique or not other.is_unique: - if self.is_monotonic and other.is_monotonic: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - else: - return self._join_non_unique(other, how=how, - return_indexers=return_indexers) - elif self.is_monotonic and other.is_monotonic: - try: - return self._join_monotonic(other, how=how, - return_indexers=return_indexers) - except TypeError: - pass + As in the `asof` function, if the label (a particular entry in + `where`) is not in the index, the latest index label upto the + passed label is chosen and its index returned. - if how == 'left': - join_index = self - elif how == 'right': - join_index = other - elif how == 'inner': - join_index = self.intersection(other) - elif how == 'outer': - join_index = self.union(other) + If all of the labels in the index are later than a label in `where`, + -1 is returned. 
- if sort: - join_index = join_index.sort_values() + `mask` is used to ignore NA values in the index during calculation. - if return_indexers: - if join_index is self: - lindexer = None - else: - lindexer = self.get_indexer(join_index) - if join_index is other: - rindexer = None - else: - rindexer = other.get_indexer(join_index) - return join_index, lindexer, rindexer - else: - return join_index + Parameters + ---------- + where : Index + An Index consisting of an array of timestamps. + mask : array-like + Array of booleans denoting where values in the original + data are not NA. - def _join_multi(self, other, how, return_indexers=True): - from .multi import MultiIndex - from pandas.core.reshape.merge import _restore_dropped_levels_multijoin + Returns + ------- + numpy.ndarray + An array of locations (indices) of the labels from the Index + which correspond to the return values of the `asof` function + for every element in `where`. + """ + locs = self.values[mask].searchsorted(where.values, side='right') + locs = np.where(locs > 0, locs - 1, 0) - # figure out join names - self_names = set(com._not_none(*self.names)) - other_names = set(com._not_none(*other.names)) - overlap = self_names & other_names + result = np.arange(len(self))[mask].take(locs) - # need at least 1 in common - if not overlap: - raise ValueError("cannot join with no overlapping index names") + first = mask.argmax() + result[(locs == 0) & (where.values < self.values[first])] = -1 - self_is_mi = isinstance(self, MultiIndex) - other_is_mi = isinstance(other, MultiIndex) + return result - if self_is_mi and other_is_mi: + def sort_values(self, return_indexer=False, ascending=True): + """ + Return a sorted copy of the index. - # Drop the non-matching levels from left and right respectively - ldrop_names = list(self_names - overlap) - rdrop_names = list(other_names - overlap) + Return a sorted copy of the index, and optionally return the indices + that sorted the index itself. - self_jnlevels = self.droplevel(ldrop_names) - other_jnlevels = other.droplevel(rdrop_names) + Parameters + ---------- + return_indexer : bool, default False + Should the indices that would sort the index be returned. + ascending : bool, default True + Should the index values be sorted in an ascending order. - # Join left and right - # Join on same leveled multi-index frames is supported - join_idx, lidx, ridx = self_jnlevels.join(other_jnlevels, how, - return_indexers=True) + Returns + ------- + sorted_index : pandas.Index + Sorted copy of the index. + indexer : numpy.ndarray, optional + The indices that the index itself was sorted by. - # Restore the dropped levels - # Returned index level order is - # common levels, ldrop_names, rdrop_names - dropped_names = ldrop_names + rdrop_names + See Also + -------- + pandas.Series.sort_values : Sort values of a Series. + pandas.DataFrame.sort_values : Sort values in a DataFrame. - levels, labels, names = ( - _restore_dropped_levels_multijoin(self, other, - dropped_names, - join_idx, - lidx, ridx)) + Examples + -------- + >>> idx = pd.Index([10, 100, 1, 1000]) + >>> idx + Int64Index([10, 100, 1, 1000], dtype='int64') - # Re-create the multi-index - multi_join_idx = MultiIndex(levels=levels, labels=labels, - names=names, verify_integrity=False) + Sort values in ascending order (default behavior). 
- multi_join_idx = multi_join_idx.remove_unused_levels() + >>> idx.sort_values() + Int64Index([1, 10, 100, 1000], dtype='int64') - return multi_join_idx, lidx, ridx + Sort values in descending order, and also get the indices `idx` was + sorted by. - jl = list(overlap)[0] + >>> idx.sort_values(ascending=False, return_indexer=True) + (Int64Index([1000, 100, 10, 1], dtype='int64'), array([3, 1, 0, 2])) + """ + _as = self.argsort() + if not ascending: + _as = _as[::-1] - # Case where only one index is multi - # make the indices into mi's that match - flip_order = False - if self_is_mi: - self, other = other, self - flip_order = True - # flip if join method is right or left - how = {'right': 'left', 'left': 'right'}.get(how, how) + sorted_index = self.take(_as) - level = other.names.index(jl) - result = self._join_level(other, level, how=how, - return_indexers=return_indexers) + if return_indexer: + return sorted_index, _as + else: + return sorted_index - if flip_order: - if isinstance(result, tuple): - return result[0], result[2], result[1] - return result + def sort(self, *args, **kwargs): + raise TypeError("cannot sort an Index object in-place, use " + "sort_values instead") - def _join_non_unique(self, other, how='left', return_indexers=False): - from pandas.core.reshape.merge import _get_join_indexers + def shift(self, periods=1, freq=None): + """ + Shift index by desired number of time frequency increments. - left_idx, right_idx = _get_join_indexers([self._ndarray_values], - [other._ndarray_values], - how=how, - sort=True) + This method is for shifting the values of datetime-like indexes + by a specified time increment a given number of times. - left_idx = ensure_platform_int(left_idx) - right_idx = ensure_platform_int(right_idx) + Parameters + ---------- + periods : int, default 1 + Number of periods (or increments) to shift by, + can be positive or negative. + freq : pandas.DateOffset, pandas.Timedelta or string, optional + Frequency increment to shift by. + If None, the index is shifted by its own `freq` attribute. + Offset aliases are valid strings, e.g., 'D', 'W', 'M' etc. - join_index = np.asarray(self._ndarray_values.take(left_idx)) - mask = left_idx == -1 - np.putmask(join_index, mask, other._ndarray_values.take(right_idx)) + Returns + ------- + pandas.Index + shifted index - join_index = self._wrap_joined_index(join_index, other) + See Also + -------- + Series.shift : Shift values of Series. - if return_indexers: - return join_index, left_idx, right_idx + Examples + -------- + Put the first 5 month starts of 2011 into an index. + + >>> month_starts = pd.date_range('1/1/2011', periods=5, freq='MS') + >>> month_starts + DatetimeIndex(['2011-01-01', '2011-02-01', '2011-03-01', '2011-04-01', + '2011-05-01'], + dtype='datetime64[ns]', freq='MS') + + Shift the index by 10 days. + + >>> month_starts.shift(10, freq='D') + DatetimeIndex(['2011-01-11', '2011-02-11', '2011-03-11', '2011-04-11', + '2011-05-11'], + dtype='datetime64[ns]', freq=None) + + The default value of `freq` is the `freq` attribute of the index, + which is 'MS' (month start) in this example. + + >>> month_starts.shift(10) + DatetimeIndex(['2011-11-01', '2011-12-01', '2012-01-01', '2012-02-01', + '2012-03-01'], + dtype='datetime64[ns]', freq='MS') + + Notes + ----- + This method is only implemented for datetime-like index classes, + i.e., DatetimeIndex, PeriodIndex and TimedeltaIndex. 
+ """ + raise NotImplementedError("Not supported for type %s" % + type(self).__name__) + + def argsort(self, *args, **kwargs): + """ + Return the integer indices that would sort the index. + + Parameters + ---------- + *args + Passed to `numpy.ndarray.argsort`. + **kwargs + Passed to `numpy.ndarray.argsort`. + + Returns + ------- + numpy.ndarray + Integer indices that would sort the index if used as + an indexer. + + See Also + -------- + numpy.argsort : Similar method for NumPy arrays. + Index.sort_values : Return sorted copy of Index. + + Examples + -------- + >>> idx = pd.Index(['b', 'a', 'd', 'c']) + >>> idx + Index(['b', 'a', 'd', 'c'], dtype='object') + + >>> order = idx.argsort() + >>> order + array([1, 0, 3, 2]) + + >>> idx[order] + Index(['a', 'b', 'c', 'd'], dtype='object') + """ + result = self.asi8 + if result is None: + result = np.array(self) + return result.argsort(*args, **kwargs) + + def get_value(self, series, key): + """ + Fast lookup of value from 1-dimensional ndarray. Only use this if you + know what you're doing. + """ + + # if we have something that is Index-like, then + # use this, e.g. DatetimeIndex + s = getattr(series, '_values', None) + if isinstance(s, (ExtensionArray, Index)) and is_scalar(key): + # GH 20882, 21257 + # Unify Index and ExtensionArray treatment + # First try to convert the key to a location + # If that fails, raise a KeyError if an integer + # index, otherwise, see if key is an integer, and + # try that + try: + iloc = self.get_loc(key) + return s[iloc] + except KeyError: + if (len(self) > 0 and + (self.holds_integer() or self.is_boolean())): + raise + elif is_integer(key): + return s[key] + + s = com.values_from_object(series) + k = com.values_from_object(key) + + k = self._convert_scalar_indexer(k, kind='getitem') + try: + return self._engine.get_value(s, k, + tz=getattr(series.dtype, 'tz', None)) + except KeyError as e1: + if len(self) > 0 and (self.holds_integer() or self.is_boolean()): + raise + + try: + return libindex.get_value_box(s, key) + except IndexError: + raise + except TypeError: + # generator/iterator-like + if is_iterator(key): + raise InvalidIndexError(key) + else: + raise e1 + except Exception: # pragma: no cover + raise e1 + except TypeError: + # python 3 + if is_scalar(key): # pragma: no cover + raise IndexError(key) + raise InvalidIndexError(key) + + def set_value(self, arr, key, value): + """ + Fast lookup of value from 1-dimensional ndarray. + + Notes + ----- + Only use this if you know what you're doing. + """ + self._engine.set_value(com.values_from_object(arr), + com.values_from_object(key), value) + + _index_shared_docs['get_indexer_non_unique'] = """ + Compute indexer and mask for new index given the current index. The + indexer should be then used as an input to ndarray.take to align the + current data to the new index. + + Parameters + ---------- + target : %(target_klass)s + + Returns + ------- + indexer : ndarray of int + Integers from 0 to n - 1 indicating that the index at these + positions matches the corresponding target values. Missing values + in the target are marked by -1. + missing : ndarray of int + An indexer into the target of the values not found. 
+ These correspond to the -1 in the indexer array + """ + + @Appender(_index_shared_docs['get_indexer_non_unique'] % _index_doc_kwargs) + def get_indexer_non_unique(self, target): + target = ensure_index(target) + if is_categorical(target): + target = target.astype(target.dtype.categories.dtype) + pself, ptarget = self._maybe_promote(target) + if pself is not self or ptarget is not target: + return pself.get_indexer_non_unique(ptarget) + + if self.is_all_dates: + self = Index(self.asi8) + tgt_values = target.asi8 else: - return join_index + tgt_values = target._ndarray_values - def _join_level(self, other, level, how='left', return_indexers=False, - keep_order=True): + indexer, missing = self._engine.get_indexer_non_unique(tgt_values) + return ensure_platform_int(indexer), missing + + def get_indexer_for(self, target, **kwargs): """ - The join method *only* affects the level of the resulting - MultiIndex. Otherwise it just exactly aligns the Index data to the - labels of the level in the MultiIndex. If `keep_order` == True, the - order of the data indexed by the MultiIndex will not be changed; - otherwise, it will tie out with `other`. + Guaranteed return of an indexer even when non-unique. + + This dispatches to get_indexer or get_indexer_nonunique + as appropriate. """ - from .multi import MultiIndex + if self.is_unique: + return self.get_indexer(target, **kwargs) + indexer, _ = self.get_indexer_non_unique(target, **kwargs) + return indexer - def _get_leaf_sorter(labels): - """ - returns sorter for the inner most level while preserving the - order of higher levels - """ - if labels[0].size == 0: - return np.empty(0, dtype='int64') + def _maybe_promote(self, other): + # A hack, but it works + from pandas import DatetimeIndex + if self.inferred_type == 'date' and isinstance(other, DatetimeIndex): + return DatetimeIndex(self), other + elif self.inferred_type == 'boolean': + if not is_object_dtype(self.dtype): + return self.astype('object'), other.astype('object') + return self, other - if len(labels) == 1: - lab = ensure_int64(labels[0]) - sorter, _ = libalgos.groupsort_indexer(lab, 1 + lab.max()) - return sorter + def groupby(self, values): + """ + Group the index labels by a given array of values. - # find indexers of beginning of each set of - # same-key labels w.r.t all but last level - tic = labels[0][:-1] != labels[0][1:] - for lab in labels[1:-1]: - tic |= lab[:-1] != lab[1:] + Parameters + ---------- + values : array + Values used to determine the groups. 
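A short sketch of what the two arrays returned by `get_indexer_non_unique` above contain (integer dtypes are platform-dependent):

```python
import pandas as pd

idx = pd.Index(['a', 'b', 'a', 'c'])                      # non-unique index
indexer, missing = idx.get_indexer_non_unique(['a', 'x'])

print(indexer)   # [ 0  2 -1] -- every match for 'a', then -1 for the missing 'x'
print(missing)   # [1]        -- position *in the target* that was not found
```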
- starts = np.hstack(([True], tic, [True])).nonzero()[0] - lab = ensure_int64(labels[-1]) - return lib.get_level_sorter(lab, ensure_int64(starts)) + Returns + ------- + groups : dict + {group name -> group labels} + """ - if isinstance(self, MultiIndex) and isinstance(other, MultiIndex): - raise TypeError('Join on level between two MultiIndex objects ' - 'is ambiguous') + # TODO: if we are a MultiIndex, we can do better + # that converting to tuples + from .multi import MultiIndex + if isinstance(values, MultiIndex): + values = values.values + values = ensure_categorical(values) + result = values._reverse_indexer() - left, right = self, other + # map to the label + result = {k: self.take(v) for k, v in compat.iteritems(result)} - flip_order = not isinstance(self, MultiIndex) - if flip_order: - left, right = right, left - how = {'right': 'left', 'left': 'right'}.get(how, how) + return result - level = left._get_level_number(level) - old_level = left.levels[level] + def map(self, mapper, na_action=None): + """ + Map values using input correspondence (a dict, Series, or function). - if not right.is_unique: - raise NotImplementedError('Index._join_level on non-unique index ' - 'is not implemented') + Parameters + ---------- + mapper : function, dict, or Series + Mapping correspondence. + na_action : {None, 'ignore'} + If 'ignore', propagate NA values, without passing them to the + mapping correspondence. - new_level, left_lev_indexer, right_lev_indexer = \ - old_level.join(right, how=how, return_indexers=True) + Returns + ------- + applied : Union[Index, MultiIndex], inferred + The output of the mapping function applied to the index. + If the function returns a tuple with more than one element + a MultiIndex will be returned. + """ - if left_lev_indexer is None: - if keep_order or len(left) == 0: - left_indexer = None - join_index = left - else: # sort the leaves - left_indexer = _get_leaf_sorter(left.labels[:level + 1]) - join_index = left[left_indexer] + from .multi import MultiIndex + new_values = super(Index, self)._map_values( + mapper, na_action=na_action) - else: - left_lev_indexer = ensure_int64(left_lev_indexer) - rev_indexer = lib.get_reverse_indexer(left_lev_indexer, - len(old_level)) + attributes = self._get_attributes_dict() - new_lev_labels = algos.take_nd(rev_indexer, left.labels[level], - allow_fill=False) + # we can return a MultiIndex + if new_values.size and isinstance(new_values[0], tuple): + if isinstance(self, MultiIndex): + names = self.names + elif attributes.get('name'): + names = [attributes.get('name')] * len(new_values[0]) + else: + names = None + return MultiIndex.from_tuples(new_values, + names=names) - new_labels = list(left.labels) - new_labels[level] = new_lev_labels + attributes['copy'] = False + if not new_values.size: + # empty + attributes['dtype'] = self.dtype - new_levels = list(left.levels) - new_levels[level] = new_level + return Index(new_values, **attributes) - if keep_order: # just drop missing values. o.w. keep order - left_indexer = np.arange(len(left), dtype=np.intp) - mask = new_lev_labels != -1 - if not mask.all(): - new_labels = [lab[mask] for lab in new_labels] - left_indexer = left_indexer[mask] + def isin(self, values, level=None): + """ + Return a boolean array where the index values are in `values`. 
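As the `map` docstring above notes, the result type is re-inferred from the mapper's output, and tuples promote the result to a MultiIndex. A small sketch (reprs vary by version):

```python
import pandas as pd

idx = pd.Index([1, 2, 3], name='n')
print(idx.map(lambda x: x * 10))
# Int64Index([10, 20, 30], dtype='int64', name='n')

# A mapper returning tuples yields a MultiIndex.
print(pd.Index(['a', 'b']).map(lambda x: (x, x.upper())))
# MultiIndex of [('a', 'A'), ('b', 'B')]
```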
- else: # tie out the order with other - if level == 0: # outer most level, take the fast route - ngroups = 1 + new_lev_labels.max() - left_indexer, counts = libalgos.groupsort_indexer( - new_lev_labels, ngroups) + Compute boolean array of whether each index value is found in the + passed set of values. The length of the returned boolean array matches + the length of the index. - # missing values are placed first; drop them! - left_indexer = left_indexer[counts[0]:] - new_labels = [lab[left_indexer] for lab in new_labels] + Parameters + ---------- + values : set or list-like + Sought values. - else: # sort the leaves - mask = new_lev_labels != -1 - mask_all = mask.all() - if not mask_all: - new_labels = [lab[mask] for lab in new_labels] + .. versionadded:: 0.18.1 - left_indexer = _get_leaf_sorter(new_labels[:level + 1]) - new_labels = [lab[left_indexer] for lab in new_labels] + Support for values as a set. - # left_indexers are w.r.t masked frame. - # reverse to original frame! - if not mask_all: - left_indexer = mask.nonzero()[0][left_indexer] + level : str or int, optional + Name or position of the index level to use (if the index is a + `MultiIndex`). - join_index = MultiIndex(levels=new_levels, labels=new_labels, - names=left.names, verify_integrity=False) + Returns + ------- + is_contained : ndarray + NumPy array of boolean values. - if right_lev_indexer is not None: - right_indexer = algos.take_nd(right_lev_indexer, - join_index.labels[level], - allow_fill=False) - else: - right_indexer = join_index.labels[level] + See Also + -------- + Series.isin : Same for Series. + DataFrame.isin : Same method for DataFrames. - if flip_order: - left_indexer, right_indexer = right_indexer, left_indexer + Notes + ----- + In the case of `MultiIndex` you must either specify `values` as a + list-like object containing tuples that are the same length as the + number of levels, or specify `level`. Otherwise it will raise a + ``ValueError``. - if return_indexers: - left_indexer = (None if left_indexer is None - else ensure_platform_int(left_indexer)) - right_indexer = (None if right_indexer is None - else ensure_platform_int(right_indexer)) - return join_index, left_indexer, right_indexer - else: - return join_index + If `level` is specified: - def _join_monotonic(self, other, how='left', return_indexers=False): - if self.equals(other): - ret_index = other if how == 'right' else self - if return_indexers: - return ret_index, None, None - else: - return ret_index + - if it is the name of one *and only one* index level, use that level; + - otherwise it should be a number indicating level position. 
- sv = self._ndarray_values - ov = other._ndarray_values + Examples + -------- + >>> idx = pd.Index([1,2,3]) + >>> idx + Int64Index([1, 2, 3], dtype='int64') - if self.is_unique and other.is_unique: - # We can perform much better than the general case - if how == 'left': - join_index = self - lidx = None - ridx = self._left_indexer_unique(sv, ov) - elif how == 'right': - join_index = other - lidx = self._left_indexer_unique(ov, sv) - ridx = None - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) - else: - if how == 'left': - join_index, lidx, ridx = self._left_indexer(sv, ov) - elif how == 'right': - join_index, ridx, lidx = self._left_indexer(ov, sv) - elif how == 'inner': - join_index, lidx, ridx = self._inner_indexer(sv, ov) - elif how == 'outer': - join_index, lidx, ridx = self._outer_indexer(sv, ov) - join_index = self._wrap_joined_index(join_index, other) + Check whether each index value in a list of values. + >>> idx.isin([1, 4]) + array([ True, False, False]) + + >>> midx = pd.MultiIndex.from_arrays([[1,2,3], + ... ['red', 'blue', 'green']], + ... names=('number', 'color')) + >>> midx + MultiIndex(levels=[[1, 2, 3], ['blue', 'green', 'red']], + labels=[[0, 1, 2], [2, 0, 1]], + names=['number', 'color']) - if return_indexers: - lidx = None if lidx is None else ensure_platform_int(lidx) - ridx = None if ridx is None else ensure_platform_int(ridx) - return join_index, lidx, ridx - else: - return join_index + Check whether the strings in the 'color' level of the MultiIndex + are in a list of colors. - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - return Index(joined, name=name) + >>> midx.isin(['red', 'orange', 'yellow'], level='color') + array([ True, False, False]) + + To check across the levels of a MultiIndex, pass a list of tuples: + + >>> midx.isin([(1, 'red'), (3, 'red')]) + array([ True, False, False]) + + For a DatetimeIndex, string values in `values` are converted to + Timestamps. + + >>> dates = ['2000-03-11', '2000-03-12', '2000-03-13'] + >>> dti = pd.to_datetime(dates) + >>> dti + DatetimeIndex(['2000-03-11', '2000-03-12', '2000-03-13'], + dtype='datetime64[ns]', freq=None) + + >>> dti.isin(['2000-03-11']) + array([ True, False, False]) + """ + if level is not None: + self._validate_index_level(level) + return algos.isin(self, values) def _get_string_slice(self, key, use_lhs=True, use_rhs=True): # this is for partial string indexing, @@ -4266,8 +4603,8 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): def _maybe_cast_indexer(self, key): """ - If we have a float key and are not a floating index - then try to cast to an int if equivalent + If we have a float key and are not a floating index, then try to cast + to an int if equivalent. """ if is_float(key) and not self.is_floating(): @@ -4281,9 +4618,8 @@ def _maybe_cast_indexer(self, key): def _validate_indexer(self, form, key, kind): """ - if we are positional indexer - validate that we have appropriate typed bounds - must be an integer + If we are positional indexer, validate that we have appropriate + typed bounds must be an integer. 
""" assert kind in ['ix', 'loc', 'getitem', 'iloc'] @@ -4493,7 +4829,7 @@ def slice_locs(self, start=None, end=None, step=None, kind=None): def delete(self, loc): """ - Make new Index with passed location(-s) deleted + Make new Index with passed location(-s) deleted. Returns ------- @@ -4503,8 +4839,9 @@ def delete(self, loc): def insert(self, loc, item): """ - Make new Index inserting new item at location. Follows - Python list.append semantics for negative values + Make new Index inserting new item at location. + + Follows Python list.append semantics for negative values. Parameters ---------- @@ -4522,7 +4859,7 @@ def insert(self, loc, item): def drop(self, labels, errors='raise'): """ - Make new Index with passed list of labels deleted + Make new Index with passed list of labels deleted. Parameters ---------- @@ -4550,190 +4887,8 @@ def drop(self, labels, errors='raise'): indexer = indexer[~mask] return self.delete(indexer) - _index_shared_docs['index_unique'] = ( - """ - Return unique values in the index. Uniques are returned in order - of appearance, this does NOT sort. - - Parameters - ---------- - level : int or str, optional, default None - Only return values from specified level (for MultiIndex) - - .. versionadded:: 0.23.0 - - Returns - ------- - Index without duplicates - - See Also - -------- - unique - Series.unique - """) - - @Appender(_index_shared_docs['index_unique'] % _index_doc_kwargs) - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) - result = super(Index, self).unique() - return self._shallow_copy(result) - - def drop_duplicates(self, keep='first'): - """ - Return Index with duplicate values removed. - - Parameters - ---------- - keep : {'first', 'last', ``False``}, default 'first' - - 'first' : Drop duplicates except for the first occurrence. - - 'last' : Drop duplicates except for the last occurrence. - - ``False`` : Drop all duplicates. - - Returns - ------- - deduplicated : Index - - See Also - -------- - Series.drop_duplicates : Equivalent method on Series. - DataFrame.drop_duplicates : Equivalent method on DataFrame. - Index.duplicated : Related method on Index, indicating duplicate - Index values. - - Examples - -------- - Generate an pandas.Index with duplicate values. - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama', 'hippo']) - - The `keep` parameter controls which duplicate values are removed. - The value 'first' keeps the first occurrence for each - set of duplicated entries. The default value of keep is 'first'. - - >>> idx.drop_duplicates(keep='first') - Index(['lama', 'cow', 'beetle', 'hippo'], dtype='object') - - The value 'last' keeps the last occurrence for each set of duplicated - entries. - - >>> idx.drop_duplicates(keep='last') - Index(['cow', 'beetle', 'lama', 'hippo'], dtype='object') - - The value ``False`` discards all sets of duplicated entries. - - >>> idx.drop_duplicates(keep=False) - Index(['cow', 'beetle', 'hippo'], dtype='object') - """ - return super(Index, self).drop_duplicates(keep=keep) - - def duplicated(self, keep='first'): - """ - Indicate duplicate index values. - - Duplicated values are indicated as ``True`` values in the resulting - array. Either all duplicates, all except the first, or all except the - last occurrence of duplicates can be indicated. - - Parameters - ---------- - keep : {'first', 'last', False}, default 'first' - The value or values in a set of duplicates to mark as missing. - - - 'first' : Mark duplicates as ``True`` except for the first - occurrence. 
- - 'last' : Mark duplicates as ``True`` except for the last - occurrence. - - ``False`` : Mark all duplicates as ``True``. - - Examples - -------- - By default, for each set of duplicated values, the first occurrence is - set to False and all others to True: - - >>> idx = pd.Index(['lama', 'cow', 'lama', 'beetle', 'lama']) - >>> idx.duplicated() - array([False, False, True, False, True]) - - which is equivalent to - - >>> idx.duplicated(keep='first') - array([False, False, True, False, True]) - - By using 'last', the last occurrence of each set of duplicated values - is set on False and all others on True: - - >>> idx.duplicated(keep='last') - array([ True, False, True, False, False]) - - By setting keep on ``False``, all duplicates are True: - - >>> idx.duplicated(keep=False) - array([ True, False, True, False, True]) - - Returns - ------- - numpy.ndarray - - See Also - -------- - pandas.Series.duplicated : Equivalent method on pandas.Series. - pandas.DataFrame.duplicated : Equivalent method on pandas.DataFrame. - pandas.Index.drop_duplicates : Remove duplicate values from Index. - """ - return super(Index, self).duplicated(keep=keep) - - _index_shared_docs['fillna'] = """ - Fill NA/NaN values with the specified value - - Parameters - ---------- - value : scalar - Scalar value to use to fill holes (e.g. 0). - This value cannot be a list-likes. - downcast : dict, default is None - a dict of item->dtype of what to downcast if possible, - or the string 'infer' which will try to downcast to an appropriate - equal type (e.g. float64 to int64 if possible) - - Returns - ------- - filled : %(klass)s - """ - - @Appender(_index_shared_docs['fillna']) - def fillna(self, value=None, downcast=None): - self._assert_can_do_op(value) - if self.hasnans: - result = self.putmask(self._isnan, value) - if downcast is None: - # no need to care metadata other than name - # because it can't have freq if - return Index(result, name=self.name) - return self._shallow_copy() - - _index_shared_docs['dropna'] = """ - Return Index without NA/NaN values - - Parameters - ---------- - how : {'any', 'all'}, default 'any' - If the Index is a MultiIndex, drop the value when any or all levels - are NaN. - - Returns - ------- - valid : Index - """ - - @Appender(_index_shared_docs['dropna']) - def dropna(self, how='any'): - if how not in ('any', 'all'): - raise ValueError("invalid how option: {0}".format(how)) - - if self.hasnans: - return self._shallow_copy(self.values[~self._isnan]) - return self._shallow_copy() + # -------------------------------------------------------------------- + # Generated Arithmetic, Comparison, and Unary Methods def _evaluate_with_timedelta_like(self, other, op): # Timedelta knows how to operate with np.array, so dispatch to that @@ -4762,7 +4917,9 @@ def _evaluate_with_datetime_like(self, other, op): @classmethod def _add_comparison_methods(cls): - """ add in comparison methods """ + """ + Add in comparison methods. + """ cls.__eq__ = _make_comparison_op(operator.eq, cls) cls.__ne__ = _make_comparison_op(operator.ne, cls) cls.__lt__ = _make_comparison_op(operator.lt, cls) @@ -4772,7 +4929,9 @@ def _add_comparison_methods(cls): @classmethod def _add_numeric_methods_add_sub_disabled(cls): - """ add in the numeric add/sub methods to disable """ + """ + Add in the numeric add/sub methods to disable. 
+ """ cls.__add__ = make_invalid_op('__add__') cls.__radd__ = make_invalid_op('__radd__') cls.__iadd__ = make_invalid_op('__iadd__') @@ -4782,7 +4941,9 @@ def _add_numeric_methods_add_sub_disabled(cls): @classmethod def _add_numeric_methods_disabled(cls): - """ add in numeric methods to disable other than add/sub """ + """ + Add in numeric methods to disable other than add/sub. + """ cls.__pow__ = make_invalid_op('__pow__') cls.__rpow__ = make_invalid_op('__rpow__') cls.__mul__ = make_invalid_op('__mul__') @@ -4802,12 +4963,15 @@ def _add_numeric_methods_disabled(cls): cls.__inv__ = make_invalid_op('__inv__') def _maybe_update_attributes(self, attrs): - """ Update Index attributes (e.g. freq) depending on op """ + """ + Update Index attributes (e.g. freq) depending on op. + """ return attrs def _validate_for_numeric_unaryop(self, op, opstr): - """ validate if we can perform a numeric unary operation """ - + """ + Validate if we can perform a numeric unary operation. + """ if not self._is_numeric_dtype: raise TypeError("cannot evaluate a numeric op " "{opstr} for type: {typ}" @@ -4815,10 +4979,12 @@ def _validate_for_numeric_unaryop(self, op, opstr): def _validate_for_numeric_binop(self, other, op): """ - return valid other, evaluate or raise TypeError - if we are not of the appropriate type + Return valid other; evaluate or raise TypeError if we are not of + the appropriate type. - internal method called by ops + Notes + ----- + This is an internal method called by ops. """ opstr = '__{opname}__'.format(opname=op.__name__) # if we are an inheritor of numeric, @@ -4858,30 +5024,35 @@ def _validate_for_numeric_binop(self, other, op): @classmethod def _add_numeric_methods_binary(cls): - """ add in numeric methods """ + """ + Add in numeric methods. + """ cls.__add__ = _make_arithmetic_op(operator.add, cls) cls.__radd__ = _make_arithmetic_op(ops.radd, cls) cls.__sub__ = _make_arithmetic_op(operator.sub, cls) cls.__rsub__ = _make_arithmetic_op(ops.rsub, cls) - cls.__mul__ = _make_arithmetic_op(operator.mul, cls) - cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) cls.__rpow__ = _make_arithmetic_op(ops.rpow, cls) cls.__pow__ = _make_arithmetic_op(operator.pow, cls) - cls.__mod__ = _make_arithmetic_op(operator.mod, cls) - cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) - cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) + cls.__truediv__ = _make_arithmetic_op(operator.truediv, cls) cls.__rtruediv__ = _make_arithmetic_op(ops.rtruediv, cls) if not compat.PY3: cls.__div__ = _make_arithmetic_op(operator.div, cls) cls.__rdiv__ = _make_arithmetic_op(ops.rdiv, cls) + # TODO: rmod? rdivmod? + cls.__mod__ = _make_arithmetic_op(operator.mod, cls) + cls.__floordiv__ = _make_arithmetic_op(operator.floordiv, cls) + cls.__rfloordiv__ = _make_arithmetic_op(ops.rfloordiv, cls) cls.__divmod__ = _make_arithmetic_op(divmod, cls) + cls.__mul__ = _make_arithmetic_op(operator.mul, cls) + cls.__rmul__ = _make_arithmetic_op(ops.rmul, cls) @classmethod def _add_numeric_methods_unary(cls): - """ add in numeric unary methods """ - + """ + Add in numeric unary methods. + """ def _make_evaluate_unary(op, opstr): def _evaluate_numeric_unary(self): @@ -4905,8 +5076,9 @@ def _add_numeric_methods(cls): @classmethod def _add_logical_methods(cls): - """ add in logical methods """ - + """ + Add in logical methods. 
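The binary arithmetic methods wired up above make a numeric Index broadcast scalar operations elementwise and return a new Index; `divmod` is expected to hand back a quotient/remainder pair. A minimal sketch (reprs vary by version):

```python
import pandas as pd

idx = pd.Index([5, 6, 7])

print(idx * 2)    # Int64Index([10, 12, 14], dtype='int64')
print(idx % 3)    # Int64Index([2, 0, 1], dtype='int64')
print(idx / 2)    # true division always produces floats
# Float64Index([2.5, 3.0, 3.5], dtype='float64')

quotient, remainder = divmod(idx, 3)   # a pair of indexes: ([1, 2, 2], [2, 0, 1])
```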
+ """ _doc = """ %(desc)s @@ -5010,7 +5182,9 @@ def logical_func(self, *args, **kwargs): @classmethod def _add_logical_methods_disabled(cls): - """ add in logical methods to disable """ + """ + Add in logical methods to disable. + """ cls.all = make_invalid_op('all') cls.any = make_invalid_op('any') @@ -5021,7 +5195,8 @@ def _add_logical_methods_disabled(cls): def ensure_index_from_sequences(sequences, names=None): - """Construct an index from sequences of data. + """ + Construct an index from sequences of data. A single sequence returns an Index. Many sequences returns a MultiIndex. @@ -5062,7 +5237,7 @@ def ensure_index_from_sequences(sequences, names=None): def ensure_index(index_like, copy=False): """ - Ensure that we have an index from some index-like object + Ensure that we have an index from some index-like object. Parameters ---------- @@ -5124,7 +5299,9 @@ def ensure_index(index_like, copy=False): def _ensure_has_len(seq): - """If seq is an iterator, put its values into a list.""" + """ + If seq is an iterator, put its values into a list. + """ try: len(seq) except TypeError: @@ -5135,7 +5312,7 @@ def _ensure_has_len(seq): def _trim_front(strings): """ - Trims zeros and decimal points + Trims zeros and decimal points. """ trimmed = strings while len(strings) > 0 and all(x[0] == ' ' for x in trimmed): diff --git a/pandas/core/indexes/category.py b/pandas/core/indexes/category.py index f05b0fdd4a3236..6d26894514a9c1 100644 --- a/pandas/core/indexes/category.py +++ b/pandas/core/indexes/category.py @@ -13,7 +13,7 @@ is_scalar) from pandas.core.dtypes.dtypes import CategoricalDtype from pandas.core.dtypes.generic import ABCCategorical, ABCSeries -from pandas.core.dtypes.missing import array_equivalent, isna +from pandas.core.dtypes.missing import isna from pandas.core import accessor from pandas.core.algorithms import take_1d @@ -94,6 +94,9 @@ def _engine_type(self): _attributes = ['name'] + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, categories=None, ordered=None, dtype=None, copy=False, name=None, fastpath=None): @@ -212,6 +215,8 @@ def _simple_new(cls, values, name=None, categories=None, ordered=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, values=None, categories=None, ordered=None, dtype=None, **kwargs): @@ -278,12 +283,17 @@ def equals(self, other): try: other = self._is_dtype_compat(other) - return array_equivalent(self._data, other) + if isinstance(other, type(self)): + other = other._data + return self._data.equals(other) except (TypeError, ValueError): pass return False + # -------------------------------------------------------------------- + # Rendering Methods + @property def _formatter_func(self): return self.categories._formatter_func @@ -307,6 +317,8 @@ def _format_attrs(self): attrs.append(('length', len(self))) return attrs + # -------------------------------------------------------------------- + @property def inferred_type(self): return 'categorical' @@ -522,12 +534,16 @@ def reindex(self, target, method=None, level=None, limit=None, target = ibase.ensure_index(target) - if not is_categorical_dtype(target) and not target.is_unique: - raise ValueError("cannot reindex with a non-unique indexer") + if self.equals(target): + indexer = None + missing = [] + else: + if not target.is_unique: + raise ValueError("cannot reindex with a non-unique indexer") - 
indexer, missing = self.get_indexer_non_unique(np.array(target)) + indexer, missing = self.get_indexer_non_unique(np.array(target)) - if len(self.codes): + if len(self.codes) and indexer is not None: new_target = self.take(indexer) else: new_target = target diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 1179f6f39d06c9..8bb9d0b9611ede 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -2,13 +2,12 @@ """ Base and utility classes for tseries type pandas objects. """ +import operator import warnings import numpy as np from pandas._libs import NaT, iNaT, lib -from pandas._libs.tslibs.timestamps import RoundTo, round_nsint64 -import pandas.compat as compat from pandas.compat.numpy import function as nv from pandas.errors import AbstractMethodError from pandas.util._decorators import Appender, cache_readonly @@ -19,11 +18,11 @@ is_integer, is_integer_dtype, is_list_like, is_object_dtype, is_period_dtype, is_scalar, is_string_dtype) from pandas.core.dtypes.generic import ABCIndex, ABCIndexClass, ABCSeries -from pandas.core.dtypes.missing import isna from pandas.core import algorithms, ops -from pandas.core.arrays import PeriodArray -from pandas.core.arrays.datetimelike import DatetimeLikeArrayMixin +from pandas.core.accessor import PandasDelegate +from pandas.core.arrays.datetimelike import ( + DatetimeLikeArrayMixin, _ensure_datetimelike_to_i8) import pandas.core.indexes.base as ibase from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.tools.timedeltas import to_timedelta @@ -33,184 +32,6 @@ _index_doc_kwargs = dict(ibase._index_doc_kwargs) -class DatelikeOps(object): - """ common ops for DatetimeIndex/PeriodIndex, but not TimedeltaIndex """ - - def strftime(self, date_format): - return Index(self.format(date_format=date_format), - dtype=compat.text_type) - strftime.__doc__ = """ - Convert to Index using specified date_format. - - Return an Index of formatted strings specified by date_format, which - supports the same string format as the python standard library. Details - of the string format can be found in `python string format doc <{0}>`__ - - Parameters - ---------- - date_format : str - Date format string (e.g. "%Y-%m-%d"). - - Returns - ------- - Index - Index of formatted strings - - See Also - -------- - to_datetime : Convert the given argument to datetime. - DatetimeIndex.normalize : Return DatetimeIndex with times to midnight. - DatetimeIndex.round : Round the DatetimeIndex to the specified freq. - DatetimeIndex.floor : Floor the DatetimeIndex to the specified freq. - - Examples - -------- - >>> rng = pd.date_range(pd.Timestamp("2018-03-10 09:00"), - ... periods=3, freq='s') - >>> rng.strftime('%B %d, %Y, %r') - Index(['March 10, 2018, 09:00:00 AM', 'March 10, 2018, 09:00:01 AM', - 'March 10, 2018, 09:00:02 AM'], - dtype='object') - """.format("https://docs.python.org/3/library/datetime.html" - "#strftime-and-strptime-behavior") - - -class TimelikeOps(object): - """ common ops for TimedeltaIndex/DatetimeIndex, but not PeriodIndex """ - - _round_doc = ( - """ - Perform {op} operation on the data to the specified `freq`. - - Parameters - ---------- - freq : str or Offset - The frequency level to {op} the index to. Must be a fixed - frequency like 'S' (second) not 'ME' (month end). See - :ref:`frequency aliases ` for - a list of possible `freq` values. 
- ambiguous : 'infer', bool-ndarray, 'NaT', default 'raise' - Only relevant for DatetimeIndex: - - - 'infer' will attempt to infer fall dst-transition hours based on - order - - bool-ndarray where True signifies a DST time, False designates - a non-DST time (note that this flag is only applicable for - ambiguous times) - - 'NaT' will return NaT where there are ambiguous times - - 'raise' will raise an AmbiguousTimeError if there are ambiguous - times - - .. versionadded:: 0.24.0 - nonexistent : 'shift', 'NaT', default 'raise' - A nonexistent time does not exist in a particular timezone - where clocks moved forward due to DST. - - - 'shift' will shift the nonexistent time forward to the closest - existing time - - 'NaT' will return NaT where there are nonexistent times - - 'raise' will raise an NonExistentTimeError if there are - nonexistent times - - .. versionadded:: 0.24.0 - - Returns - ------- - DatetimeIndex, TimedeltaIndex, or Series - Index of the same type for a DatetimeIndex or TimedeltaIndex, - or a Series with the same index for a Series. - - Raises - ------ - ValueError if the `freq` cannot be converted. - - Examples - -------- - **DatetimeIndex** - - >>> rng = pd.date_range('1/1/2018 11:59:00', periods=3, freq='min') - >>> rng - DatetimeIndex(['2018-01-01 11:59:00', '2018-01-01 12:00:00', - '2018-01-01 12:01:00'], - dtype='datetime64[ns]', freq='T') - """) - - _round_example = ( - """>>> rng.round('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.round("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """) - - _floor_example = ( - """>>> rng.floor('H') - DatetimeIndex(['2018-01-01 11:00:00', '2018-01-01 12:00:00', - '2018-01-01 12:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.floor("H") - 0 2018-01-01 11:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 12:00:00 - dtype: datetime64[ns] - """ - ) - - _ceil_example = ( - """>>> rng.ceil('H') - DatetimeIndex(['2018-01-01 12:00:00', '2018-01-01 12:00:00', - '2018-01-01 13:00:00'], - dtype='datetime64[ns]', freq=None) - - **Series** - - >>> pd.Series(rng).dt.ceil("H") - 0 2018-01-01 12:00:00 - 1 2018-01-01 12:00:00 - 2 2018-01-01 13:00:00 - dtype: datetime64[ns] - """ - ) - - def _round(self, freq, mode, ambiguous, nonexistent): - # round the local times - values = _ensure_datetimelike_to_i8(self) - result = round_nsint64(values, mode, freq) - result = self._maybe_mask_results(result, fill_value=NaT) - - attribs = self._get_attributes_dict() - attribs['freq'] = None - if 'tz' in attribs: - attribs['tz'] = None - return self._ensure_localized( - self._shallow_copy(result, **attribs), ambiguous, nonexistent - ) - - @Appender((_round_doc + _round_example).format(op="round")) - def round(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round( - freq, RoundTo.NEAREST_HALF_EVEN, ambiguous, nonexistent - ) - - @Appender((_round_doc + _floor_example).format(op="floor")) - def floor(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round(freq, RoundTo.MINUS_INFTY, ambiguous, nonexistent) - - @Appender((_round_doc + _ceil_example).format(op="ceil")) - def ceil(self, freq, ambiguous='raise', nonexistent='raise'): - return self._round(freq, RoundTo.PLUS_INFTY, ambiguous, nonexistent) - - class DatetimeIndexOpsMixin(DatetimeLikeArrayMixin): """ common ops mixin to support a unified interface 
datetimelike Index """ @@ -257,7 +78,9 @@ def equals(self, other): @staticmethod def _join_i8_wrapper(joinf, dtype, with_indexers=True): - """ create the join wrapper methods """ + """ + Create the join wrapper methods. + """ @staticmethod def wrapper(left, right): @@ -287,7 +110,7 @@ def _evaluate_compare(self, other, op): def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', from_utc=False): """ - ensure that we are re-localized + Ensure that we are re-localized. This is for compat as we can then call this on all datetimelike indexes generally (ignored for Period/Timedelta) @@ -320,14 +143,11 @@ def _ensure_localized(self, arg, ambiguous='raise', nonexistent='raise', def _box_values_as_index(self): """ - return object Index which contains boxed values + Return object Index which contains boxed values. """ from pandas.core.index import Index return Index(self._box_values(self.asi8), name=self.name, dtype=object) - def _format_with_header(self, header, **kwargs): - return header + list(self._format_native_types(**kwargs)) - @Appender(_index_shared_docs['__contains__'] % _index_doc_kwargs) def __contains__(self, key): try: @@ -357,7 +177,7 @@ def map(self, f): def sort_values(self, return_indexer=False, ascending=True): """ - Return sorted copy of Index + Return sorted copy of Index. """ if return_indexer: _as = self.argsort() @@ -411,7 +231,8 @@ def take(self, indices, axis=0, allow_fill=True, @property def asobject(self): - """Return object Index which contains boxed values. + """ + Return object Index which contains boxed values. .. deprecated:: 0.23.0 Use ``astype(object)`` instead. @@ -431,7 +252,7 @@ def _convert_tolerance(self, tolerance, target): def tolist(self): """ - return a list of the underlying data + Return a list of the underlying data. """ return list(self.astype(object)) @@ -466,6 +287,7 @@ def min(self, axis=None, *args, **kwargs): def argmin(self, axis=None, *args, **kwargs): """ Returns the indices of the minimum values along an axis. + See `numpy.ndarray.argmin` for more information on the `axis` parameter. @@ -516,6 +338,7 @@ def max(self, axis=None, *args, **kwargs): def argmax(self, axis=None, *args, **kwargs): """ Returns the indices of the maximum values along an axis. + See `numpy.ndarray.argmax` for more information on the `axis` parameter. @@ -535,13 +358,19 @@ def argmax(self, axis=None, *args, **kwargs): i8[mask] = 0 return i8.argmax() + # -------------------------------------------------------------------- + # Rendering Methods + + def _format_with_header(self, header, **kwargs): + return header + list(self._format_native_types(**kwargs)) + @property def _formatter_func(self): raise AbstractMethodError(self) def _format_attrs(self): """ - Return a list of tuples of the (attr,formatted_value) + Return a list of tuples of the (attr,formatted_value). """ attrs = super(DatetimeIndexOpsMixin, self)._format_attrs() for attrib in self._attributes: @@ -552,10 +381,12 @@ def _format_attrs(self): attrs.append(('freq', freq)) return attrs + # -------------------------------------------------------------------- + def _convert_scalar_indexer(self, key, kind=None): """ - we don't allow integer or float indexing on datetime-like when using - loc + We don't allow integer or float indexing on datetime-like when using + loc. 
Parameters ---------- @@ -581,8 +412,8 @@ def _convert_scalar_indexer(self, key, kind=None): @classmethod def _add_datetimelike_methods(cls): """ - add in the datetimelike methods (as we may have to override the - superclass) + Add in the datetimelike methods (as we may have to override the + superclass). """ def __add__(self, other): @@ -613,7 +444,7 @@ def __rsub__(self, other): def isin(self, values): """ Compute boolean array of whether each index value is found in the - passed set of values + passed set of values. Parameters ---------- @@ -633,7 +464,7 @@ def isin(self, values): def repeat(self, repeats, *args, **kwargs): """ - Analogous to ndarray.repeat + Analogous to ndarray.repeat. """ nv.validate_repeat(args, kwargs) if is_period_dtype(self): @@ -654,7 +485,7 @@ def where(self, cond, other=None): def _summary(self, name=None): """ - Return a summarized representation + Return a summarized representation. Parameters ---------- @@ -685,7 +516,7 @@ def _summary(self, name=None): def _concat_same_dtype(self, to_concat, name): """ - Concatenate to_concat which has the same class + Concatenate to_concat which has the same class. """ attribs = self._get_attributes_dict() attribs['name'] = name @@ -740,43 +571,16 @@ def _time_shift(self, periods, freq=None): return result -def _ensure_datetimelike_to_i8(other, to_utc=False): - """ - helper for coercing an input scalar or array to i8 - - Parameters - ---------- - other : 1d array - to_utc : bool, default False - If True, convert the values to UTC before extracting the i8 values - If False, extract the i8 values directly. - - Returns - ------- - i8 1d array - """ - if is_scalar(other) and isna(other): - return iNaT - elif isinstance(other, (PeriodArray, ABCIndexClass)): - # convert tz if needed - if getattr(other, 'tz', None) is not None: - if to_utc: - other = other.tz_convert('UTC') - else: - other = other.tz_localize(None) - else: - try: - return np.array(other, copy=False).view('i8') - except TypeError: - # period array cannot be coerced to int - other = Index(other) - return other.asi8 - - def wrap_arithmetic_op(self, other, result): if result is NotImplemented: return NotImplemented + if isinstance(result, tuple): + # divmod, rdivmod + assert len(result) == 2 + return (wrap_arithmetic_op(self, other, result[0]), + wrap_arithmetic_op(self, other, result[1])) + if not isinstance(result, Index): # Index.__new__ will choose appropriate subclass for dtype result = Index(result) @@ -841,3 +645,48 @@ def f(self): f.__name__ = fget.__name__ f.__doc__ = fget.__doc__ return property(f) + + +class DatetimelikeDelegateMixin(PandasDelegate): + """ + Delegation mechanism, specific for Datetime, Timedelta, and Period types. + + Functionality is delegated from the Index class to an Array class. A + few things can be customized + + * _delegate_class : type + The class being delegated to. + * _delegated_methods, delegated_properties : List + The list of property / method names being delagated. 
+ * raw_methods : Set + The set of methods whose results should should *not* be + boxed in an index, after being returned from the array + * raw_properties : Set + The set of properties whose results should should *not* be + boxed in an index, after being returned from the array + """ + # raw_methods : dispatch methods that shouldn't be boxed in an Index + _raw_methods = set() + # raw_properties : dispatch properties that shouldn't be boxed in an Index + _raw_properties = set() + name = None + _data = None + + @property + def _delegate_class(self): + raise AbstractMethodError + + def _delegate_property_get(self, name, *args, **kwargs): + result = getattr(self._data, name) + if name not in self._raw_properties: + result = Index(result, name=self.name) + return result + + def _delegate_property_set(self, name, value, *args, **kwargs): + setattr(self._data, name, value) + + def _delegate_method(self, name, *args, **kwargs): + result = operator.methodcaller(name, *args, **kwargs)(self._data) + if name not in self._raw_methods: + result = Index(result, name=self.name) + return result diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 1ba14ffce383be..b778b2132cd96e 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -16,22 +16,22 @@ from pandas.core.dtypes.common import ( _INT64_DTYPE, _NS_DTYPE, ensure_int64, is_datetime64_dtype, - is_datetime64_ns_dtype, is_datetimetz, is_dtype_equal, is_float, - is_integer, is_integer_dtype, is_list_like, is_period_dtype, is_scalar, - is_string_like, pandas_dtype) + is_datetime64_ns_dtype, is_datetime64tz_dtype, is_dtype_equal, is_float, + is_integer, is_list_like, is_object_dtype, is_period_dtype, is_scalar, + is_string_dtype, is_string_like, pandas_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.generic import ABCSeries from pandas.core.dtypes.missing import isna from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.datetimes import ( - DatetimeArrayMixin as DatetimeArray, _to_m8) + DatetimeArrayMixin as DatetimeArray, _to_m8, maybe_convert_dtype, + maybe_infer_tz, objects_to_datetime64ns) from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatelikeOps, DatetimeIndexOpsMixin, TimelikeOps, wrap_array_method, - wrap_field_accessor) + DatetimeIndexOpsMixin, wrap_array_method, wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name import pandas.core.tools.datetimes as tools @@ -49,14 +49,19 @@ def _new_DatetimeIndex(cls, d): # so need to localize tz = d.pop('tz', None) - result = cls.__new__(cls, verify_integrity=False, **d) + with warnings.catch_warnings(): + # we ignore warnings from passing verify_integrity=False + # TODO: If we knew what was going in to **d, we might be able to + # go through _simple_new instead + warnings.simplefilter("ignore") + result = cls.__new__(cls, verify_integrity=False, **d) + if tz is not None: result = result.tz_localize('UTC').tz_convert(tz) return result -class DatetimeIndex(DatetimeArray, DatelikeOps, TimelikeOps, - DatetimeIndexOpsMixin, Int64Index): +class DatetimeIndex(DatetimeArray, DatetimeIndexOpsMixin, Int64Index): """ Immutable ndarray of datetime64 data, represented internally as int64, and which can be boxed to Timestamp objects that are subclasses of datetime and @@ -219,10 +224,20 @@ def __new__(cls, 
data=None, freq=None, start=None, end=None, periods=None, tz=None, normalize=False, closed=None, ambiguous='raise', dayfirst=False, yearfirst=False, dtype=None, - copy=False, name=None, verify_integrity=True): + copy=False, name=None, verify_integrity=None): + + if verify_integrity is not None: + warnings.warn("The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, stacklevel=2) + else: + verify_integrity = True if data is None: - # TODO: Remove this block and associated kwargs; GH#20535 + warnings.warn("Creating a DatetimeIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.date_range` instead.", + FutureWarning, stacklevel=2) result = cls._generate_range(start, end, periods, freq=freq, tz=tz, normalize=normalize, closed=closed, ambiguous=ambiguous) @@ -246,50 +261,57 @@ def __new__(cls, data=None, name = data.name freq, freq_infer = dtl.maybe_infer_freq(freq) + if freq is None and hasattr(data, "freq"): + # i.e. DatetimeArray/Index + freq = data.freq + verify_integrity = False # if dtype has an embedded tz, capture it tz = dtl.validate_tz_from_dtype(dtype, tz) - if not isinstance(data, (np.ndarray, Index, ABCSeries, DatetimeArray)): - # other iterable of some kind - if not isinstance(data, (list, tuple)): + if not hasattr(data, "dtype"): + # e.g. list, tuple + if np.ndim(data) == 0: + # i.e. generator data = list(data) - data = np.asarray(data, dtype='O') + data = np.asarray(data) + copy = False elif isinstance(data, ABCSeries): data = data._values - # data must be Index or np.ndarray here - if not (is_datetime64_dtype(data) or is_datetimetz(data) or - is_integer_dtype(data) or lib.infer_dtype(data) == 'integer'): - data = tools.to_datetime(data, dayfirst=dayfirst, - yearfirst=yearfirst) - - if isinstance(data, DatetimeArray): - if tz is None: - tz = data.tz - elif data.tz is None: - data = data.tz_localize(tz, ambiguous=ambiguous) - else: - # the tz's must match - if not timezones.tz_compare(tz, data.tz): - msg = ('data is already tz-aware {0}, unable to ' - 'set specified tz: {1}') - raise TypeError(msg.format(data.tz, tz)) + # By this point we are assured to have either a numpy array or Index + data, copy = maybe_convert_dtype(data, copy) + if is_object_dtype(data) or is_string_dtype(data): + # TODO: We do not have tests specific to string-dtypes, + # also complex or categorical or other extension + copy = False + if lib.infer_dtype(data) == 'integer': + data = data.astype(np.int64) + else: + # data comes back here as either i8 to denote UTC timestamps + # or M8[ns] to denote wall times + data, inferred_tz = objects_to_datetime64ns( + data, dayfirst=dayfirst, yearfirst=yearfirst) + tz = maybe_infer_tz(tz, inferred_tz) + + if is_datetime64tz_dtype(data): + tz = maybe_infer_tz(tz, data.tz) subarr = data._data - if freq is None: - freq = data.freq - verify_integrity = False - elif issubclass(data.dtype.type, np.datetime64): + elif is_datetime64_dtype(data): + # tz-naive DatetimeArray/Index or ndarray[datetime64] + data = getattr(data, "_data", data) if data.dtype != _NS_DTYPE: data = conversion.ensure_datetime64ns(data) + if tz is not None: # Convert tz-naive to UTC tz = timezones.maybe_get_tz(tz) data = conversion.tz_localize_to_utc(data.view('i8'), tz, ambiguous=ambiguous) subarr = data.view(_NS_DTYPE) + else: # must be integer dtype otherwise # assume this data are epoch timestamps @@ -373,22 +395,12 @@ def nbytes(self): # for TZ-aware return self._ndarray_values.nbytes - def _mpl_repr(self): - # how to represent 
ourselves to matplotlib - return libts.ints_to_pydatetime(self.asi8, self.tz) - @cache_readonly def _is_dates_only(self): """Return a boolean if we are only dates (and don't have a timezone)""" from pandas.io.formats.format import _is_dates_only return _is_dates_only(self.values) and self.tz is None - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_datetime64 - formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) - return lambda x: "'%s'" % formatter(x, tz=self.tz) - def __reduce__(self): # we use a special reudce here because we need @@ -415,11 +427,6 @@ def __setstate__(self, state): self._freq = own_state[1] self._tz = timezones.tz_standardize(own_state[2]) - # provide numpy < 1.7 compat - if nd_state[2] == 'M8[us]': - new_state = np.ndarray.__reduce__(data.astype('M8[ns]')) - np.ndarray.__setstate__(data, new_state[2]) - else: # pragma: no cover data = np.empty(state) np.ndarray.__setstate__(data, state) @@ -445,6 +452,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs + # -------------------------------------------------------------------- + # Rendering Methods + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return libts.ints_to_pydatetime(self.asi8, self.tz) + def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): from pandas.io.formats.format import _get_format_datetime64_from_values format = _get_format_datetime64_from_values(self, date_format) @@ -454,124 +468,14 @@ def _format_native_types(self, na_rep='NaT', date_format=None, **kwargs): format=format, na_rep=na_rep) - @Appender(_index_shared_docs['astype']) - def astype(self, dtype, copy=True): - dtype = pandas_dtype(dtype) - if (is_datetime64_ns_dtype(dtype) and - not is_dtype_equal(dtype, self.dtype)): - # GH 18951: datetime64_ns dtype but not equal means different tz - new_tz = getattr(dtype, 'tz', None) - if getattr(self.dtype, 'tz', None) is None: - return self.tz_localize(new_tz) - return self.tz_convert(new_tz) - elif is_period_dtype(dtype): - return self.to_period(freq=dtype.freq) - return super(DatetimeIndex, self).astype(dtype, copy=copy) - - def _get_time_micros(self): - values = self.asi8 - if self.tz is not None and not timezones.is_utc(self.tz): - values = self._local_timestamps() - return fields.get_time_micros(values) - - def to_series(self, keep_tz=None, index=None, name=None): - """ - Create a Series with both index and values equal to the index keys - useful with map for returning an indexer based on an index - - Parameters - ---------- - keep_tz : optional, defaults False - Return the data keeping the timezone. - - If keep_tz is True: - - If the timezone is not set, the resulting - Series will have a datetime64[ns] dtype. - - Otherwise the Series will have an datetime64[ns, tz] dtype; the - tz will be preserved. - - If keep_tz is False: - - Series will have a datetime64[ns] dtype. TZ aware - objects will have the tz removed. - - .. versionchanged:: 0.24 - The default value will change to True in a future release. - You can set ``keep_tz=True`` to already obtain the future - behaviour and silence the warning. - - index : Index, optional - index of resulting Series. If None, defaults to original index - name : string, optional - name of resulting Series. 
If None, defaults to name of original - index - - Returns - ------- - Series - """ - from pandas import Series - - if index is None: - index = self._shallow_copy() - if name is None: - name = self.name - - if keep_tz is None and self.tz is not None: - warnings.warn("The default of the 'keep_tz' keyword will change " - "to True in a future release. You can set " - "'keep_tz=True' to obtain the future behaviour and " - "silence this warning.", FutureWarning, stacklevel=2) - keep_tz = False - elif keep_tz is False: - warnings.warn("Specifying 'keep_tz=False' is deprecated and this " - "option will be removed in a future release. If " - "you want to remove the timezone information, you " - "can do 'idx.tz_convert(None)' before calling " - "'to_series'.", FutureWarning, stacklevel=2) - - if keep_tz and self.tz is not None: - # preserve the tz & copy - values = self.copy(deep=True) - else: - values = self.values.copy() - - return Series(values, index=index, name=name) - - def snap(self, freq='S'): - """ - Snap time stamps to nearest occurring frequency - """ - # Superdumb, punting on any optimizing - freq = to_offset(freq) - - snapped = np.empty(len(self), dtype=_NS_DTYPE) - - for i, v in enumerate(self): - s = v - if not freq.onOffset(s): - t0 = freq.rollback(s) - t1 = freq.rollforward(s) - if abs(s - t0) < abs(t1 - s): - s = t0 - else: - s = t1 - snapped[i] = s - - # we know it conforms; skip check - return DatetimeIndex(snapped, freq=freq, verify_integrity=False) - # TODO: what about self.name? if so, use shallow_copy? - - def unique(self, level=None): - if level is not None: - self._validate_index_level(level) + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_datetime64 + formatter = _get_format_datetime64(is_dates_only=self._is_dates_only) + return lambda x: "'%s'" % formatter(x, tz=self.tz) - # TODO(DatetimeArray): change dispatch once inheritance is removed - # call DatetimeArray method - result = DatetimeArray.unique(self) - return self._shallow_copy(result._data) + # -------------------------------------------------------------------- + # Set Operation Methods def union(self, other): """ @@ -640,51 +544,6 @@ def union_many(self, others): return this - def join(self, other, how='left', level=None, return_indexers=False, - sort=False): - """ - See Index.join - """ - if (not isinstance(other, DatetimeIndex) and len(other) > 0 and - other.inferred_type not in ('floating', 'integer', 'mixed-integer', - 'mixed-integer-float', 'mixed')): - try: - other = DatetimeIndex(other) - except (TypeError, ValueError): - pass - - this, other = self._maybe_utc_convert(other) - return Index.join(this, other, how=how, level=level, - return_indexers=return_indexers, sort=sort) - - def _maybe_utc_convert(self, other): - this = self - if isinstance(other, DatetimeIndex): - if self.tz is not None: - if other.tz is None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - elif other.tz is not None: - raise TypeError('Cannot join tz-naive with tz-aware ' - 'DatetimeIndex') - - if not timezones.tz_compare(self.tz, other.tz): - this = self.tz_convert('UTC') - other = other.tz_convert('UTC') - return this, other - - def _wrap_joined_index(self, joined, other): - name = get_op_result_name(self, other) - if (isinstance(other, DatetimeIndex) and - self.freq == other.freq and - self._can_fast_union(other)): - joined = self._shallow_copy(joined) - joined.name = name - return joined - else: - tz = getattr(other, 'tz', None) - return 
self._simple_new(joined, name, tz=tz) - def _can_fast_union(self, other): if not isinstance(other, DatetimeIndex): return False @@ -811,6 +670,172 @@ def intersection(self, other): left_chunk = left.values[lslice] return self._shallow_copy(left_chunk) + # -------------------------------------------------------------------- + + @Appender(_index_shared_docs['astype']) + def astype(self, dtype, copy=True): + dtype = pandas_dtype(dtype) + if (is_datetime64_ns_dtype(dtype) and + not is_dtype_equal(dtype, self.dtype)): + # GH 18951: datetime64_ns dtype but not equal means different tz + new_tz = getattr(dtype, 'tz', None) + if getattr(self.dtype, 'tz', None) is None: + return self.tz_localize(new_tz) + return self.tz_convert(new_tz) + elif is_period_dtype(dtype): + return self.to_period(freq=dtype.freq) + return super(DatetimeIndex, self).astype(dtype, copy=copy) + + def _get_time_micros(self): + values = self.asi8 + if self.tz is not None and not timezones.is_utc(self.tz): + values = self._local_timestamps() + return fields.get_time_micros(values) + + def to_series(self, keep_tz=None, index=None, name=None): + """ + Create a Series with both index and values equal to the index keys + useful with map for returning an indexer based on an index + + Parameters + ---------- + keep_tz : optional, defaults False + Return the data keeping the timezone. + + If keep_tz is True: + + If the timezone is not set, the resulting + Series will have a datetime64[ns] dtype. + + Otherwise the Series will have an datetime64[ns, tz] dtype; the + tz will be preserved. + + If keep_tz is False: + + Series will have a datetime64[ns] dtype. TZ aware + objects will have the tz removed. + + .. versionchanged:: 0.24 + The default value will change to True in a future release. + You can set ``keep_tz=True`` to already obtain the future + behaviour and silence the warning. + + index : Index, optional + index of resulting Series. If None, defaults to original index + name : string, optional + name of resulting Series. If None, defaults to name of original + index + + Returns + ------- + Series + """ + from pandas import Series + + if index is None: + index = self._shallow_copy() + if name is None: + name = self.name + + if keep_tz is None and self.tz is not None: + warnings.warn("The default of the 'keep_tz' keyword will change " + "to True in a future release. You can set " + "'keep_tz=True' to obtain the future behaviour and " + "silence this warning.", FutureWarning, stacklevel=2) + keep_tz = False + elif keep_tz is False: + warnings.warn("Specifying 'keep_tz=False' is deprecated and this " + "option will be removed in a future release. If " + "you want to remove the timezone information, you " + "can do 'idx.tz_convert(None)' before calling " + "'to_series'.", FutureWarning, stacklevel=2) + + if keep_tz and self.tz is not None: + # preserve the tz & copy + values = self.copy(deep=True) + else: + values = self.values.copy() + + return Series(values, index=index, name=name) + + def snap(self, freq='S'): + """ + Snap time stamps to nearest occurring frequency + """ + # Superdumb, punting on any optimizing + freq = to_offset(freq) + + snapped = np.empty(len(self), dtype=_NS_DTYPE) + + for i, v in enumerate(self): + s = v + if not freq.onOffset(s): + t0 = freq.rollback(s) + t1 = freq.rollforward(s) + if abs(s - t0) < abs(t1 - s): + s = t0 + else: + s = t1 + snapped[i] = s + + # we know it conforms; skip check + return DatetimeIndex._simple_new(snapped, freq=freq) + # TODO: what about self.name? tz? if so, use shallow_copy? 
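The hunk above deprecates building a DatetimeIndex directly from range endpoints (and the
verify_integrity keyword) in favour of pandas.date_range. A minimal before/after sketch,
assuming the 0.24-style deprecation shown in the diff:

    import pandas as pd

    # Deprecated: passing start/end/periods to the constructor now emits a
    # FutureWarning before falling back to the range-generation code path.
    # idx = pd.DatetimeIndex(start='2018-01-01', end='2018-01-03', freq='D')

    # Supported: build the same index through the factory function.
    idx = pd.date_range(start='2018-01-01', end='2018-01-03', freq='D')
    # DatetimeIndex(['2018-01-01', '2018-01-02', '2018-01-03'],
    #               dtype='datetime64[ns]', freq='D')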
+ + def unique(self, level=None): + if level is not None: + self._validate_index_level(level) + + # TODO(DatetimeArray): change dispatch once inheritance is removed + # call DatetimeArray method + result = DatetimeArray.unique(self) + return self._shallow_copy(result._data) + + def join(self, other, how='left', level=None, return_indexers=False, + sort=False): + """ + See Index.join + """ + if (not isinstance(other, DatetimeIndex) and len(other) > 0 and + other.inferred_type not in ('floating', 'integer', 'mixed-integer', + 'mixed-integer-float', 'mixed')): + try: + other = DatetimeIndex(other) + except (TypeError, ValueError): + pass + + this, other = self._maybe_utc_convert(other) + return Index.join(this, other, how=how, level=level, + return_indexers=return_indexers, sort=sort) + + def _maybe_utc_convert(self, other): + this = self + if isinstance(other, DatetimeIndex): + if self.tz is not None: + if other.tz is None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + elif other.tz is not None: + raise TypeError('Cannot join tz-naive with tz-aware ' + 'DatetimeIndex') + + if not timezones.tz_compare(self.tz, other.tz): + this = self.tz_convert('UTC') + other = other.tz_convert('UTC') + return this, other + + def _wrap_joined_index(self, joined, other): + name = get_op_result_name(self, other) + if (isinstance(other, DatetimeIndex) and + self.freq == other.freq and + self._can_fast_union(other)): + joined = self._shallow_copy(joined) + joined.name = name + return joined + else: + tz = getattr(other, 'tz', None) + return self._simple_new(joined, name, tz=tz) + def _parsed_string_to_bounds(self, reso, parsed): """ Calculate datetime bounds for parsed time string and its resolution. @@ -1133,6 +1158,11 @@ def slice_indexer(self, start=None, end=None, step=None, kind=None): # -------------------------------------------------------------------- # Wrapping DatetimeArray + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + _timezone = cache_readonly(DatetimeArray._timezone.fget) is_normalized = cache_readonly(DatetimeArray.is_normalized.fget) _resolution = cache_readonly(DatetimeArray._resolution.fget) @@ -1512,9 +1542,13 @@ def date_range(start=None, end=None, periods=None, freq=None, tz=None, if freq is None and com._any_none(periods, start, end): freq = 'D' - return DatetimeIndex(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + result = DatetimeIndex._generate_range( + start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, + closed=closed, **kwargs) + + result.name = name + return result def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, @@ -1600,9 +1634,9 @@ def bdate_range(start=None, end=None, periods=None, freq='B', tz=None, 'weekmask are passed, got frequency {freq}').format(freq=freq) raise ValueError(msg) - return DatetimeIndex(start=start, end=end, periods=periods, - freq=freq, tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + return date_range(start=start, end=end, periods=periods, + freq=freq, tz=tz, normalize=normalize, name=name, + closed=closed, **kwargs) def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, @@ -1659,9 +1693,10 @@ def cdate_range(start=None, end=None, periods=None, freq='C', tz=None, holidays = kwargs.pop('holidays', []) weekmask = 
kwargs.pop('weekmask', 'Mon Tue Wed Thu Fri') freq = CDay(holidays=holidays, weekmask=weekmask) - return DatetimeIndex(start=start, end=end, periods=periods, freq=freq, - tz=tz, normalize=normalize, name=name, - closed=closed, **kwargs) + + return date_range(start=start, end=end, periods=periods, freq=freq, + tz=tz, normalize=normalize, name=name, + closed=closed, **kwargs) def _time_to_micros(time): diff --git a/pandas/core/indexes/interval.py b/pandas/core/indexes/interval.py index 1ebcf213ab0ebf..444f9e21b0bdcb 100644 --- a/pandas/core/indexes/interval.py +++ b/pandas/core/indexes/interval.py @@ -104,6 +104,7 @@ def _new_IntervalIndex(cls, d): summary="Immutable index of intervals that are closed on the same side.", name=_index_doc_kwargs['name'], versionadded="0.20.0", + extra_attributes="is_overlapping\n", extra_methods="contains\n", examples=textwrap.dedent("""\ Examples @@ -137,6 +138,9 @@ class IntervalIndex(IntervalMixin, Index): # Immutable, so we are able to cache computations like isna in '_mask' _mask = None + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, data, closed=None, dtype=None, copy=False, name=None, verify_integrity=True): @@ -168,6 +172,50 @@ def _simple_new(cls, array, name, closed=None): result._reset_identity() return result + @classmethod + @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) + def from_breaks(cls, breaks, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) + def from_arrays(cls, left, right, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray.from_arrays(left, right, closed, copy=copy, + dtype=dtype) + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) + def from_intervals(cls, data, closed=None, name=None, copy=False, + dtype=None): + msg = ('IntervalIndex.from_intervals is deprecated and will be ' + 'removed in a future version; Use IntervalIndex(...) 
instead') + warnings.warn(msg, FutureWarning, stacklevel=2) + with rewrite_exception("IntervalArray", cls.__name__): + array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) + + if name is None and isinstance(data, cls): + name = data.name + + return cls._simple_new(array, name=name) + + @classmethod + @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) + def from_tuples(cls, data, closed='right', name=None, copy=False, + dtype=None): + with rewrite_exception("IntervalArray", cls.__name__): + arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, + dtype=dtype) + return cls._simple_new(arr, name=name) + + # -------------------------------------------------------------------- + @Appender(_index_shared_docs['_shallow_copy']) def _shallow_copy(self, left=None, right=None, **kwargs): result = self._data._shallow_copy(left=left, right=right) @@ -231,48 +279,6 @@ def contains(self, key): except KeyError: return False - @classmethod - @Appender(_interval_shared_docs['from_breaks'] % _index_doc_kwargs) - def from_breaks(cls, breaks, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_breaks(breaks, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_arrays'] % _index_doc_kwargs) - def from_arrays(cls, left, right, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray.from_arrays(left, right, closed, copy=copy, - dtype=dtype) - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_intervals'] % _index_doc_kwargs) - def from_intervals(cls, data, closed=None, name=None, copy=False, - dtype=None): - msg = ('IntervalIndex.from_intervals is deprecated and will be ' - 'removed in a future version; Use IntervalIndex(...) instead') - warnings.warn(msg, FutureWarning, stacklevel=2) - with rewrite_exception("IntervalArray", cls.__name__): - array = IntervalArray(data, closed=closed, copy=copy, dtype=dtype) - - if name is None and isinstance(data, cls): - name = data.name - - return cls._simple_new(array, name=name) - - @classmethod - @Appender(_interval_shared_docs['from_tuples'] % _index_doc_kwargs) - def from_tuples(cls, data, closed='right', name=None, copy=False, - dtype=None): - with rewrite_exception("IntervalArray", cls.__name__): - arr = IntervalArray.from_tuples(data, closed=closed, copy=copy, - dtype=dtype) - return cls._simple_new(arr, name=name) - @Appender(_interval_shared_docs['to_tuples'] % dict( return_type="Index", examples=""" @@ -464,6 +470,61 @@ def is_unique(self): def is_non_overlapping_monotonic(self): return self._data.is_non_overlapping_monotonic + @property + def is_overlapping(self): + """ + Return True if the IntervalIndex has overlapping intervals, else False. + + Two intervals overlap if they share a common point, including closed + endpoints. Intervals that only have an open endpoint in common do not + overlap. + + .. versionadded:: 0.24.0 + + Returns + ------- + bool + Boolean indicating if the IntervalIndex has overlapping intervals. 
+ + Examples + -------- + >>> index = pd.IntervalIndex.from_tuples([(0, 2), (1, 3), (4, 5)]) + >>> index + IntervalIndex([(0, 2], (1, 3], (4, 5]], + closed='right', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that share closed endpoints overlap: + + >>> index = pd.interval_range(0, 3, closed='both') + >>> index + IntervalIndex([[0, 1], [1, 2], [2, 3]], + closed='both', + dtype='interval[int64]') + >>> index.is_overlapping + True + + Intervals that only have an open endpoint in common do not overlap: + + >>> index = pd.interval_range(0, 3, closed='left') + >>> index + IntervalIndex([[0, 1), [1, 2), [2, 3)], + closed='left', + dtype='interval[int64]') + >>> index.is_overlapping + False + + See Also + -------- + Interval.overlaps : Check whether two Interval objects overlap. + IntervalIndex.overlaps : Check an IntervalIndex elementwise for + overlaps. + """ + # GH 23309 + return self._engine.is_overlapping + @Appender(_index_shared_docs['_convert_scalar_indexer']) def _convert_scalar_indexer(self, key, kind=None): if kind == 'iloc': @@ -570,6 +631,10 @@ def _maybe_convert_i8(self, key): else: # DatetimeIndex/TimedeltaIndex key_dtype, key_i8 = key.dtype, Index(key.asi8) + if key.hasnans: + # convert NaT from it's i8 value to np.nan so it's not viewed + # as a valid value, maybe causing errors (e.g. is_overlapping) + key_i8 = key_i8.where(~key._isnan) # ensure consistency with IntervalIndex subtype subtype = self.dtype.subtype @@ -941,6 +1006,8 @@ def __getitem__(self, value): # scalar return result + # -------------------------------------------------------------------- + # Rendering Methods # __repr__ associated methods are based on MultiIndex def _format_with_header(self, header, **kwargs): @@ -997,6 +1064,8 @@ def _format_space(self): space = ' ' * (len(self.__class__.__name__) + 1) return "\n{space}".format(space=space) + # -------------------------------------------------------------------- + def argsort(self, *args, **kwargs): return np.lexsort((self.right, self.left)) diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index f9483b48b5261c..567834b04c1ca4 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -121,7 +121,7 @@ def _codes_to_ints(self, codes): class MultiIndex(Index): """ - A multi-level, or hierarchical, index object for pandas objects + A multi-level, or hierarchical, index object for pandas objects. Parameters ---------- @@ -200,6 +200,9 @@ class MultiIndex(Index): _comparables = ['names'] rename = Index.set_names + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, levels=None, labels=None, sortorder=None, names=None, dtype=None, copy=False, name=None, verify_integrity=True, _set_identity=True): @@ -275,10 +278,177 @@ def _verify_integrity(self, labels=None, levels=None): values=[value for value in level], level=i)) + @classmethod + def from_arrays(cls, arrays, sortorder=None, names=None): + """ + Convert arrays to MultiIndex + + Parameters + ---------- + arrays : list / sequence of array-likes + Each array-like gives one level's value for each data point. + len(arrays) is the number of levels. 
+ sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] + >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables. + """ + if not is_list_like(arrays): + raise TypeError("Input must be a list / sequence of array-likes.") + elif is_iterator(arrays): + arrays = list(arrays) + + # Check if lengths of all arrays are equal or not, + # raise ValueError, if not + for i in range(1, len(arrays)): + if len(arrays[i]) != len(arrays[i - 1]): + raise ValueError('all arrays must be same length') + + from pandas.core.arrays.categorical import _factorize_from_iterables + + labels, levels = _factorize_from_iterables(arrays) + if names is None: + names = [getattr(arr, "name", None) for arr in arrays] + + return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, + names=names, verify_integrity=False) + + @classmethod + def from_tuples(cls, tuples, sortorder=None, names=None): + """ + Convert list of tuples to MultiIndex + + Parameters + ---------- + tuples : list / sequence of tuple-likes + Each tuple is the index of one row/column. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level) + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> tuples = [(1, u'red'), (1, u'blue'), + (2, u'red'), (2, u'blue')] + >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex + MultiIndex.from_product : Make a MultiIndex from cartesian product + of iterables + """ + if not is_list_like(tuples): + raise TypeError('Input must be a list / sequence of tuple-likes.') + elif is_iterator(tuples): + tuples = list(tuples) + + if len(tuples) == 0: + if names is None: + msg = 'Cannot infer number of levels from empty list' + raise TypeError(msg) + arrays = [[]] * len(names) + elif isinstance(tuples, (np.ndarray, Index)): + if isinstance(tuples, Index): + tuples = tuples._values + + arrays = list(lib.tuples_to_object_array(tuples).T) + elif isinstance(tuples, list): + arrays = list(lib.to_object_array_tuples(tuples).T) + else: + arrays = lzip(*tuples) + + return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) + + @classmethod + def from_product(cls, iterables, sortorder=None, names=None): + """ + Make a MultiIndex from the cartesian product of multiple iterables + + Parameters + ---------- + iterables : list / sequence of iterables + Each iterable has unique labels for each level of the index. + sortorder : int or None + Level of sortedness (must be lexicographically sorted by that + level). + names : list / sequence of strings or None + Names for the levels in the index. + + Returns + ------- + index : MultiIndex + + Examples + -------- + >>> numbers = [0, 1, 2] + >>> colors = [u'green', u'purple'] + >>> pd.MultiIndex.from_product([numbers, colors], + names=['number', 'color']) + MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], + labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], + names=[u'number', u'color']) + + See Also + -------- + MultiIndex.from_arrays : Convert list of arrays to MultiIndex. + MultiIndex.from_tuples : Convert list of tuples to MultiIndex. 
+ """ + from pandas.core.arrays.categorical import _factorize_from_iterables + from pandas.core.reshape.util import cartesian_product + + if not is_list_like(iterables): + raise TypeError("Input must be a list / sequence of iterables.") + elif is_iterator(iterables): + iterables = list(iterables) + + labels, levels = _factorize_from_iterables(iterables) + labels = cartesian_product(labels) + return MultiIndex(levels, labels, sortorder=sortorder, names=names) + + # -------------------------------------------------------------------- + @property def levels(self): return self._levels + @property + def _values(self): + # We override here, since our parent uses _data, which we dont' use. + return self.values + + @property + def array(self): + """ + Raises a ValueError for `MultiIndex` because there's no single + array backing a MultiIndex. + + Raises + ------ + ValueError + """ + msg = ("MultiIndex has no single backing array. Use " + "'MultiIndex.to_numpy()' to get a NumPy array of tuples.") + raise ValueError(msg) + @property def _is_homogeneous_type(self): """Whether the levels of a MultiIndex all have the same dtype. @@ -622,6 +792,9 @@ def _nbytes(self, deep=False): result += self._engine.sizeof(deep=deep) return result + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr,formatted_value) @@ -644,6 +817,94 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + def _format_native_types(self, na_rep='nan', **kwargs): + new_levels = [] + new_labels = [] + + # go through the levels and format them + for level, label in zip(self.levels, self.labels): + level = level._format_native_types(na_rep=na_rep, **kwargs) + # add nan values, if there are any + mask = (label == -1) + if mask.any(): + nan_index = len(level) + level = np.append(level, na_rep) + label = label.values() + label[mask] = nan_index + new_levels.append(level) + new_labels.append(label) + + if len(new_levels) == 1: + return Index(new_levels[0])._format_native_types() + else: + # reconstruct the multi-index + mi = MultiIndex(levels=new_levels, labels=new_labels, + names=self.names, sortorder=self.sortorder, + verify_integrity=False) + return mi.values + + def format(self, space=2, sparsify=None, adjoin=True, names=False, + na_rep=None, formatter=None): + if len(self) == 0: + return [] + + stringified_levels = [] + for lev, lab in zip(self.levels, self.labels): + na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) + + if len(lev) > 0: + + formatted = lev.take(lab).format(formatter=formatter) + + # we have some NA + mask = lab == -1 + if mask.any(): + formatted = np.array(formatted, dtype=object) + formatted[mask] = na + formatted = formatted.tolist() + + else: + # weird all NA case + formatted = [pprint_thing(na if isna(x) else x, + escape_chars=('\t', '\r', '\n')) + for x in algos.take_1d(lev._values, lab)] + stringified_levels.append(formatted) + + result_levels = [] + for lev, name in zip(stringified_levels, self.names): + level = [] + + if names: + level.append(pprint_thing(name, + escape_chars=('\t', '\r', '\n')) + if name is not None else '') + + level.extend(np.array(lev, dtype=object)) + result_levels.append(level) + + if sparsify is None: + sparsify = get_option("display.multi_sparse") + + if sparsify: + sentinel = '' + # GH3547 + # use value of sparsify as sentinel, unless it's an obvious + # "Truthey" value + if sparsify not in [True, 1]: + sentinel = sparsify + # 
little bit of a kludge job for #1217 + result_levels = _sparsify(result_levels, start=int(names), + sentinel=sentinel) + + if adjoin: + from pandas.io.formats.format import _get_adjustment + adj = _get_adjustment() + return adj.adjoin(space, *result_levels).split('\n') + else: + return result_levels + + # -------------------------------------------------------------------- + def __len__(self): return len(self.labels[0]) @@ -705,32 +966,6 @@ def _set_names(self, names, level=None, validate=True): names = property(fset=_set_names, fget=_get_names, doc="Names of levels in MultiIndex") - def _format_native_types(self, na_rep='nan', **kwargs): - new_levels = [] - new_labels = [] - - # go through the levels and format them - for level, label in zip(self.levels, self.labels): - level = level._format_native_types(na_rep=na_rep, **kwargs) - # add nan values, if there are any - mask = (label == -1) - if mask.any(): - nan_index = len(level) - level = np.append(level, na_rep) - label = label.values() - label[mask] = nan_index - new_levels.append(level) - new_labels.append(label) - - if len(new_levels) == 1: - return Index(new_levels[0])._format_native_types() - else: - # reconstruct the multi-index - mi = MultiIndex(levels=new_levels, labels=new_labels, - names=self.names, sortorder=self.sortorder, - verify_integrity=False) - return mi.values - @Appender(_index_shared_docs['_get_grouper_for_level']) def _get_grouper_for_level(self, mapper, level): indexer = self.labels[level] @@ -1081,66 +1316,6 @@ def unique(self, level=None): level = self._get_level_number(level) return self._get_level_values(level=level, unique=True) - def format(self, space=2, sparsify=None, adjoin=True, names=False, - na_rep=None, formatter=None): - if len(self) == 0: - return [] - - stringified_levels = [] - for lev, lab in zip(self.levels, self.labels): - na = na_rep if na_rep is not None else _get_na_rep(lev.dtype.type) - - if len(lev) > 0: - - formatted = lev.take(lab).format(formatter=formatter) - - # we have some NA - mask = lab == -1 - if mask.any(): - formatted = np.array(formatted, dtype=object) - formatted[mask] = na - formatted = formatted.tolist() - - else: - # weird all NA case - formatted = [pprint_thing(na if isna(x) else x, - escape_chars=('\t', '\r', '\n')) - for x in algos.take_1d(lev._values, lab)] - stringified_levels.append(formatted) - - result_levels = [] - for lev, name in zip(stringified_levels, self.names): - level = [] - - if names: - level.append(pprint_thing(name, - escape_chars=('\t', '\r', '\n')) - if name is not None else '') - - level.extend(np.array(lev, dtype=object)) - result_levels.append(level) - - if sparsify is None: - sparsify = get_option("display.multi_sparse") - - if sparsify: - sentinel = '' - # GH3547 - # use value of sparsify as sentinel, unless it's an obvious - # "Truthey" value - if sparsify not in [True, 1]: - sentinel = sparsify - # little bit of a kludge job for #1217 - result_levels = _sparsify(result_levels, start=int(names), - sentinel=sentinel) - - if adjoin: - from pandas.io.formats.format import _get_adjustment - adj = _get_adjustment() - return adj.adjoin(space, *result_levels).split('\n') - else: - return result_levels - def _to_safe_for_reshape(self): """ convert to object if we are a categorical """ return self.set_levels([i._to_safe_for_reshape() for i in self.levels]) @@ -1195,14 +1370,14 @@ def to_frame(self, index=True, name=None): def to_hierarchical(self, n_repeat, n_shuffle=1): """ - .. 
deprecated:: 0.24.0 - Return a MultiIndex reshaped to conform to the shapes given by n_repeat and n_shuffle. Useful to replicate and rearrange a MultiIndex for combination with another Index with n_repeat items. + .. deprecated:: 0.24.0 + Parameters ---------- n_repeat : int @@ -1289,152 +1464,6 @@ def lexsort_depth(self): return 0 - @classmethod - def from_arrays(cls, arrays, sortorder=None, names=None): - """ - Convert arrays to MultiIndex - - Parameters - ---------- - arrays : list / sequence of array-likes - Each array-like gives one level's value for each data point. - len(arrays) is the number of levels. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> arrays = [[1, 1, 2, 2], ['red', 'blue', 'red', 'blue']] - >>> pd.MultiIndex.from_arrays(arrays, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables. - """ - if not is_list_like(arrays): - raise TypeError("Input must be a list / sequence of array-likes.") - elif is_iterator(arrays): - arrays = list(arrays) - - # Check if lengths of all arrays are equal or not, - # raise ValueError, if not - for i in range(1, len(arrays)): - if len(arrays[i]) != len(arrays[i - 1]): - raise ValueError('all arrays must be same length') - - from pandas.core.arrays.categorical import _factorize_from_iterables - - labels, levels = _factorize_from_iterables(arrays) - if names is None: - names = [getattr(arr, "name", None) for arr in arrays] - - return MultiIndex(levels=levels, labels=labels, sortorder=sortorder, - names=names, verify_integrity=False) - - @classmethod - def from_tuples(cls, tuples, sortorder=None, names=None): - """ - Convert list of tuples to MultiIndex - - Parameters - ---------- - tuples : list / sequence of tuple-likes - Each tuple is the index of one row/column. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level) - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> tuples = [(1, u'red'), (1, u'blue'), - (2, u'red'), (2, u'blue')] - >>> pd.MultiIndex.from_tuples(tuples, names=('number', 'color')) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex - MultiIndex.from_product : Make a MultiIndex from cartesian product - of iterables - """ - if not is_list_like(tuples): - raise TypeError('Input must be a list / sequence of tuple-likes.') - elif is_iterator(tuples): - tuples = list(tuples) - - if len(tuples) == 0: - if names is None: - msg = 'Cannot infer number of levels from empty list' - raise TypeError(msg) - arrays = [[]] * len(names) - elif isinstance(tuples, (np.ndarray, Index)): - if isinstance(tuples, Index): - tuples = tuples._values - - arrays = list(lib.tuples_to_object_array(tuples).T) - elif isinstance(tuples, list): - arrays = list(lib.to_object_array_tuples(tuples).T) - else: - arrays = lzip(*tuples) - - return MultiIndex.from_arrays(arrays, sortorder=sortorder, names=names) - - @classmethod - def from_product(cls, iterables, sortorder=None, names=None): - """ - Make a MultiIndex from the cartesian product of multiple iterables - - Parameters - ---------- - iterables : list / sequence of iterables - Each iterable has unique labels for each level of the index. - sortorder : int or None - Level of sortedness (must be lexicographically sorted by that - level). 
- names : list / sequence of strings or None - Names for the levels in the index. - - Returns - ------- - index : MultiIndex - - Examples - -------- - >>> numbers = [0, 1, 2] - >>> colors = [u'green', u'purple'] - >>> pd.MultiIndex.from_product([numbers, colors], - names=['number', 'color']) - MultiIndex(levels=[[0, 1, 2], [u'green', u'purple']], - labels=[[0, 0, 1, 1, 2, 2], [0, 1, 0, 1, 0, 1]], - names=[u'number', u'color']) - - See Also - -------- - MultiIndex.from_arrays : Convert list of arrays to MultiIndex. - MultiIndex.from_tuples : Convert list of tuples to MultiIndex. - """ - from pandas.core.arrays.categorical import _factorize_from_iterables - from pandas.core.reshape.util import cartesian_product - - if not is_list_like(iterables): - raise TypeError("Input must be a list / sequence of iterables.") - elif is_iterator(iterables): - iterables = list(iterables) - - labels, levels = _factorize_from_iterables(iterables) - labels = cartesian_product(labels) - return MultiIndex(levels, labels, sortorder=sortorder, names=names) - def _sort_levels_monotonic(self): """ .. versionadded:: 0.20.0 diff --git a/pandas/core/indexes/period.py b/pandas/core/indexes/period.py index fec3a9bd24cc85..71f55f9021eac8 100644 --- a/pandas/core/indexes/period.py +++ b/pandas/core/indexes/period.py @@ -1,6 +1,5 @@ # pylint: disable=E1101,E1103,W0232 from datetime import datetime, timedelta -import operator import warnings import numpy as np @@ -18,15 +17,16 @@ from pandas import compat from pandas.core import common as com -from pandas.core.accessor import PandasDelegate, delegate_names +from pandas.core.accessor import delegate_names from pandas.core.algorithms import unique1d import pandas.core.arrays.datetimelike as dtl +from pandas.core.arrays.datetimelike import DatelikeOps from pandas.core.arrays.period import PeriodArray, period_array from pandas.core.base import _shared_docs import pandas.core.indexes.base as ibase from pandas.core.indexes.base import _index_shared_docs, ensure_index from pandas.core.indexes.datetimelike import ( - DatelikeOps, DatetimeIndexOpsMixin, wrap_arithmetic_op) + DatetimeIndexOpsMixin, DatetimelikeDelegateMixin, wrap_arithmetic_op) from pandas.core.indexes.datetimes import DatetimeIndex, Index, Int64Index from pandas.core.missing import isna from pandas.core.ops import get_op_result_name @@ -54,37 +54,26 @@ def _new_PeriodIndex(cls, **d): return cls(values, **d) -class PeriodDelegateMixin(PandasDelegate): +class PeriodDelegateMixin(DatetimelikeDelegateMixin): """ Delegate from PeriodIndex to PeriodArray. 
""" - def _delegate_property_get(self, name, *args, **kwargs): - result = getattr(self._data, name) - box_ops = ( - set(PeriodArray._datetimelike_ops) - set(PeriodArray._bool_ops) - ) - if name in box_ops: - result = Index(result, name=self.name) - return result - - def _delegate_property_set(self, name, value, *args, **kwargs): - setattr(self._data, name, value) - - def _delegate_method(self, name, *args, **kwargs): - result = operator.methodcaller(name, *args, **kwargs)(self._data) - return Index(result, name=self.name) + _delegate_class = PeriodArray + _delegated_properties = PeriodArray._datetimelike_ops + _delegated_methods = ( + set(PeriodArray._datetimelike_methods) | {'_addsub_int_array'} + ) + _raw_properties = {'is_leap_year'} @delegate_names(PeriodArray, - PeriodArray._datetimelike_ops + ['size', 'asi8', 'shape'], + PeriodDelegateMixin._delegated_properties, typ='property') @delegate_names(PeriodArray, - [x for x in PeriodArray._datetimelike_methods - if x not in {"asfreq", "to_timestamp"}], - typ="method", - overwrite=True) -class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, - Int64Index, PeriodDelegateMixin): + PeriodDelegateMixin._delegated_methods, + typ="method") +class PeriodIndex(DatelikeOps, DatetimeIndexOpsMixin, Int64Index, + PeriodDelegateMixin): """ Immutable ndarray holding ordinal values indicating regular periods in time such as particular years, quarters, months, etc. @@ -349,27 +338,6 @@ def _maybe_box_as_values(self, values, **attribs): freq = attribs['freq'] return PeriodArray(values, freq=freq) - # ------------------------------------------------------------------------ - # Dispatch and maybe box. Not done in delegate_names because we box - # different from those (which use Index). - - def asfreq(self, freq=None, how='E'): - result = self._data.asfreq(freq=freq, how=how) - return self._simple_new(result, name=self.name) - - def to_timestamp(self, freq=None, how='start'): - from pandas import DatetimeIndex - result = self._data.to_timestamp(freq=freq, how=how) - return DatetimeIndex._simple_new(result.asi8, - name=self.name, - freq=result.freq) - - def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): - # just dispatch, return ndarray - return self._data._format_native_types(na_rep=na_rep, - quoting=quoting, - **kwargs) - def _maybe_convert_timedelta(self, other): """ Convert timedelta-like input to an integer multiple of self.freq @@ -412,6 +380,19 @@ def _maybe_convert_timedelta(self, other): raise IncompatibleFrequency(msg.format(cls=type(self).__name__, freqstr=self.freqstr)) + # ------------------------------------------------------------------------ + # Rendering Methods + + def _format_native_types(self, na_rep=u'NaT', quoting=None, **kwargs): + # just dispatch, return ndarray + return self._data._format_native_types(na_rep=na_rep, + quoting=quoting, + **kwargs) + + def _mpl_repr(self): + # how to represent ourselves to matplotlib + return self.astype(object).values + # ------------------------------------------------------------------------ # Indexing @@ -595,10 +576,6 @@ def is_full(self): values = self.asi8 return ((values[1:] - values[:-1]) < 2).all() - def _mpl_repr(self): - # how to represent ourselves to matplotlib - return self.astype(object).values - @property def inferred_type(self): # b/c data is represented as ints make sure we can't have ambiguous diff --git a/pandas/core/indexes/range.py b/pandas/core/indexes/range.py index 0d4e7aaebeca5f..364aadb9523f02 100644 --- a/pandas/core/indexes/range.py +++ 
b/pandas/core/indexes/range.py @@ -25,7 +25,6 @@ class RangeIndex(Int64Index): - """ Immutable Index implementing a monotonic integer range. @@ -64,6 +63,9 @@ class RangeIndex(Int64Index): _typ = 'rangeindex' _engine_type = libindex.Int64Engine + # -------------------------------------------------------------------- + # Constructors + def __new__(cls, start=None, stop=None, step=None, dtype=None, copy=False, name=None, fastpath=None): @@ -122,7 +124,7 @@ def ensure_int(value, field): @classmethod def from_range(cls, data, name=None, dtype=None, **kwargs): - """ create RangeIndex from a range (py3), or xrange (py2) object """ + """ Create RangeIndex from a range (py3), or xrange (py2) object. """ if not isinstance(data, range): raise TypeError( '{0}(...) must be called with object coercible to a ' @@ -158,6 +160,8 @@ def _simple_new(cls, start, stop=None, step=None, name=None, result._reset_identity() return result + # -------------------------------------------------------------------- + @staticmethod def _validate_dtype(dtype): """ require dtype to be None or int64 """ @@ -188,6 +192,9 @@ def __reduce__(self): d.update(dict(self._get_data_as_items())) return ibase._new_Index, (self.__class__, d), None + # -------------------------------------------------------------------- + # Rendering Methods + def _format_attrs(self): """ Return a list of tuples of the (attr, formatted_value) @@ -201,6 +208,8 @@ def _format_data(self, name=None): # we are formatting thru the attributes return None + # -------------------------------------------------------------------- + @cache_readonly def nbytes(self): """ diff --git a/pandas/core/indexes/timedeltas.py b/pandas/core/indexes/timedeltas.py index 8f50b40a207385..1c84e592d3a0dd 100644 --- a/pandas/core/indexes/timedeltas.py +++ b/pandas/core/indexes/timedeltas.py @@ -1,5 +1,6 @@ """ implement the TimedeltaIndex """ from datetime import datetime +import warnings import numpy as np @@ -16,13 +17,12 @@ from pandas.core.arrays import datetimelike as dtl from pandas.core.arrays.timedeltas import ( - TimedeltaArrayMixin as TimedeltaArray, _is_convertible_to_td, _to_m8, - sequence_to_td64ns) + TimedeltaArrayMixin as TimedeltaArray, _is_convertible_to_td, _to_m8) from pandas.core.base import _shared_docs import pandas.core.common as com from pandas.core.indexes.base import Index, _index_shared_docs from pandas.core.indexes.datetimelike import ( - DatetimeIndexOpsMixin, TimelikeOps, wrap_arithmetic_op, wrap_array_method, + DatetimeIndexOpsMixin, wrap_arithmetic_op, wrap_array_method, wrap_field_accessor) from pandas.core.indexes.numeric import Int64Index from pandas.core.ops import get_op_result_name @@ -31,8 +31,24 @@ from pandas.tseries.frequencies import to_offset +def _make_wrapped_arith_op(opname): + + meth = getattr(TimedeltaArray, opname) + + def method(self, other): + oth = other + if isinstance(other, Index): + oth = other._data + + result = meth(self, oth) + return wrap_arithmetic_op(self, other, result) + + method.__name__ = opname + return method + + class TimedeltaIndex(TimedeltaArray, DatetimeIndexOpsMixin, - TimelikeOps, Int64Index): + dtl.TimelikeOps, Int64Index): """ Immutable ndarray of timedelta64 data, represented internally as int64, and which can be boxed to timedelta objects @@ -127,14 +143,26 @@ def _join_i8_wrapper(joinf, **kwargs): _freq = None + # ------------------------------------------------------------------- + # Constructors + def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, - periods=None, closed=None, 
dtype=None, copy=False, - name=None, verify_integrity=True): + periods=None, closed=None, dtype=_TD_DTYPE, copy=False, + name=None, verify_integrity=None): - freq, freq_infer = dtl.maybe_infer_freq(freq) + if verify_integrity is not None: + warnings.warn("The 'verify_integrity' argument is deprecated, " + "will be removed in a future version.", + FutureWarning, stacklevel=2) + else: + verify_integrity = True if data is None: - # TODO: Remove this block and associated kwargs; GH#20535 + freq, freq_infer = dtl.maybe_infer_freq(freq) + warnings.warn("Creating a TimedeltaIndex by passing range " + "endpoints is deprecated. Use " + "`pandas.timedelta_range` instead.", + FutureWarning, stacklevel=2) result = cls._generate_range(start, end, periods, freq, closed=closed) result.name = name @@ -153,29 +181,10 @@ def __new__(cls, data=None, unit=None, freq=None, start=None, end=None, # - Cases checked above all return/raise before reaching here - # - data, inferred_freq = sequence_to_td64ns(data, copy=copy, unit=unit) - if inferred_freq is not None: - if freq is not None and freq != inferred_freq: - raise ValueError('Inferred frequency {inferred} from passed ' - 'values does not conform to passed frequency ' - '{passed}' - .format(inferred=inferred_freq, - passed=freq.freqstr)) - elif freq_infer: - freq = inferred_freq - freq_infer = False - verify_integrity = False - - subarr = cls._simple_new(data, name=name, freq=freq) - # check that we are matching freqs - if verify_integrity and len(subarr) > 0: - if freq is not None and not freq_infer: - cls._validate_frequency(subarr, freq) - - if freq_infer: - subarr.freq = to_offset(subarr.inferred_freq) - - return subarr + result = cls._from_sequence(data, freq=freq, unit=unit, + dtype=dtype, copy=copy) + result.name = name + return result @classmethod def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): @@ -193,10 +202,7 @@ def _simple_new(cls, values, name=None, freq=None, dtype=_TD_DTYPE): result._reset_identity() return result - @property - def _formatter_func(self): - from pandas.io.formats.format import _get_format_timedelta64 - return _get_format_timedelta64(self, box=True) + # ------------------------------------------------------------------- def __setstate__(self, state): """Necessary for making this object picklable""" @@ -214,9 +220,13 @@ def _maybe_update_attributes(self, attrs): attrs['freq'] = 'infer' return attrs - def _evaluate_with_timedelta_like(self, other, op): - result = TimedeltaArray._evaluate_with_timedelta_like(self, other, op) - return wrap_arithmetic_op(self, other, result) + # ------------------------------------------------------------------- + # Rendering Methods + + @property + def _formatter_func(self): + from pandas.io.formats.format import _get_format_timedelta64 + return _get_format_timedelta64(self, box=True) def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): from pandas.io.formats.format import Timedelta64Formatter @@ -227,13 +237,14 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): # ------------------------------------------------------------------- # Wrapping TimedeltaArray - __mul__ = Index.__mul__ - __rmul__ = Index.__rmul__ - __truediv__ = Index.__truediv__ - __floordiv__ = Index.__floordiv__ - __rfloordiv__ = Index.__rfloordiv__ - if compat.PY2: - __div__ = Index.__div__ + __mul__ = _make_wrapped_arith_op("__mul__") + __rmul__ = _make_wrapped_arith_op("__rmul__") + __floordiv__ = _make_wrapped_arith_op("__floordiv__") + __rfloordiv__ = 
_make_wrapped_arith_op("__rfloordiv__") + __mod__ = _make_wrapped_arith_op("__mod__") + __rmod__ = _make_wrapped_arith_op("__rmod__") + __divmod__ = _make_wrapped_arith_op("__divmod__") + __rdivmod__ = _make_wrapped_arith_op("__rdivmod__") days = wrap_field_accessor(TimedeltaArray.days) seconds = wrap_field_accessor(TimedeltaArray.seconds) @@ -242,6 +253,31 @@ def _format_native_types(self, na_rep=u'NaT', date_format=None, **kwargs): total_seconds = wrap_array_method(TimedeltaArray.total_seconds, True) + def __truediv__(self, other): + oth = other + if isinstance(other, Index): + # TimedeltaArray defers, so we need to unwrap + oth = other._values + result = TimedeltaArray.__truediv__(self, oth) + return wrap_arithmetic_op(self, other, result) + + def __rtruediv__(self, other): + oth = other + if isinstance(other, Index): + # TimedeltaArray defers, so we need to unwrap + oth = other._values + result = TimedeltaArray.__rtruediv__(self, oth) + return wrap_arithmetic_op(self, other, result) + + if compat.PY2: + __div__ = __truediv__ + __rdiv__ = __rtruediv__ + + # Compat for frequency inference, see GH#23789 + _is_monotonic_increasing = Index.is_monotonic_increasing + _is_monotonic_decreasing = Index.is_monotonic_decreasing + _is_unique = Index.is_unique + # ------------------------------------------------------------------- @Appender(_index_shared_docs['astype']) @@ -639,7 +675,7 @@ def delete(self, loc): TimedeltaIndex._add_comparison_ops() -TimedeltaIndex._add_numeric_methods() +TimedeltaIndex._add_numeric_methods_unary() TimedeltaIndex._add_logical_methods_disabled() TimedeltaIndex._add_datetimelike_methods() @@ -727,5 +763,8 @@ def timedelta_range(start=None, end=None, periods=None, freq=None, if freq is None and com._any_none(periods, start, end): freq = 'D' - return TimedeltaIndex(start=start, end=end, periods=periods, - freq=freq, name=name, closed=closed) + freq, freq_infer = dtl.maybe_infer_freq(freq) + result = TimedeltaIndex._generate_range(start, end, periods, freq, + closed=closed) + result.name = name + return result diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index 857bf18c5982be..1b67c20530eb0e 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -20,7 +20,7 @@ from pandas.core.dtypes.common import ( _NS_DTYPE, _TD_DTYPE, ensure_platform_int, is_bool_dtype, is_categorical, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, - is_datetimetz, is_dtype_equal, is_extension_array_dtype, is_extension_type, + is_dtype_equal, is_extension_array_dtype, is_extension_type, is_float_dtype, is_integer, is_integer_dtype, is_list_like, is_numeric_v_string_like, is_object_dtype, is_re, is_re_compilable, is_sparse, is_timedelta64_dtype, pandas_dtype) @@ -1458,11 +1458,6 @@ def quantile(self, qs, interpolation='linear', axis=0, axes=None): def _nanpercentile1D(values, mask, q, **kw): # mask is Union[ExtensionArray, ndarray] - # we convert to an ndarray for NumPy 1.9 compat, which didn't - # treat boolean-like arrays as boolean. This conversion would have - # been done inside ndarray.__getitem__ anyway, since values is - # an ndarray at this point. 
- mask = np.asarray(mask) values = values[~mask] if len(values) == 0: @@ -2300,10 +2295,7 @@ def convert(self, *args, **kwargs): 'convert_timedeltas'] fn_inputs += ['copy'] - fn_kwargs = {} - for key in fn_inputs: - if key in kwargs: - fn_kwargs[key] = kwargs[key] + fn_kwargs = {key: kwargs[key] for key in fn_inputs if key in kwargs} # operate column-by-column def f(m, v, i): @@ -2770,7 +2762,7 @@ def to_native_types(self, slicer=None, na_rep=None, date_format=None, def should_store(self, value): return (issubclass(value.dtype.type, np.datetime64) and - not is_datetimetz(value) and + not is_datetime64tz_dtype(value) and not is_extension_array_dtype(value)) def set(self, locs, values, check=False): @@ -2781,9 +2773,7 @@ def set(self, locs, values, check=False): ------- None """ - if values.dtype != _NS_DTYPE: - # Workaround for numpy 1.6 bug - values = conversion.ensure_datetime64ns(values) + values = conversion.ensure_datetime64ns(values, copy=False) self.values[locs] = values @@ -3024,9 +3014,9 @@ def get_block_type(values, dtype=None): elif issubclass(vtype, np.complexfloating): cls = ComplexBlock elif issubclass(vtype, np.datetime64): - assert not is_datetimetz(values) + assert not is_datetime64tz_dtype(values) cls = DatetimeBlock - elif is_datetimetz(values): + elif is_datetime64tz_dtype(values): cls = DatetimeTZBlock elif issubclass(vtype, np.integer): cls = IntBlock @@ -3047,7 +3037,7 @@ def make_block(values, placement, klass=None, ndim=None, dtype=None, dtype = dtype or values.dtype klass = get_block_type(values, dtype) - elif klass is DatetimeTZBlock and not is_datetimetz(values): + elif klass is DatetimeTZBlock and not is_datetime64tz_dtype(values): return klass(values, ndim=ndim, placement=placement, dtype=dtype) @@ -3102,7 +3092,7 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): # FIXME: optimization potential in case all mgrs contain slices and # combination of those slices is a slice, too. 
new_mgr_locs = np.concatenate([b.mgr_locs.as_array for b in blocks]) - new_values = _vstack([b.values for b in blocks], dtype) + new_values = np.vstack([b.values for b in blocks]) argsort = np.argsort(new_mgr_locs) new_values = new_values[argsort] @@ -3114,17 +3104,6 @@ def _merge_blocks(blocks, dtype=None, _can_consolidate=True): return blocks -def _vstack(to_stack, dtype): - - # work around NumPy 1.6 bug - if dtype == _NS_DTYPE or dtype == _TD_DTYPE: - new_values = np.vstack([x.view('i8') for x in to_stack]) - return new_values.view(dtype) - - else: - return np.vstack(to_stack) - - def _block2d_to_blocknd(values, placement, shape, labels, ref_items): """ pivot to the labels shape """ panel_shape = (len(placement),) + shape diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 2fb533478b2f3f..2441c64518d59b 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -10,8 +10,9 @@ from pandas.core.dtypes.cast import maybe_promote from pandas.core.dtypes.common import ( - _get_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetimetz, - is_float_dtype, is_numeric_dtype, is_sparse, is_timedelta64_dtype) + _get_dtype, is_categorical_dtype, is_datetime64_dtype, + is_datetime64tz_dtype, is_float_dtype, is_numeric_dtype, is_sparse, + is_timedelta64_dtype) import pandas.core.dtypes.concat as _concat from pandas.core.dtypes.missing import isna @@ -179,7 +180,7 @@ def get_reindexed_values(self, empty_dtype, upcasted_na): fill_value = None if (getattr(self.block, 'is_datetimetz', False) or - is_datetimetz(empty_dtype)): + is_datetime64tz_dtype(empty_dtype)): if self.block is None: array = empty_dtype.construct_array_type() missing_arr = array([fill_value], dtype=empty_dtype) @@ -293,7 +294,7 @@ def get_empty_dtype_and_na(join_units): if is_categorical_dtype(dtype): upcast_cls = 'category' - elif is_datetimetz(dtype): + elif is_datetime64tz_dtype(dtype): upcast_cls = 'datetimetz' elif issubclass(dtype.type, np.bool_): upcast_cls = 'bool' diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index c3762d98191533..5f9860ce98b111 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -248,9 +248,6 @@ def __getstate__(self): def __setstate__(self, state): def unpickle_block(values, mgr_locs): - # numpy < 1.7 pickle compat - if values.dtype == 'M8[us]': - values = values.astype('M8[ns]') return make_block(values, placement=mgr_locs) if (isinstance(state, tuple) and len(state) >= 4 and @@ -776,18 +773,6 @@ def _interleave(self): result = np.empty(self.shape, dtype=dtype) - if result.shape[0] == 0: - # Workaround for numpy 1.7 bug: - # - # >>> a = np.empty((0,10)) - # >>> a[slice(0,0)] - # array([], shape=(0, 10), dtype=float64) - # >>> a[[]] - # Traceback (most recent call last): - # File "", line 1, in - # IndexError: index 0 is out of bounds for axis 0 with size 0 - return result - itemmask = np.zeros(self.shape[0]) for blk in self.blocks: @@ -1170,8 +1155,7 @@ def insert(self, loc, item, value, allow_duplicates=False): blk.mgr_locs = new_mgr_locs if loc == self._blklocs.shape[0]: - # np.append is a lot faster (at least in numpy 1.7.1), let's use it - # if we can. + # np.append is a lot faster, let's use it if we can. 
self._blklocs = np.append(self._blklocs, 0) self._blknos = np.append(self._blknos, len(self.blocks)) else: @@ -1995,13 +1979,9 @@ def _transform_index(index, func, level=None): def _fast_count_smallints(arr): """Faster version of set(arr) for sequences of small numbers.""" - if len(arr) == 0: - # Handle empty arr case separately: numpy 1.6 chokes on that. - return np.empty((0, 2), dtype=arr.dtype) - else: - counts = np.bincount(arr.astype(np.int_)) - nz = counts.nonzero()[0] - return np.c_[nz, counts[nz]] + counts = np.bincount(arr.astype(np.int_)) + nz = counts.nonzero()[0] + return np.c_[nz, counts[nz]] def _preprocess_slice_or_indexer(slice_or_indexer, length, allow_fill): diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index 4369ac60a075e7..027f458614bd8b 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -13,8 +13,8 @@ from pandas.core.dtypes.common import ( _get_dtype, is_any_int_dtype, is_bool_dtype, is_complex, is_complex_dtype, is_datetime64_dtype, is_datetime_or_timedelta_dtype, is_float, - is_float_dtype, is_int_or_datetime_dtype, is_integer, is_integer_dtype, - is_numeric_dtype, is_object_dtype, is_scalar, is_timedelta64_dtype) + is_float_dtype, is_integer, is_integer_dtype, is_numeric_dtype, + is_object_dtype, is_scalar, is_timedelta64_dtype) from pandas.core.dtypes.missing import isna, na_value_for_dtype, notna import pandas.core.common as com @@ -254,7 +254,9 @@ def _isfinite(values): def _na_ok_dtype(dtype): - return not is_int_or_datetime_dtype(dtype) + # TODO: what about datetime64tz? PeriodDtype? + return not issubclass(dtype.type, + (np.integer, np.timedelta64, np.datetime64)) def _view_if_needed(values): diff --git a/pandas/core/ops.py b/pandas/core/ops.py index 2a21593fab8f54..6ea31422478f2c 100644 --- a/pandas/core/ops.py +++ b/pandas/core/ops.py @@ -235,7 +235,7 @@ def _gen_eval_kwargs(name): {} >>> _gen_eval_kwargs("rtruediv") - {"reversed": True, "truediv": True} + {'reversed': True, 'truediv': True} """ kwargs = {} @@ -384,124 +384,21 @@ def _get_op_name(op, special): # ----------------------------------------------------------------------------- # Docstring Generation and Templates -_add_example_FRAME = """ ->>> a = pd.DataFrame([1, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], -... columns=['one']) ->>> a - one -a 1.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], -... two=[np.nan, 2, np.nan, 2]), -... index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 NaN -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.add(b, fill_value=0) - one two -a 2.0 NaN -b 1.0 2.0 -c 1.0 NaN -d 1.0 NaN -e NaN 2.0 -""" - -_sub_example_FRAME = """ ->>> a = pd.DataFrame([2, 1, 1, np.nan], index=['a', 'b', 'c', 'd'], -... columns=['one']) ->>> a - one -a 2.0 -b 1.0 -c 1.0 -d NaN ->>> b = pd.DataFrame(dict(one=[1, np.nan, 1, np.nan], -... two=[3, 2, np.nan, 2]), -... index=['a', 'b', 'd', 'e']) ->>> b - one two -a 1.0 3.0 -b NaN 2.0 -d 1.0 NaN -e NaN 2.0 ->>> a.sub(b, fill_value=0) - one two -a 1.0 -3.0 -b 1.0 -2.0 -c 1.0 NaN -d -1.0 NaN -e NaN -2.0 -""" - -_mod_example_FRAME = """ -**Using a scalar argument** - ->>> df = pd.DataFrame([2, 4, np.nan, 6.2], index=["a", "b", "c", "d"], -... columns=['one']) ->>> df - one -a 2.0 -b 4.0 -c NaN -d 6.2 ->>> df.mod(3, fill_value=-1) - one -a 2.0 -b 1.0 -c 2.0 -d 0.2 - -**Using a DataFrame argument** - ->>> df = pd.DataFrame(dict(one=[np.nan, 2, 3, 14], two=[np.nan, 1, 1, 3]), -... 
index=['a', 'b', 'c', 'd']) ->>> df - one two -a NaN NaN -b 2.0 1.0 -c 3.0 1.0 -d 14.0 3.0 ->>> other = pd.DataFrame(dict(one=[np.nan, np.nan, 6, np.nan], -... three=[np.nan, 10, np.nan, -7]), -... index=['a', 'b', 'd', 'e']) ->>> other - one three -a NaN NaN -b NaN 10.0 -d 6.0 NaN -e NaN -7.0 ->>> df.mod(other, fill_value=3) - one three two -a NaN NaN NaN -b 2.0 3.0 1.0 -c 0.0 NaN 1.0 -d 2.0 NaN 0.0 -e NaN -4.0 NaN -""" - _op_descriptions = { # Arithmetic Operators 'add': {'op': '+', 'desc': 'Addition', - 'reverse': 'radd', - 'df_examples': _add_example_FRAME}, + 'reverse': 'radd'}, 'sub': {'op': '-', 'desc': 'Subtraction', - 'reverse': 'rsub', - 'df_examples': _sub_example_FRAME}, + 'reverse': 'rsub'}, 'mul': {'op': '*', 'desc': 'Multiplication', 'reverse': 'rmul', 'df_examples': None}, 'mod': {'op': '%', 'desc': 'Modulo', - 'reverse': 'rmod', - 'df_examples': _mod_example_FRAME}, + 'reverse': 'rmod'}, 'pow': {'op': '**', 'desc': 'Exponential power', 'reverse': 'rpow', @@ -522,28 +419,23 @@ def _get_op_name(op, special): # Comparison Operators 'eq': {'op': '==', 'desc': 'Equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'ne': {'op': '!=', 'desc': 'Not equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'lt': {'op': '<', 'desc': 'Less than', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'le': {'op': '<=', 'desc': 'Less than or equal to', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'gt': {'op': '>', 'desc': 'Greater than', - 'reverse': None, - 'df_examples': None}, + 'reverse': None}, 'ge': {'op': '>=', 'desc': 'Greater than or equal to', - 'reverse': None, - 'df_examples': None}} + 'reverse': None} +} _op_names = list(_op_descriptions.keys()) for key in _op_names: @@ -635,38 +527,295 @@ def _get_op_name(op, special): _flex_doc_FRAME = """ {desc} of dataframe and other, element-wise (binary operator `{op_name}`). -Equivalent to ``{equiv}``, but with support to substitute a fill_value for -missing data in one of the inputs. +Equivalent to ``{equiv}``, but with support to substitute a fill_value +for missing data in one of the inputs. With reverse version, `{reverse}`. + +Among flexible wrappers (`add`, `sub`, `mul`, `div`, `mod`, `pow`) to +arithmetic operators: `+`, `-`, `*`, `/`, `//`, `%`, `**. Parameters ---------- -other : Series, DataFrame, or constant -axis : {{0, 1, 'index', 'columns'}} - For Series input, axis to match Series index on -level : int or name +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}} + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). For Series input, axis to match Series index on. +level : int or label Broadcast across a level, matching Index values on the - passed MultiIndex level -fill_value : None or float value, default None + passed MultiIndex level. +fill_value : float or None, default None Fill existing missing (NaN) values, and any new element needed for successful DataFrame alignment, with this value before computation. If data in both corresponding DataFrame locations is missing - the result will be missing + the result will be missing. Notes ----- -Mismatched indices will be unioned together +Mismatched indices will be unioned together. Returns ------- -result : DataFrame +DataFrame + Result of the arithmetic operation. + +See Also +-------- +DataFrame.add : Add DataFrames. +DataFrame.sub : Subtract DataFrames. 
+DataFrame.mul : Multiply DataFrames. +DataFrame.div : Divide DataFrames (float division). +DataFrame.truediv : Divide DataFrames (float division). +DataFrame.floordiv : Divide DataFrames (integer division). +DataFrame.mod : Calculate modulo (remainder after division). +DataFrame.pow : Calculate exponential power. Examples -------- -{df_examples} +>>> df = pd.DataFrame({{'angles': [0, 3, 4], +... 'degrees': [360, 180, 360]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> df + angles degrees +circle 0 360 +triangle 3 180 +rectangle 4 360 + +Add a scalar with operator version which return the same +results. + +>>> df + 1 + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +>>> df.add(1) + angles degrees +circle 1 361 +triangle 4 181 +rectangle 5 361 + +Divide by constant with reverse version. + +>>> df.div(10) + angles degrees +circle 0.0 36.0 +triangle 0.3 18.0 +rectangle 0.4 36.0 + +>>> df.rdiv(10) + angles degrees +circle inf 0.027778 +triangle 3.333333 0.055556 +rectangle 2.500000 0.027778 + +Subtract a list and Series by axis with operator version. + +>>> df - [1, 2] + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub([1, 2], axis='columns') + angles degrees +circle -1 358 +triangle 2 178 +rectangle 3 358 + +>>> df.sub(pd.Series([1, 1, 1], index=['circle', 'triangle', 'rectangle']), +... axis='index') + angles degrees +circle -1 359 +triangle 2 179 +rectangle 3 359 + +Multiply a DataFrame of different shape with operator version. + +>>> other = pd.DataFrame({{'angles': [0, 3, 4]}}, +... index=['circle', 'triangle', 'rectangle']) +>>> other + angles +circle 0 +triangle 3 +rectangle 4 + +>>> df * other + angles degrees +circle 0 NaN +triangle 9 NaN +rectangle 16 NaN + +>>> df.mul(other, fill_value=0) + angles degrees +circle 0 0.0 +triangle 9 0.0 +rectangle 16 0.0 + +Divide by a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'angles': [0, 3, 4, 4, 5, 6], +... 'degrees': [360, 180, 360, 360, 540, 720]}}, +... index=[['A', 'A', 'A', 'B', 'B', 'B'], +... ['circle', 'triangle', 'rectangle', +... 'square', 'pentagon', 'hexagon']]) +>>> df_multindex + angles degrees +A circle 0 360 + triangle 3 180 + rectangle 4 360 +B square 4 360 + pentagon 5 540 + hexagon 6 720 + +>>> df.div(df_multindex, level=1, fill_value=0) + angles degrees +A circle NaN 1.0 + triangle 1.0 1.0 + rectangle 1.0 1.0 +B square 0.0 0.0 + pentagon 0.0 0.0 + hexagon 0.0 0.0 +""" + +_flex_comp_doc_FRAME = """ +{desc} of dataframe and other, element-wise (binary operator `{op_name}`). + +Among flexible wrappers (`eq`, `ne`, `le`, `lt`, `ge`, `gt`) to comparison +operators. + +Equivalent to `==`, `=!`, `<=`, `<`, `>=`, `>` with support to choose axis +(rows or columns) and level for comparison. + +Parameters +---------- +other : scalar, sequence, Series, or DataFrame + Any single or multiple element data structure, or list-like object. +axis : {{0 or 'index', 1 or 'columns'}}, default 'columns' + Whether to compare by the index (0 or 'index') or columns + (1 or 'columns'). +level : int or label + Broadcast across a level, matching Index values on the passed + MultiIndex level. + +Returns +------- +DataFrame of bool + Result of the comparison. See Also -------- -DataFrame.{reverse} +DataFrame.eq : Compare DataFrames for equality elementwise. +DataFrame.ne : Compare DataFrames for inequality elementwise. +DataFrame.le : Compare DataFrames for less than inequality + or equality elementwise. +DataFrame.lt : Compare DataFrames for strictly less than + inequality elementwise. 
+DataFrame.ge : Compare DataFrames for greater than inequality + or equality elementwise. +DataFrame.gt : Compare DataFrames for strictly greater than + inequality elementwise. + +Notes +-------- +Mismatched indices will be unioned together. +`NaN` values are considered different (i.e. `NaN` != `NaN`). + +Examples +-------- +>>> df = pd.DataFrame({{'cost': [250, 150, 100], +... 'revenue': [100, 250, 300]}}, +... index=['A', 'B', 'C']) +>>> df + cost revenue +A 250 100 +B 150 250 +C 100 300 + +Compare to a scalar and operator version which return the same +results. + +>>> df == 100 + cost revenue +A False True +B False False +C True False + +>>> df.eq(100) + cost revenue +A False True +B False False +C True False + +Compare to a list and Series by axis and operator version. As shown, +for list axis is by default 'index', but for Series axis is by +default 'columns'. + +>>> df != [100, 250, 300] + cost revenue +A True False +B True False +C True False + +>>> df.ne([100, 250, 300], axis='index') + cost revenue +A True False +B True False +C True False + +>>> df != pd.Series([100, 250, 300]) + cost revenue 0 1 2 +A True True True True True +B True True True True True +C True True True True True + +>>> df.ne(pd.Series([100, 250, 300]), axis='columns') + cost revenue 0 1 2 +A True True True True True +B True True True True True +C True True True True True + +Compare to a DataFrame of different shape. + +>>> other = pd.DataFrame({{'revenue': [300, 250, 100, 150]}}, +... index=['A', 'B', 'C', 'D']) +>>> other + revenue +A 300 +B 250 +C 100 +D 150 + +>>> df.gt(other) + cost revenue +A False False +B False False +C False True +D False False + +Compare to a MultiIndex by level. + +>>> df_multindex = pd.DataFrame({{'cost': [250, 150, 100, 150, 300, 220], +... 'revenue': [100, 250, 300, 200, 175, 225]}}, +... index=[['Q1', 'Q1', 'Q1', 'Q2', 'Q2', 'Q2'], +... 
['A', 'B', 'C', 'A', 'B' ,'C']]) +>>> df_multindex + cost revenue +Q1 A 250 100 + B 150 250 + C 100 300 +Q2 A 150 200 + B 300 175 + C 220 225 + +>>> df.le(df_multindex, level=1) + cost revenue +Q1 A True True + B True True + C True True +Q2 A False True + B True False + C True False """ _flex_doc_PANEL = """ @@ -734,8 +883,7 @@ def _make_flex_doc(op_name, typ): elif typ == 'dataframe': base_doc = _flex_doc_FRAME doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, - equiv=equiv, reverse=op_desc['reverse'], - df_examples=op_desc['df_examples']) + equiv=equiv, reverse=op_desc['reverse']) elif typ == 'panel': base_doc = _flex_doc_PANEL doc = base_doc.format(desc=op_desc['desc'], op_name=op_name, @@ -1404,8 +1552,7 @@ def wrapper(left, right): elif is_timedelta64_dtype(left): result = dispatch_to_index_op(op, left, right, pd.TimedeltaIndex) return construct_result(left, result, - index=left.index, name=res_name, - dtype=result.dtype) + index=left.index, name=res_name) elif is_timedelta64_dtype(right): # We should only get here with non-scalar or timedelta64('NaT') @@ -1894,8 +2041,10 @@ def na_op(x, y): result = mask_cmp_op(x, y, op, (np.ndarray, ABCSeries)) return result - @Appender('Wrapper for flexible comparison methods {name}' - .format(name=op_name)) + doc = _flex_comp_doc_FRAME.format(op_name=op_name, + desc=_op_descriptions[op_name]['desc']) + + @Appender(doc) def f(self, other, axis=default_axis, level=None): other = _align_method_FRAME(self, other, axis) diff --git a/pandas/core/panel.py b/pandas/core/panel.py index 90016f599addc5..bfa00d13524014 100644 --- a/pandas/core/panel.py +++ b/pandas/core/panel.py @@ -48,7 +48,7 @@ def _ensure_like_indices(time, panels): """ - Makes sure that time and panels are conformable + Makes sure that time and panels are conformable. """ n_time = len(time) n_panel = len(panels) @@ -63,7 +63,7 @@ def _ensure_like_indices(time, panels): def panel_index(time, panels, names=None): """ - Returns a multi-index suitable for a panel-like DataFrame + Returns a multi-index suitable for a panel-like DataFrame. Parameters ---------- @@ -125,10 +125,10 @@ class Panel(NDFrame): axis=1 minor_axis : Index or array-like axis=2 - dtype : dtype, default None - Data type to force, otherwise infer copy : boolean, default False Copy data from inputs. Only affects DataFrame / 2d ndarray input + dtype : dtype, default None + Data type to force, otherwise infer """ @property @@ -157,7 +157,7 @@ def __init__(self, data=None, items=None, major_axis=None, minor_axis=None, def _init_data(self, data, copy, dtype, **kwargs): """ Generate ND initialization; axes are passed - as required objects to __init__ + as required objects to __init__. """ if data is None: data = {} @@ -242,7 +242,7 @@ def _init_arrays(self, arrays, arr_names, axes): @classmethod def from_dict(cls, data, intersect=False, orient='items', dtype=None): """ - Construct Panel from dict of DataFrame objects + Construct Panel from dict of DataFrame objects. 
Parameters ---------- @@ -336,9 +336,8 @@ def _compare_constructor(self, other, func): raise Exception('Can only compare identically-labeled ' 'same type objects') - new_data = {} - for col in self._info_axis: - new_data[col] = func(self[col], other[col]) + new_data = {col: func(self[col], other[col]) + for col in self._info_axis} d = self._construct_axes_dict(copy=False) return self._constructor(data=new_data, **d) @@ -348,7 +347,7 @@ def _compare_constructor(self, other, func): def __unicode__(self): """ - Return a string representation for a particular Panel + Return a string representation for a particular Panel. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -377,7 +376,7 @@ def _get_plane_axes_index(self, axis): """ Get my plane axes indexes: these are already (as compared with higher level planes), - as we are returning a DataFrame axes indexes + as we are returning a DataFrame axes indexes. """ axis_name = self._get_axis_name(axis) @@ -397,7 +396,7 @@ def _get_plane_axes(self, axis): """ Get my plane axes indexes: these are already (as compared with higher level planes), - as we are returning a DataFrame axes + as we are returning a DataFrame axes. """ return [self._get_axis(axi) for axi in self._get_plane_axes_index(axis)] @@ -409,14 +408,14 @@ def to_sparse(self, *args, **kwargs): NOT IMPLEMENTED: do not call this method, as sparsifying is not supported for Panel objects and will raise an error. - Convert to SparsePanel + Convert to SparsePanel. """ raise NotImplementedError("sparsifying is not supported " "for Panel objects") def to_excel(self, path, na_rep='', engine=None, **kwargs): """ - Write each DataFrame in Panel to a separate excel sheet + Write each DataFrame in Panel to a separate excel sheet. Parameters ---------- @@ -473,7 +472,8 @@ def as_matrix(self): # Getting and setting elements def get_value(self, *args, **kwargs): - """Quickly retrieve single value at (item, major, minor) location + """ + Quickly retrieve single value at (item, major, minor) location. .. deprecated:: 0.21.0 @@ -520,7 +520,8 @@ def _get_value(self, *args, **kwargs): _get_value.__doc__ = get_value.__doc__ def set_value(self, *args, **kwargs): - """Quickly set single value at (item, major, minor) location + """ + Quickly set single value at (item, major, minor) location. .. deprecated:: 0.21.0 @@ -619,7 +620,9 @@ def __setitem__(self, key, value): NDFrame._set_item(self, key, mat) def _unpickle_panel_compat(self, state): # pragma: no cover - "Unpickle the panel" + """ + Unpickle the panel. + """ from pandas.io.pickle import _unpickle_array _unpickle = _unpickle_array @@ -687,7 +690,9 @@ def round(self, decimals=0, *args, **kwargs): raise TypeError("decimals must be an integer") def _needs_reindex_multi(self, axes, method, level): - """ don't allow a multi reindex on Panel or above ndim """ + """ + Don't allow a multi reindex on Panel or above ndim. + """ return False def align(self, other, **kwargs): @@ -695,7 +700,7 @@ def align(self, other, **kwargs): def dropna(self, axis=0, how='any', inplace=False): """ - Drop 2D from panel, holding passed axis constant + Drop 2D from panel, holding passed axis constant. Parameters ---------- @@ -787,7 +792,7 @@ def _combine_panel(self, other, func): def major_xs(self, key): """ - Return slice of panel along major axis + Return slice of panel along major axis. 
Parameters ---------- @@ -811,7 +816,7 @@ def major_xs(self, key): def minor_xs(self, key): """ - Return slice of panel along minor axis + Return slice of panel along minor axis. Parameters ---------- @@ -835,7 +840,7 @@ def minor_xs(self, key): def xs(self, key, axis=1): """ - Return slice of panel along selected axis + Return slice of panel along selected axis. Parameters ---------- @@ -871,6 +876,8 @@ def xs(self, key, axis=1): def _ixs(self, i, axis=0): """ + Parameters + ---------- i : int, slice, or sequence of integers axis : int """ @@ -898,7 +905,7 @@ def _ixs(self, i, axis=0): def groupby(self, function, axis='major'): """ - Group data on given axis, returning GroupBy object + Group data on given axis, returning GroupBy object. Parameters ---------- @@ -941,9 +948,8 @@ def to_frame(self, filter_observations=True): # size = N * K selector = slice(None, None) - data = {} - for item in self.items: - data[item] = self[item].values.ravel()[selector] + data = {item: self[item].values.ravel()[selector] + for item in self.items} def construct_multi_parts(idx, n_repeat, n_shuffle=1): # Replicates and shuffles MultiIndex, returns individual attributes @@ -993,7 +999,7 @@ def construct_index_parts(idx, major=True): def apply(self, func, axis='major', **kwargs): """ - Applies function along axis (or axes) of the Panel + Applies function along axis (or axes) of the Panel. Parameters ---------- @@ -1115,8 +1121,9 @@ def _apply_1d(self, func, axis): return self._construct_return_type(results, planes) def _apply_2d(self, func, axis): - """ handle 2-d slices, equiv to iterating over the other axis """ - + """ + Handle 2-d slices, equiv to iterating over the other axis. + """ ndim = self.ndim axis = [self._get_axis_number(a) for a in axis] @@ -1172,7 +1179,9 @@ def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, return self._construct_return_type(result, axes) def _construct_return_type(self, result, axes=None): - """ return the type for the ndim of the result """ + """ + Return the type for the ndim of the result. + """ ndim = getattr(result, 'ndim', None) # need to assume they are the same @@ -1308,6 +1317,7 @@ def count(self, axis='major'): def shift(self, periods=1, freq=None, axis='major'): """ Shift index by desired number of periods with an optional time freq. + The shifted data will not include the dropped periods and the shifted axis will be smaller than the original. This is different from the behavior of DataFrame.shift() @@ -1333,7 +1343,7 @@ def tshift(self, periods=1, freq=None, axis='major'): def join(self, other, how='left', lsuffix='', rsuffix=''): """ - Join items with other Panel either on major and minor axes column + Join items with other Panel either on major and minor axes column. Parameters ---------- @@ -1440,13 +1450,17 @@ def _get_join_index(self, other, how): # miscellaneous data creation @staticmethod def _extract_axes(self, data, axes, **kwargs): - """ return a list of the axis indices """ + """ + Return a list of the axis indices. + """ return [self._extract_axis(self, data, axis=i, **kwargs) for i, a in enumerate(axes)] @staticmethod def _extract_axes_for_slice(self, axes): - """ return the slice dictionary for these axes """ + """ + Return the slice dictionary for these axes. 
+ """ return {self._AXIS_SLICEMAP[i]: a for i, a in zip(self._AXIS_ORDERS[self._AXIS_LEN - len(axes):], axes)} diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 125b441e5558a5..f2cf17f8f060dc 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -26,7 +26,7 @@ from pandas.core.groupby.ops import BinGrouper from pandas.core.indexes.datetimes import DatetimeIndex, date_range from pandas.core.indexes.period import PeriodIndex -from pandas.core.indexes.timedeltas import TimedeltaIndex +from pandas.core.indexes.timedeltas import TimedeltaIndex, timedelta_range from pandas.tseries.frequencies import is_subperiod, is_superperiod, to_offset from pandas.tseries.offsets import ( @@ -81,7 +81,9 @@ def __init__(self, obj, groupby=None, axis=0, kind=None, **kwargs): self.groupby._set_grouper(self._convert_obj(obj), sort=True) def __unicode__(self): - """ provide a nice str repr of our rolling object """ + """ + Provide a nice str repr of our rolling object. + """ attrs = ["{k}={v}".format(k=k, v=getattr(self.groupby, k)) for k in self._attributes if getattr(self.groupby, k, None) is not None] @@ -100,7 +102,7 @@ def __getattr__(self, attr): def __iter__(self): """ - Resampler iterator + Resampler iterator. Returns ------- @@ -124,14 +126,18 @@ def ax(self): @property def _typ(self): - """ masquerade for compat as a Series or a DataFrame """ + """ + Masquerade for compat as a Series or a DataFrame. + """ if isinstance(self._selected_obj, pd.Series): return 'series' return 'dataframe' @property def _from_selection(self): - """ is the resampling from a DataFrame column or MultiIndex level """ + """ + Is the resampling from a DataFrame column or MultiIndex level. + """ # upsampling and PeriodIndex resampling do not work # with selection, this state used to catch and raise an error return (self.groupby is not None and @@ -140,7 +146,7 @@ def _from_selection(self): def _convert_obj(self, obj): """ - provide any conversions for the object in order to correctly handle + Provide any conversions for the object in order to correctly handle. Parameters ---------- @@ -158,17 +164,17 @@ def _get_binner_for_time(self): def _set_binner(self): """ - setup our binners - cache these as we are an immutable object - """ + Setup our binners. + Cache these as we are an immutable object + """ if self.binner is None: self.binner, self.grouper = self._get_binner() def _get_binner(self): """ - create the BinGrouper, assume that self.set_grouper(obj) - has already been called + Create the BinGrouper, assume that self.set_grouper(obj) + has already been called. """ binner, bins, binlabels = self._get_binner_for_time() @@ -176,28 +182,31 @@ def _get_binner(self): return binner, bin_grouper def _assure_grouper(self): - """ make sure that we are creating our binner & grouper """ + """ + Make sure that we are creating our binner & grouper. + """ self._set_binner() @Substitution(klass='Resampler', versionadded='.. versionadded:: 0.23.0', examples=""" ->>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, -... index=pd.date_range('2012-08-02', periods=4)) ->>> df - A -2012-08-02 1 -2012-08-03 2 -2012-08-04 3 -2012-08-05 4 - -To get the difference between each 2-day period's maximum and minimum value in -one pass, you can do - ->>> df.resample('2D').pipe(lambda x: x.max() - x.min()) - A -2012-08-02 1 -2012-08-04 1""") + >>> df = pd.DataFrame({'A': [1, 2, 3, 4]}, + ... 
index=pd.date_range('2012-08-02', periods=4)) + >>> df + A + 2012-08-02 1 + 2012-08-03 2 + 2012-08-04 3 + 2012-08-05 4 + + To get the difference between each 2-day period's maximum and minimum + value in one pass, you can do + + >>> df.resample('2D').pipe(lambda x: x.max() - x.min()) + A + 2012-08-02 1 + 2012-08-04 1 + """) @Appender(_pipe_template) def pipe(self, func, *args, **kwargs): return super(Resampler, self).pipe(func, *args, **kwargs) @@ -270,7 +279,7 @@ def aggregate(self, func, *args, **kwargs): def transform(self, arg, *args, **kwargs): """ Call function producing a like-indexed Series on each group and return - a Series with the transformed values + a Series with the transformed values. Parameters ---------- @@ -296,8 +305,7 @@ def _upsample(self, f, limit=None, fill_value=None): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -320,7 +328,9 @@ def _gotitem(self, key, ndim, subset=None): return grouped def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): - """ re-evaluate the obj with a groupby aggregation """ + """ + Re-evaluate the obj with a groupby aggregation. + """ if grouper is None: self._set_binner() @@ -352,7 +362,7 @@ def _groupby_and_aggregate(self, how, grouper=None, *args, **kwargs): def _apply_loffset(self, result): """ - if loffset is set, offset the result index + If loffset is set, offset the result index. This is NOT an idempotent routine, it will be applied exactly once to the result. @@ -377,11 +387,15 @@ def _apply_loffset(self, result): return result def _get_resampler_for_grouping(self, groupby, **kwargs): - """ return the correct class for resampling with groupby """ + """ + Return the correct class for resampling with groupby. + """ return self._resampler_for_grouping(self, groupby=groupby, **kwargs) def _wrap_result(self, result): - """ potentially wrap any results """ + """ + Potentially wrap any results. + """ if isinstance(result, ABCSeries) and self._selection is not None: result.name = self._selection @@ -394,7 +408,7 @@ def _wrap_result(self, result): def pad(self, limit=None): """ - Forward fill the values + Forward fill the values. Parameters ---------- @@ -757,8 +771,7 @@ def interpolate(self, method='linear', axis=0, limit=None, inplace=False, def asfreq(self, fill_value=None): """ - return the values at the new freq, - essentially a reindex + Return the values at the new freq, essentially a reindex. Parameters ---------- @@ -777,7 +790,7 @@ def asfreq(self, fill_value=None): def std(self, ddof=1, *args, **kwargs): """ - Compute standard deviation of groups, excluding missing values + Compute standard deviation of groups, excluding missing values. Parameters ---------- @@ -789,12 +802,12 @@ def std(self, ddof=1, *args, **kwargs): def var(self, ddof=1, *args, **kwargs): """ - Compute variance of groups, excluding missing values + Compute variance of groups, excluding missing values. Parameters ---------- ddof : integer, default 1 - degrees of freedom + degrees of freedom """ nv.validate_resampler_func('var', args, kwargs) return self._downsample('var', ddof=ddof) @@ -863,8 +876,10 @@ def f(self, _method=method): def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): - """ potentially we might have a deprecation warning, show it - but call the appropriate methods anyhow """ + """ + Potentially we might have a deprecation warning, show it + but call the appropriate methods anyhow. 
+ """ if how is not None: @@ -909,8 +924,9 @@ def _maybe_process_deprecations(r, how=None, fill_method=None, limit=None): class _GroupByMixin(GroupByMixin): - """ provide the groupby facilities """ - + """ + Provide the groupby facilities. + """ def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) @@ -931,8 +947,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, f, grouper=None, *args, **kwargs): """ - dispatch to _upsample; we are stripping all of the _upsample kwargs and - performing the original function call on the grouped object + Dispatch to _upsample; we are stripping all of the _upsample kwargs and + performing the original function call on the grouped object. """ def func(x): @@ -966,7 +982,7 @@ def _get_binner_for_time(self): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. Parameters ---------- @@ -1003,6 +1019,7 @@ def _downsample(self, how, **kwargs): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index should not be outside specified range """ if self.closed == 'right': @@ -1013,6 +1030,8 @@ def _adjust_binner_for_upsample(self, binner): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill', 'asfreq'} method for upsampling limit : int, default None @@ -1065,7 +1084,6 @@ class DatetimeIndexResamplerGroupby(_GroupByMixin, DatetimeIndexResampler): Provides a resample of a groupby implementation .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1106,7 +1124,7 @@ def _convert_obj(self, obj): def _downsample(self, how, **kwargs): """ - Downsample the cython defined function + Downsample the cython defined function. Parameters ---------- @@ -1143,6 +1161,8 @@ def _downsample(self, how, **kwargs): def _upsample(self, method, limit=None, fill_value=None): """ + Parameters + ---------- method : string {'backfill', 'bfill', 'pad', 'ffill'} method for upsampling limit : int, default None @@ -1177,10 +1197,9 @@ def _upsample(self, method, limit=None, fill_value=None): class PeriodIndexResamplerGroupby(_GroupByMixin, PeriodIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1199,6 +1218,7 @@ def _get_binner_for_time(self): def _adjust_binner_for_upsample(self, binner): """ Adjust our binner when upsampling. + The range of a new index is allowed to be greater than original range so we don't need to change the length of a binner, GH 13022 """ @@ -1207,10 +1227,9 @@ def _adjust_binner_for_upsample(self, binner): class TimedeltaIndexResamplerGroupby(_GroupByMixin, TimedeltaIndexResampler): """ - Provides a resample of a groupby implementation + Provides a resample of a groupby implementation. .. versionadded:: 0.18.1 - """ @property def _constructor(self): @@ -1218,7 +1237,9 @@ def _constructor(self): def resample(obj, kind=None, **kwds): - """ create a TimeGrouper and return our resampler """ + """ + Create a TimeGrouper and return our resampler. 
+ """ tg = TimeGrouper(**kwds) return tg._get_resampler(obj, kind=kind) @@ -1228,7 +1249,9 @@ def resample(obj, kind=None, **kwds): def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, limit=None, kind=None, **kwargs): - """ return our appropriate resampler when grouping as well """ + """ + Return our appropriate resampler when grouping as well. + """ # .resample uses 'on' similar to how .groupby uses 'key' kwargs['key'] = kwargs.pop('on', None) @@ -1244,7 +1267,7 @@ def get_resampler_for_grouping(groupby, rule, how=None, fill_method=None, class TimeGrouper(Grouper): """ - Custom groupby class for time-interval grouping + Custom groupby class for time-interval grouping. Parameters ---------- @@ -1311,7 +1334,7 @@ def __init__(self, freq='Min', closed=None, label=None, how='mean', def _get_resampler(self, obj, kind=None): """ - return my resampler or raise if we have an invalid axis + Return my resampler or raise if we have an invalid axis. Parameters ---------- @@ -1375,11 +1398,11 @@ def _get_time_bins(self, ax): # because replace() will swallow the nanosecond part # thus last bin maybe slightly before the end if the end contains # nanosecond part and lead to `Values falls after last bin` error - binner = labels = DatetimeIndex(freq=self.freq, - start=first, - end=last, - tz=tz, - name=ax.name) + binner = labels = date_range(freq=self.freq, + start=first, + end=last, + tz=tz, + name=ax.name) # GH 15549 # In edge case of tz-aware resapmling binner last index can be @@ -1461,10 +1484,10 @@ def _get_time_delta_bins(self, ax): return binner, [], labels start, end = ax.min(), ax.max() - labels = binner = TimedeltaIndex(start=start, - end=end, - freq=self.freq, - name=ax.name) + labels = binner = timedelta_range(start=start, + end=end, + freq=self.freq, + name=ax.name) end_stamps = labels + self.freq bins = ax.searchsorted(end_stamps, side='left') @@ -1643,7 +1666,7 @@ def _adjust_dates_anchored(first, last, offset, closed='right', base=0): def asfreq(obj, freq, method=None, how=None, normalize=False, fill_value=None): """ - Utility frequency conversion method for Series/DataFrame + Utility frequency conversion method for Series/DataFrame. 
""" if isinstance(obj.index, PeriodIndex): if method is not None: diff --git a/pandas/core/reshape/melt.py b/pandas/core/reshape/melt.py index 2dd6dc71b9d98b..aafc0de64ee12c 100644 --- a/pandas/core/reshape/melt.py +++ b/pandas/core/reshape/melt.py @@ -448,9 +448,8 @@ def melt_stub(df, stub, i, j, value_vars, sep): value_vars_flattened = [e for sublist in value_vars for e in sublist] id_vars = list(set(df.columns.tolist()).difference(value_vars_flattened)) - melted = [] - for s, v in zip(stubnames, value_vars): - melted.append(melt_stub(df, s, i, j, v, sep)) + melted = [melt_stub(df, s, i, j, v, sep) + for s, v in zip(stubnames, value_vars)] melted = melted[0].join(melted[1:], how='outer') if len(i) == 1: diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index b442e46852800e..b078ff32f69447 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -18,8 +18,8 @@ ensure_float64, ensure_int64, ensure_object, is_array_like, is_bool, is_bool_dtype, is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_datetimelike, is_dtype_equal, is_float_dtype, - is_int64_dtype, is_int_or_datetime_dtype, is_integer, is_integer_dtype, - is_list_like, is_number, is_numeric_dtype, needs_i8_conversion) + is_int64_dtype, is_integer, is_integer_dtype, is_list_like, is_number, + is_numeric_dtype, needs_i8_conversion) from pandas.core.dtypes.missing import isnull, na_value_for_dtype from pandas import Categorical, DataFrame, Index, MultiIndex, Series, Timedelta @@ -1605,7 +1605,15 @@ def _factorize_keys(lk, rk, sort=True): lk = ensure_int64(lk.codes) rk = ensure_int64(rk) - elif is_int_or_datetime_dtype(lk) and is_int_or_datetime_dtype(rk): + elif is_integer_dtype(lk) and is_integer_dtype(rk): + # GH#23917 TODO: needs tests for case where lk is integer-dtype + # and rk is datetime-dtype + klass = libhashtable.Int64Factorizer + lk = ensure_int64(com.values_from_object(lk)) + rk = ensure_int64(com.values_from_object(rk)) + elif (issubclass(lk.dtype.type, (np.timedelta64, np.datetime64)) and + issubclass(rk.dtype.type, (np.timedelta64, np.datetime64))): + # GH#23917 TODO: Needs tests for non-matching dtypes klass = libhashtable.Int64Factorizer lk = ensure_int64(com.values_from_object(lk)) rk = ensure_int64(com.values_from_object(rk)) diff --git a/pandas/core/reshape/tile.py b/pandas/core/reshape/tile.py index 4a863372eea137..5d5f6cf8102be7 100644 --- a/pandas/core/reshape/tile.py +++ b/pandas/core/reshape/tile.py @@ -43,7 +43,8 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, and maximum values of `x`. * sequence of scalars : Defines the bin edges allowing for non-uniform width. No extension of the range of `x` is done. - * IntervalIndex : Defines the exact bins to be used. + * IntervalIndex : Defines the exact bins to be used. Note that + IntervalIndex for `bins` must be non-overlapping. right : bool, default True Indicates whether `bins` includes the rightmost edge or not. 
If @@ -217,7 +218,9 @@ def cut(x, bins, right=True, labels=None, retbins=False, precision=3, bins[-1] += adj elif isinstance(bins, IntervalIndex): - pass + if bins.is_overlapping: + raise ValueError('Overlapping IntervalIndex is not accepted.') + else: bins = np.asarray(bins) bins = _convert_bin_to_numeric_type(bins, dtype) @@ -334,8 +337,7 @@ def _bins_to_cuts(x, bins, right=True, labels=None, ids = ensure_int64(bins.searchsorted(x, side=side)) if include_lowest: - # Numpy 1.9 support: ensure this mask is a Numpy array - ids[np.asarray(x == bins[0])] = 1 + ids[x == bins[0]] = 1 na_mask = isna(x) | (ids == len(bins)) | (ids == 0) has_nas = na_mask.any() diff --git a/pandas/core/series.py b/pandas/core/series.py index c9b1a2c45eab38..0d6c9f4d845da2 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -75,7 +75,8 @@ # see gh-16971 def remove_na(arr): - """Remove null values from array like structure. + """ + Remove null values from array like structure. .. deprecated:: 0.21.0 Use s[s.notnull()] instead. @@ -87,7 +88,9 @@ def remove_na(arr): def _coerce_method(converter): - """ install the scalar coercion methods """ + """ + Install the scalar coercion methods. + """ def wrapper(self): if len(self) == 1: @@ -146,6 +149,9 @@ class Series(base.IndexOpsMixin, generic.NDFrame): hasnans = property(base.IndexOpsMixin.hasnans.func, doc=base.IndexOpsMixin.hasnans.__doc__) + # ---------------------------------------------------------------------- + # Constructors + def __init__(self, data=None, index=None, dtype=None, name=None, copy=False, fastpath=False): @@ -310,7 +316,8 @@ def _init_dict(self, data, index=None, dtype=None): @classmethod def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, fastpath=False): - """Construct Series from array. + """ + Construct Series from array. .. deprecated :: 0.23.0 Use pd.Series(..) constructor instead. @@ -324,6 +331,8 @@ def from_array(cls, arr, index=None, name=None, dtype=None, copy=False, return cls(arr, index=index, name=name, dtype=dtype, copy=copy, fastpath=fastpath) + # ---------------------------------------------------------------------- + @property def _constructor(self): return Series @@ -341,7 +350,9 @@ def _can_hold_na(self): _index = None def _set_axis(self, axis, labels, fastpath=False): - """ override generic, we want to set the _typ here """ + """ + Override generic, we want to set the _typ here. + """ if not fastpath: labels = ensure_index(labels) @@ -418,13 +429,23 @@ def ftypes(self): @property def values(self): """ - Return Series as ndarray or ndarray-like - depending on the dtype + Return Series as ndarray or ndarray-like depending on the dtype. + + .. warning:: + + We recommend using :attr:`Series.array` or + :Series:`Index.to_numpy`, depending on whether you need + a reference to the underlying data or a NumPy array. Returns ------- arr : numpy.ndarray or ndarray-like + See Also + -------- + Series.array : Reference to the underlying data. + Series.to_numpy : A NumPy array representing the underlying data. + Examples -------- >>> pd.Series([1, 2, 3]).values @@ -455,8 +476,9 @@ def _values(self): return self._data.internal_values() def _formatting_values(self): - """Return the values that can be formatted (used by SeriesFormatter - and DataFrameFormatter) + """ + Return the values that can be formatted (used by SeriesFormatter + and DataFrameFormatter). 
""" return self._data.formatting_values() @@ -468,7 +490,8 @@ def get_values(self): @property def asobject(self): - """Return object Series which contains boxed values. + """ + Return object Series which contains boxed values. .. deprecated :: 0.23.0 @@ -483,7 +506,7 @@ def asobject(self): # ops def ravel(self, order='C'): """ - Return the flattened underlying data as an ndarray + Return the flattened underlying data as an ndarray. See Also -------- @@ -493,7 +516,7 @@ def ravel(self, order='C'): def compress(self, condition, *args, **kwargs): """ - Return selected slices of an array along given axis as a Series + Return selected slices of an array along given axis as a Series. .. deprecated:: 0.24.0 @@ -510,7 +533,7 @@ def compress(self, condition, *args, **kwargs): def nonzero(self): """ - Return the *integer* indices of the elements that are non-zero + Return the *integer* indices of the elements that are non-zero. This method is equivalent to calling `numpy.nonzero` on the series data. For compatibility with NumPy, the return value is @@ -545,8 +568,7 @@ def nonzero(self): def put(self, *args, **kwargs): """ - Applies the `put` method to its `values` attribute - if it has one. + Applies the `put` method to its `values` attribute if it has one. See Also -------- @@ -556,7 +578,7 @@ def put(self, *args, **kwargs): def __len__(self): """ - return the length of the Series + Return the length of the Series. """ return len(self._data) @@ -629,22 +651,25 @@ def view(self, dtype=None): return self._constructor(self._values.view(dtype), index=self.index).__finalize__(self) + # ---------------------------------------------------------------------- + # NDArray Compat + def __array__(self, result=None): """ - the array interface, return my values + The array interface, return my values. """ return self.get_values() def __array_wrap__(self, result, context=None): """ - Gets called after a ufunc + Gets called after a ufunc. """ return self._constructor(result, index=self.index, copy=False).__finalize__(self) def __array_prepare__(self, result, context=None): """ - Gets called prior to a ufunc + Gets called prior to a ufunc. """ # nice error message for non-ufunc types @@ -659,7 +684,9 @@ def __array_prepare__(self, result, context=None): op=context[0].__name__)) return result - # complex + # ---------------------------------------------------------------------- + # Unary Methods + @property def real(self): return self.values.real @@ -681,6 +708,8 @@ def imag(self, v): __long__ = _coerce_method(int) __int__ = _coerce_method(int) + # ---------------------------------------------------------------------- + def _unpickle_series_compat(self, state): if isinstance(state, dict): self._data = state['_data'] @@ -713,12 +742,14 @@ def _unpickle_series_compat(self, state): # indexers @property def axes(self): - """Return a list of the row axis labels""" + """ + Return a list of the row axis labels. + """ return [self.index] def _ixs(self, i, axis=0): """ - Return the i-th value or values in the Series by location + Return the i-th value or values in the Series by location. Parameters ---------- @@ -1013,7 +1044,8 @@ def repeat(self, repeats, *args, **kwargs): index=new_index).__finalize__(self) def get_value(self, label, takeable=False): - """Quickly retrieve single value at passed index label + """ + Quickly retrieve single value at passed index label. .. deprecated:: 0.21.0 Please use .at[] or .iat[] accessors. 
@@ -1040,9 +1072,11 @@ def _get_value(self, label, takeable=False): _get_value.__doc__ = get_value.__doc__ def set_value(self, label, value, takeable=False): - """Quickly set single value at passed label. If label is not contained, - a new object is created with the label placed at the end of the result - index. + """ + Quickly set single value at passed label. + + If label is not contained, a new object is created with the label + placed at the end of the result index. .. deprecated:: 0.21.0 Please use .at[] or .iat[] accessors. @@ -1213,9 +1247,12 @@ def reset_index(self, level=None, drop=False, name=None, inplace=False): df = self.to_frame(name) return df.reset_index(level=level, drop=drop) + # ---------------------------------------------------------------------- + # Rendering Methods + def __unicode__(self): """ - Return a string representation for a particular DataFrame + Return a string representation for a particular DataFrame. Invoked by unicode(df) in py2 only. Yields a Unicode String in both py2/py3. @@ -1236,7 +1273,7 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, index=True, length=False, dtype=False, name=False, max_rows=None): """ - Render a string representation of the Series + Render a string representation of the Series. Parameters ---------- @@ -1288,9 +1325,11 @@ def to_string(self, buf=None, na_rep='NaN', float_format=None, header=True, with open(buf, 'w') as f: f.write(result) + # ---------------------------------------------------------------------- + def iteritems(self): """ - Lazily iterate over (index, value) tuples + Lazily iterate over (index, value) tuples. """ return zip(iter(self.index), iter(self)) @@ -1300,7 +1339,9 @@ def iteritems(self): # Misc public methods def keys(self): - """Alias for index""" + """ + Alias for index. + """ return self.index def to_dict(self, into=dict): @@ -1339,7 +1380,7 @@ def to_dict(self, into=dict): def to_frame(self, name=None): """ - Convert Series to DataFrame + Convert Series to DataFrame. Parameters ---------- @@ -1360,7 +1401,7 @@ def to_frame(self, name=None): def to_sparse(self, kind='block', fill_value=None): """ - Convert Series to SparseSeries + Convert Series to SparseSeries. Parameters ---------- @@ -1401,7 +1442,7 @@ def _set_name(self, name, inplace=False): def count(self, level=None): """ - Return number of non-NA/null observations in the Series + Return number of non-NA/null observations in the Series. Parameters ---------- @@ -1433,7 +1474,8 @@ def count(self, level=None): dtype='int64').__finalize__(self) def mode(self, dropna=True): - """Return the mode(s) of the dataset. + """ + Return the mode(s) of the dataset. Always returns Series even if only one value is returned. @@ -1796,7 +1838,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): # ndarray compat argmin = deprecate( 'argmin', idxmin, '0.21.0', - msg=dedent("""\ + msg=dedent(""" The current behaviour of 'Series.argmin' is deprecated, use 'idxmin' instead. The behavior of 'argmin' will be corrected to return the positional @@ -1806,7 +1848,7 @@ def idxmax(self, axis=0, skipna=True, *args, **kwargs): ) argmax = deprecate( 'argmax', idxmax, '0.21.0', - msg=dedent("""\ + msg=dedent(""" The current behaviour of 'Series.argmax' is deprecated, use 'idxmax' instead. 
The behavior of 'argmax' will be corrected to return the positional @@ -1899,7 +1941,7 @@ def quantile(self, q=0.5, interpolation='linear'): def corr(self, other, method='pearson', min_periods=None): """ - Compute correlation with `other` Series, excluding missing values + Compute correlation with `other` Series, excluding missing values. Parameters ---------- @@ -1942,7 +1984,7 @@ def corr(self, other, method='pearson', min_periods=None): def cov(self, other, min_periods=None): """ - Compute covariance with Series, excluding missing values + Compute covariance with Series, excluding missing values. Parameters ---------- @@ -2149,11 +2191,15 @@ def dot(self, other): raise TypeError('unsupported type: %s' % type(other)) def __matmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(other) def __rmatmul__(self, other): - """ Matrix multiplication using binary `@` operator in Python>=3.5 """ + """ + Matrix multiplication using binary `@` operator in Python>=3.5. + """ return self.dot(np.transpose(other)) @Substitution(klass='Series') @@ -2250,7 +2296,7 @@ def append(self, to_append, ignore_index=False, verify_integrity=False): def _binop(self, other, func, level=None, fill_value=None): """ - Perform generic binary operation with optional fill value + Perform generic binary operation with optional fill value. Parameters ---------- @@ -2357,7 +2403,7 @@ def combine(self, other, func, fill_value=None): eagle 200.0 falcon 345.0 dtype: float64 -""" + """ if fill_value is None: fill_value = na_value_for_dtype(self.dtype, compat=False) @@ -2439,7 +2485,7 @@ def combine_first(self, other): def update(self, other): """ Modify Series in place using non-NA values from passed - Series. Aligns on index + Series. Aligns on index. Parameters ---------- @@ -2814,7 +2860,7 @@ def sort_index(self, axis=0, level=None, ascending=True, inplace=False, def argsort(self, axis=0, kind='quicksort', order=None): """ Overrides ndarray.argsort. Argsorts the value, omitting NA/null values, - and places the result in the same locations as the non-NA values + and places the result in the same locations as the non-NA values. Parameters ---------- @@ -3040,7 +3086,7 @@ def nsmallest(self, n=5, keep='first'): def swaplevel(self, i=-2, j=-1, copy=True): """ - Swap levels i and j in a MultiIndex + Swap levels i and j in a MultiIndex. Parameters ---------- @@ -3062,8 +3108,9 @@ def swaplevel(self, i=-2, j=-1, copy=True): def reorder_levels(self, order): """ - Rearrange index levels using input order. May not drop or duplicate - levels + Rearrange index levels using input order. + + May not drop or duplicate levels. Parameters ---------- @@ -3208,8 +3255,7 @@ def map(self, arg, na_action=None): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -3290,8 +3336,8 @@ def apply(self, func, convert_dtype=True, args=(), **kwds): """ Invoke function on values of Series. - Can be ufunc (a NumPy function that applies to the entire Series) or a - Python function that only works on single values. + Can be ufunc (a NumPy function that applies to the entire Series) + or a Python function that only works on single values. 
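
The __matmul__/__rmatmul__ docstrings above describe the binary `@` operator, which simply defers to Series.dot (and to dot on the transposed operand for the reflected case). A quick sketch of the equivalence on made-up data, assuming Python 3.5+ for the operator:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0])
    df = pd.DataFrame(np.arange(9.0).reshape(3, 3))

    assert s @ s == s.dot(s)            # Series @ Series -> scalar
    assert (df @ s).equals(df.dot(s))   # DataFrame @ Series -> Series
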
Parameters ---------- @@ -3423,11 +3469,10 @@ def f(x): def _reduce(self, op, name, axis=0, skipna=True, numeric_only=None, filter_type=None, **kwds): """ - perform a reduction operation - - if we have an ndarray as a value, then simply perform the operation, - otherwise delegate to the object + Perform a reduction operation. + If we have an ndarray as a value, then simply perform the operation, + otherwise delegate to the object. """ delegate = self._values @@ -3464,8 +3509,9 @@ def _reindex_indexer(self, new_index, indexer, copy): return self._constructor(new_values, index=new_index) def _needs_reindex_multi(self, axes, method, level): - """ check if we do need a multi reindex; this is for compat with - higher dims + """ + Check if we do need a multi reindex; this is for compat with + higher dims. """ return False @@ -3480,7 +3526,8 @@ def align(self, other, join='outer', axis=None, level=None, copy=True, broadcast_axis=broadcast_axis) def rename(self, index=None, **kwargs): - """Alter Series index labels or name + """ + Alter Series index labels or name. Function / dict values must be unique (1-to-1). Labels not contained in a dict / Series will be left as-is. Extra labels listed don't throw an @@ -3664,7 +3711,8 @@ def shift(self, periods=1, freq=None, axis=0): return super(Series, self).shift(periods=periods, freq=freq, axis=axis) def reindex_axis(self, labels, axis=0, **kwargs): - """Conform Series to new index with optional filling logic. + """ + Conform Series to new index with optional filling logic. .. deprecated:: 0.21.0 Use ``Series.reindex`` instead. @@ -3893,7 +3941,8 @@ def between(self, left, right, inclusive=True): @classmethod def from_csv(cls, path, sep=',', parse_dates=True, header=None, index_col=0, encoding=None, infer_datetime_format=False): - """Read CSV file. + """ + Read CSV file. .. deprecated:: 0.21.0 Use :func:`pandas.read_csv` instead. @@ -4123,7 +4172,8 @@ def dropna(self, axis=0, inplace=False, **kwargs): return self.copy() def valid(self, inplace=False, **kwargs): - """Return Series without null values. + """ + Return Series without null values. .. deprecated:: 0.23.0 Use :meth:`Series.dropna` instead. @@ -4137,7 +4187,7 @@ def valid(self, inplace=False, **kwargs): def to_timestamp(self, freq=None, how='start', copy=True): """ - Cast to datetimeindex of timestamps, at *beginning* of period + Cast to datetimeindex of timestamps, at *beginning* of period. Parameters ---------- @@ -4162,7 +4212,7 @@ def to_timestamp(self, freq=None, how='start', copy=True): def to_period(self, freq=None, copy=True): """ Convert Series from DatetimeIndex to PeriodIndex with desired - frequency (inferred from index if not passed) + frequency (inferred from index if not passed). Parameters ---------- @@ -4210,8 +4260,9 @@ def to_period(self, freq=None, copy=True): def _sanitize_index(data, index, copy=False): - """ sanitize an index type to return an ndarray of the underlying, pass - thru a non-Index + """ + Sanitize an index type to return an ndarray of the underlying, pass + through a non-Index. """ if index is None: @@ -4238,8 +4289,9 @@ def _sanitize_index(data, index, copy=False): def _sanitize_array(data, index, dtype=None, copy=False, raise_cast_failure=False): - """ sanitize input data to an ndarray, copy if specified, coerce to the - dtype if specified + """ + Sanitize input data to an ndarray, copy if specified, coerce to the + dtype if specified. 
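
The reworded rename docstring above is a reminder of the user-facing contract: the mapping must be 1-to-1, labels missing from it are left alone, and a scalar or other hashable sets the Series name instead. A small sketch with arbitrary labels:

    import pandas as pd

    s = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
    print(s.rename({'a': 'x'}).index.tolist())   # ['x', 'b', 'c']; 'b' and 'c' untouched
    print(s.rename('totals').name)               # 'totals'; a scalar sets the name
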
""" if dtype is not None: diff --git a/pandas/core/sparse/frame.py b/pandas/core/sparse/frame.py index a25ffa2744cb79..f1c46abfab0b21 100644 --- a/pandas/core/sparse/frame.py +++ b/pandas/core/sparse/frame.py @@ -339,9 +339,8 @@ def to_dense(self): def _apply_columns(self, func): """ get new SparseDataFrame applying func to each columns """ - new_data = {} - for col, series in compat.iteritems(self): - new_data[col] = func(series) + new_data = {col: func(series) + for col, series in compat.iteritems(self)} return self._constructor( data=new_data, index=self.index, columns=self.columns, diff --git a/pandas/core/strings.py b/pandas/core/strings.py index 0b791f6f91aa36..e639e2e65e648d 100644 --- a/pandas/core/strings.py +++ b/pandas/core/strings.py @@ -15,7 +15,7 @@ from pandas.core.dtypes.common import ( ensure_object, is_bool_dtype, is_categorical_dtype, is_integer, is_list_like, is_object_dtype, is_re, is_scalar, is_string_like) -from pandas.core.dtypes.generic import ABCIndex, ABCSeries +from pandas.core.dtypes.generic import ABCIndexClass, ABCSeries from pandas.core.dtypes.missing import isna from pandas.core.algorithms import take_1d @@ -931,7 +931,7 @@ def str_extractall(arr, pat, flags=0): if regex.groups == 0: raise ValueError("pattern contains no capture groups") - if isinstance(arr, ABCIndex): + if isinstance(arr, ABCIndexClass): arr = arr.to_series().reset_index(drop=True) names = dict(zip(regex.groupindex.values(), regex.groupindex.keys())) @@ -1854,7 +1854,7 @@ def __iter__(self): def _wrap_result(self, result, use_codes=True, name=None, expand=None, fill_value=np.nan): - from pandas.core.index import Index, MultiIndex + from pandas import Index, Series, MultiIndex # for category, we do the stuff on the categories, so blow it up # to the full series again @@ -1862,7 +1862,8 @@ def _wrap_result(self, result, use_codes=True, # so make it possible to skip this step as the method already did this # before the transformation... 
if use_codes and self._is_categorical: - result = take_1d(result, self._orig.cat.codes, + # if self._orig is a CategoricalIndex, there is no .cat-accessor + result = take_1d(result, Series(self._orig, copy=False).cat.codes, fill_value=fill_value) if not hasattr(result, 'ndim') or not hasattr(result, 'dtype'): @@ -2260,9 +2261,11 @@ def cat(self, others=None, sep=None, na_rep=None, join=None): result = cat_core(all_cols, sep) if isinstance(self._orig, Index): - result = Index(result, name=self._orig.name) + # add dtype for case that result is all-NA + result = Index(result, dtype=object, name=self._orig.name) else: # Series - result = Series(result, index=data.index, name=self._orig.name) + result = Series(result, dtype=object, index=data.index, + name=self._orig.name) return result _shared_docs['str_split'] = (""" diff --git a/pandas/core/tools/datetimes.py b/pandas/core/tools/datetimes.py index 2c6fdb3eaf03c5..7a87e33c7f97e3 100644 --- a/pandas/core/tools/datetimes.py +++ b/pandas/core/tools/datetimes.py @@ -171,6 +171,9 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, - ndarray of Timestamps if box=False """ from pandas import DatetimeIndex + from pandas.core.arrays.datetimes import ( + maybe_convert_dtype, objects_to_datetime64ns) + if isinstance(arg, (list, tuple)): arg = np.array(arg, dtype='O') @@ -208,6 +211,11 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, raise TypeError('arg must be a string, datetime, list, tuple, ' '1-d array, or Series') + # warn if passing timedelta64, raise for PeriodDtype + # NB: this must come after unit transformation + orig_arg = arg + arg, _ = maybe_convert_dtype(arg, copy=False) + arg = ensure_object(arg) require_iso8601 = False @@ -224,14 +232,18 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, require_iso8601 = not infer_datetime_format format = None - try: - result = None + tz_parsed = None + result = None - if format is not None: + if format is not None: + try: # shortcut formatting here if format == '%Y%m%d': try: - result = _attempt_YYYYMMDD(arg, errors=errors) + # pass orig_arg as float-dtype may have been converted to + # datetime64[ns] + orig_arg = ensure_object(orig_arg) + result = _attempt_YYYYMMDD(orig_arg, errors=errors) except (ValueError, TypeError, tslibs.OutOfBoundsDatetime): raise ValueError("cannot convert the input to " "'%Y%m%d' date format") @@ -256,45 +268,45 @@ def _convert_listlike_datetimes(arg, box, format, name=None, tz=None, if errors == 'raise': raise result = arg - - if result is None and (format is None or infer_datetime_format): - result, tz_parsed = tslib.array_to_datetime( - arg, - errors=errors, - utc=tz == 'utc', - dayfirst=dayfirst, - yearfirst=yearfirst, - require_iso8601=require_iso8601 - ) - if tz_parsed is not None: - if box: - # We can take a shortcut since the datetime64 numpy array - # is in UTC - return DatetimeIndex._simple_new(result, name=name, - tz=tz_parsed) - else: - # Convert the datetime64 numpy array to an numpy array - # of datetime objects - result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() - for ts in result] - return np.array(result, dtype=object) - + except ValueError as e: + # Fallback to try to convert datetime objects if timezone-aware + # datetime objects are found without passing `utc=True` + try: + values, tz = conversion.datetime_to_datetime64(arg) + return DatetimeIndex._simple_new(values, name=name, tz=tz) + except (ValueError, TypeError): + raise e + + if result is None: + assert format is None or 
infer_datetime_format + utc = tz == 'utc' + result, tz_parsed = objects_to_datetime64ns( + arg, dayfirst=dayfirst, yearfirst=yearfirst, + utc=utc, errors=errors, require_iso8601=require_iso8601, + allow_object=True) + + if tz_parsed is not None: if box: - # Ensure we return an Index in all cases where box=True - if is_datetime64_dtype(result): - return DatetimeIndex(result, tz=tz, name=name) - elif is_object_dtype(result): - # e.g. an Index of datetime objects - from pandas import Index - return Index(result, name=name) - return result + # We can take a shortcut since the datetime64 numpy array + # is in UTC + return DatetimeIndex._simple_new(result, name=name, + tz=tz_parsed) + else: + # Convert the datetime64 numpy array to an numpy array + # of datetime objects + result = [Timestamp(ts, tz=tz_parsed).to_pydatetime() + for ts in result] + return np.array(result, dtype=object) - except ValueError as e: - try: - values, tz = conversion.datetime_to_datetime64(arg) - return DatetimeIndex._simple_new(values, name=name, tz=tz) - except (ValueError, TypeError): - raise e + if box: + # Ensure we return an Index in all cases where box=True + if is_datetime64_dtype(result): + return DatetimeIndex(result, tz=tz, name=name) + elif is_object_dtype(result): + # e.g. an Index of datetime objects + from pandas import Index + return Index(result, name=name) + return result def _adjust_to_origin(arg, origin, unit): diff --git a/pandas/core/tools/timedeltas.py b/pandas/core/tools/timedeltas.py index 7b0a3da738436b..6bcf56c306e6a6 100644 --- a/pandas/core/tools/timedeltas.py +++ b/pandas/core/tools/timedeltas.py @@ -50,7 +50,7 @@ def to_timedelta(arg, unit='ns', box=True, errors='raise'): timedelta64 or numpy.array of timedelta64 Output type returned if parsing succeeded. - See also + See Also -------- DataFrame.astype : Cast argument to a specified dtype. to_datetime : Convert argument to datetime. diff --git a/pandas/core/window.py b/pandas/core/window.py index 4c67b04e89ba83..68a36fb2a69993 100644 --- a/pandas/core/window.py +++ b/pandas/core/window.py @@ -1,9 +1,6 @@ """ - -provide a generic structure to support window functions, -similar to how we have a Groupby object - - +Provide a generic structure to support window functions, +similar to how we have a Groupby object. """ from __future__ import division @@ -33,15 +30,14 @@ _shared_docs = dict(**_shared_docs) _doc_template = """ + Returns + ------- + same type as input -Returns -------- -same type as input - -See Also --------- -pandas.Series.%(name)s -pandas.DataFrame.%(name)s + See Also + -------- + Series.%(name)s + DataFrame.%(name)s """ @@ -95,14 +91,17 @@ def validate(self): "'neither'") def _convert_freq(self): - """ resample according to the how, return a new object """ - + """ + Resample according to the how, return a new object. + """ obj = self._selected_obj index = None return obj, index def _create_blocks(self): - """ split data into blocks & return conformed data """ + """ + Split data into blocks & return conformed data. + """ obj, index = self._convert_freq() if index is not None: @@ -119,8 +118,7 @@ def _create_blocks(self): def _gotitem(self, key, ndim, subset=None): """ - sub-classes to define - return a sliced object + Sub-classes to define. Return a sliced object. Parameters ---------- @@ -161,7 +159,9 @@ def _window_type(self): return self.__class__.__name__ def __unicode__(self): - """ provide a nice str repr of our rolling object """ + """ + Provide a nice str repr of our rolling object. 
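
The datetimes.py restructuring above folds the old tslib.array_to_datetime call into objects_to_datetime64ns while keeping two user-visible behaviours intact: the '%Y%m%d' fast path and the ValueError fallback that lets timezone-aware datetime objects parse without utc=True. A sketch of both from the caller's side (Python 3 assumed for datetime.timezone):

    from datetime import datetime, timezone
    import pandas as pd

    # fast path handled by _attempt_YYYYMMDD
    print(pd.to_datetime(['20181126', '20181127'], format='%Y%m%d'))

    # fallback path: tz-aware datetime objects still parse without utc=True
    print(pd.to_datetime([datetime(2018, 11, 26, 12, tzinfo=timezone.utc)]))
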
+ """ attrs = ["{k}={v}".format(k=k, v=getattr(self, k)) for k in self._attributes @@ -175,7 +175,7 @@ def __iter__(self): def _get_index(self, index=None): """ - Return index as ndarrays + Return index as ndarrays. Returns ------- @@ -219,7 +219,9 @@ def _prep_values(self, values=None, kill_inf=True): return values def _wrap_result(self, result, block=None, obj=None): - """ wrap a single result """ + """ + Wrap a single result. + """ if obj is None: obj = self._selected_obj @@ -243,7 +245,7 @@ def _wrap_result(self, result, block=None, obj=None): def _wrap_results(self, results, blocks, obj): """ - wrap the results + Wrap the results. Parameters ---------- @@ -288,7 +290,9 @@ def _wrap_results(self, results, blocks, obj): return concat(final, axis=1).reindex(columns=columns, copy=False) def _center_window(self, result, window): - """ center the result in the window """ + """ + Center the result in the window. + """ if self.axis > result.ndim - 1: raise ValueError("Requested axis is larger then no. of argument " "dimensions") @@ -606,8 +610,8 @@ def validate(self): def _prep_window(self, **kwargs): """ - provide validation for our window type, return the window - we have already been validated + Provide validation for our window type, return the window + we have already been validated. """ window = self._get_window() @@ -757,7 +761,9 @@ def mean(self, *args, **kwargs): class _GroupByMixin(GroupByMixin): - """ provide the groupby facilities """ + """ + Provide the groupby facilities. + """ def __init__(self, obj, *args, **kwargs): parent = kwargs.pop('parent', None) # noqa @@ -776,8 +782,8 @@ def __init__(self, obj, *args, **kwargs): def _apply(self, func, name, window=None, center=None, check_minp=None, **kwargs): """ - dispatch to apply; we are stripping all of the _apply kwargs and - performing the original function call on the grouped object + Dispatch to apply; we are stripping all of the _apply kwargs and + performing the original function call on the grouped object. """ def f(x, name=name, *args): @@ -800,8 +806,9 @@ def _constructor(self): def _apply(self, func, name=None, window=None, center=None, check_minp=None, **kwargs): """ - Rolling statistical measure using supplied function. Designed to be - used with passed-in Cython array-based functions. + Rolling statistical measure using supplied function. + + Designed to be used with passed-in Cython array-based functions. Parameters ---------- @@ -937,7 +944,7 @@ def count(self): return self._wrap_results(results, blocks, obj) _shared_docs['apply'] = dedent(r""" - %(name)s function apply + %(name)s function apply. Parameters ---------- @@ -995,7 +1002,7 @@ def sum(self, *args, **kwargs): return self._apply('roll_sum', 'sum', **kwargs) _shared_docs['max'] = dedent(""" - %(name)s maximum + Calculate the %(name)s maximum. """) def max(self, *args, **kwargs): @@ -1259,7 +1266,7 @@ def kurt(self, **kwargs): check_minp=_require_min_periods(4), **kwargs) _shared_docs['quantile'] = dedent(""" - %(name)s quantile. + Calculate the %(name)s quantile. Parameters ---------- @@ -1332,23 +1339,25 @@ def f(arg, *args, **kwargs): return self._apply(f, 'quantile', quantile=quantile, **kwargs) - _shared_docs['cov'] = dedent(""" - %(name)s sample covariance + _shared_docs['cov'] = """ + Calculate the %(name)s sample covariance. 
- Parameters - ---------- - other : Series, DataFrame, or ndarray, optional - if not supplied then will default to self and produce pairwise output - pairwise : bool, default None - If False then only matching columns between self and other will be used - and the output will be a DataFrame. - If True then all pairwise combinations will be calculated and the - output will be a MultiIndexed DataFrame in the case of DataFrame - inputs. In the case of missing elements, only complete pairwise - observations will be used. - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is ``N - ddof``, where ``N`` represents the number of elements.""") + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndexed DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + ddof : int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is ``N - ddof``, where ``N`` represents the number of elements. + """ def cov(self, other=None, pairwise=None, ddof=1, **kwargs): if other is None: @@ -1485,7 +1494,7 @@ def _get_cov(X, Y): Y 0.626300 1.000000 4 X 1.000000 0.555368 Y 0.555368 1.000000 -""") + """) def corr(self, other=None, pairwise=None, **kwargs): if other is None: @@ -1566,14 +1575,18 @@ def validate(self): "and offset based windows") def _validate_monotonic(self): - """ validate on is monotonic """ + """ + Validate on is_monotonic. + """ if not self._on.is_monotonic: formatted = self.on or 'index' raise ValueError("{0} must be " "monotonic".format(formatted)) def _validate_freq(self): - """ validate & return window frequency """ + """ + Validate & return window frequency. + """ from pandas.tseries.frequencies import to_offset try: return to_offset(self.window) @@ -1760,7 +1773,7 @@ def corr(self, other=None, pairwise=None, **kwargs): class RollingGroupby(_GroupByMixin, Rolling): """ - Provides a rolling groupby implementation + Provides a rolling groupby implementation. .. versionadded:: 0.18.1 @@ -1781,10 +1794,10 @@ def _gotitem(self, key, ndim, subset=None): def _validate_monotonic(self): """ - validate that on is monotonic; + Validate that on is monotonic; we don't care for groupby.rolling because we have already validated at a higher - level + level. """ pass @@ -2031,7 +2044,7 @@ def corr(self, other=None, pairwise=None, **kwargs): class ExpandingGroupby(_GroupByMixin, Expanding): """ - Provides a expanding groupby implementation + Provides a expanding groupby implementation. .. versionadded:: 0.18.1 @@ -2042,34 +2055,33 @@ def _constructor(self): _bias_template = """ - -Parameters ----------- -bias : bool, default False - Use a standard estimation bias correction + Parameters + ---------- + bias : bool, default False + Use a standard estimation bias correction """ _pairwise_template = """ - -Parameters ----------- -other : Series, DataFrame, or ndarray, optional - if not supplied then will default to self and produce pairwise output -pairwise : bool, default None - If False then only matching columns between self and other will be used and - the output will be a DataFrame. 
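
RollingGroupby, touched above, skips the monotonic check because validation already happened at a higher level, while a plain offset-based rolling still requires a monotonic `on` column. A small sketch of both (column names are made up):

    import pandas as pd

    df = pd.DataFrame({'key': ['a', 'a', 'b', 'b', 'b'],
                       'val': [1.0, 2.0, 3.0, 4.0, 5.0]})
    print(df.groupby('key')['val'].rolling(2).mean())   # per-group windows

    ts = pd.DataFrame({'t': pd.date_range('2018-01-01', periods=5),
                       'val': range(5)})
    print(ts.rolling('2D', on='t').sum())               # 't' must be monotonic
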
- If True then all pairwise combinations will be calculated and the output - will be a MultiIndex DataFrame in the case of DataFrame inputs. - In the case of missing elements, only complete pairwise observations will - be used. -bias : bool, default False - Use a standard estimation bias correction + Parameters + ---------- + other : Series, DataFrame, or ndarray, optional + If not supplied then will default to self and produce pairwise + output. + pairwise : bool, default None + If False then only matching columns between self and other will be + used and the output will be a DataFrame. + If True then all pairwise combinations will be calculated and the + output will be a MultiIndex DataFrame in the case of DataFrame + inputs. In the case of missing elements, only complete pairwise + observations will be used. + bias : bool, default False + Use a standard estimation bias correction """ class EWM(_Rolling): r""" - Provides exponential weighted functions + Provides exponential weighted functions. .. versionadded:: 0.18.0 @@ -2219,7 +2231,8 @@ def aggregate(self, arg, *args, **kwargs): agg = aggregate def _apply(self, func, **kwargs): - """Rolling statistical measure using supplied function. Designed to be + """ + Rolling statistical measure using supplied function. Designed to be used with passed-in Cython array-based functions. Parameters @@ -2261,7 +2274,9 @@ def func(arg): @Substitution(name='ewm') @Appender(_doc_template) def mean(self, *args, **kwargs): - """exponential weighted moving average""" + """ + Exponential weighted moving average. + """ nv.validate_window_func('mean', args, kwargs) return self._apply('ewma', **kwargs) @@ -2269,7 +2284,9 @@ def mean(self, *args, **kwargs): @Appender(_doc_template) @Appender(_bias_template) def std(self, bias=False, *args, **kwargs): - """exponential weighted moving stddev""" + """ + Exponential weighted moving stddev. + """ nv.validate_window_func('std', args, kwargs) return _zsqrt(self.var(bias=bias, **kwargs)) @@ -2279,7 +2296,9 @@ def std(self, bias=False, *args, **kwargs): @Appender(_doc_template) @Appender(_bias_template) def var(self, bias=False, *args, **kwargs): - """exponential weighted moving variance""" + """ + Exponential weighted moving variance. + """ nv.validate_window_func('var', args, kwargs) def f(arg): @@ -2293,7 +2312,9 @@ def f(arg): @Appender(_doc_template) @Appender(_pairwise_template) def cov(self, other=None, pairwise=None, bias=False, **kwargs): - """exponential weighted sample covariance""" + """ + Exponential weighted sample covariance. + """ if other is None: other = self._selected_obj # only default unset @@ -2316,7 +2337,9 @@ def _get_cov(X, Y): @Appender(_doc_template) @Appender(_pairwise_template) def corr(self, other=None, pairwise=None, **kwargs): - """exponential weighted sample correlation""" + """ + Exponential weighted sample correlation. 
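
The EWM docstrings above get the same one-line-summary treatment, and the bias/pairwise parameter templates are re-indented so they render cleanly under each method. From the user's side the methods read as below (span chosen arbitrarily):

    import pandas as pd

    s = pd.Series([1.0, 2.0, 3.0, 4.0, 5.0])

    print(s.ewm(span=3).mean())            # exponential weighted moving average
    print(s.ewm(span=3).std(bias=False))   # bias keyword from _bias_template
    print(s.ewm(span=3).corr(s.shift(1)))  # other= form from _pairwise_template
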
+ """ if other is None: other = self._selected_obj # only default unset @@ -2456,9 +2479,8 @@ def dataframe_from_int_dict(data, frame_template): else: raise ValueError("'pairwise' is not True/False") else: - results = {} - for i, col in enumerate(arg1.columns): - results[i] = f(*_prep_binary(arg1.iloc[:, i], arg2)) + results = {i: f(*_prep_binary(arg1.iloc[:, i], arg2)) + for i, col in enumerate(arg1.columns)} return dataframe_from_int_dict(results, arg1) else: diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 1328713736b036..03d873467dc10a 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -586,10 +586,9 @@ def _parse_cell(cell_contents, cell_typ): usecols = _maybe_convert_usecols(usecols) for i in range(sheet.nrows): - row = [] - for j, (value, typ) in enumerate(zip(sheet.row_values(i), - sheet.row_types(i))): - row.append(_parse_cell(value, typ)) + row = [_parse_cell(value, typ) + for value, typ in zip(sheet.row_values(i), + sheet.row_types(i))] data.append(row) if sheet.nrows == 0: @@ -662,10 +661,14 @@ def _parse_cell(cell_contents, cell_typ): output[asheetname] = parser.read(nrows=nrows) - if ((not squeeze or isinstance(output[asheetname], DataFrame)) - and header_names): - output[asheetname].columns = output[ - asheetname].columns.set_names(header_names) + if not squeeze or isinstance(output[asheetname], DataFrame): + if header_names: + output[asheetname].columns = output[ + asheetname].columns.set_names(header_names) + elif compat.PY2: + output[asheetname].columns = _maybe_convert_to_string( + output[asheetname].columns) + except EmptyDataError: # No Data, return an empty DataFrame output[asheetname] = DataFrame() @@ -810,6 +813,39 @@ def _trim_excel_header(row): return row +def _maybe_convert_to_string(row): + """ + Convert elements in a row to string from Unicode. + + This is purely a Python 2.x patch and is performed ONLY when all + elements of the row are string-like. + + Parameters + ---------- + row : array-like + The row of data to convert. + + Returns + ------- + converted : array-like + """ + if compat.PY2: + converted = [] + + for i in range(len(row)): + if isinstance(row[i], compat.string_types): + try: + converted.append(str(row[i])) + except UnicodeEncodeError: + break + else: + break + else: + row = converted + + return row + + def _fill_mi_header(row, control_row): """Forward fills blank entries in row, but only inside the same parent index @@ -838,7 +874,7 @@ def _fill_mi_header(row, control_row): control_row[i] = False last = row[i] - return row, control_row + return _maybe_convert_to_string(row), control_row # fill blank if index_col not None diff --git a/pandas/io/formats/format.py b/pandas/io/formats/format.py index 4c08ee89c33dfc..b35f5d1e548b74 100644 --- a/pandas/io/formats/format.py +++ b/pandas/io/formats/format.py @@ -16,7 +16,7 @@ from pandas.compat import StringIO, lzip, map, u, zip from pandas.core.dtypes.common import ( - is_categorical_dtype, is_datetime64_dtype, is_datetimetz, is_float, + is_categorical_dtype, is_datetime64_dtype, is_datetime64tz_dtype, is_float, is_float_dtype, is_integer, is_integer_dtype, is_interval_dtype, is_list_like, is_numeric_dtype, is_period_arraylike, is_scalar, is_timedelta64_dtype) @@ -730,12 +730,8 @@ def to_html(self, classes=None, notebook=False, border=None): .. 
versionadded:: 0.19.0 """ from pandas.io.formats.html import HTMLFormatter - html_renderer = HTMLFormatter(self, classes=classes, - max_rows=self.max_rows, - max_cols=self.max_cols, - notebook=notebook, - border=border, - table_id=self.table_id) + html_renderer = HTMLFormatter(self, classes=classes, notebook=notebook, + border=border, table_id=self.table_id) if hasattr(self.buf, 'write'): html_renderer.write_result(self.buf) elif isinstance(self.buf, compat.string_types): @@ -856,7 +852,7 @@ def format_array(values, formatter, float_format=None, na_rep='NaN', fmt_klass = PeriodArrayFormatter elif is_integer_dtype(values.dtype): fmt_klass = IntArrayFormatter - elif is_datetimetz(values): + elif is_datetime64tz_dtype(values): fmt_klass = Datetime64TZFormatter elif is_datetime64_dtype(values.dtype): fmt_klass = Datetime64Formatter diff --git a/pandas/io/formats/html.py b/pandas/io/formats/html.py index bc2de210df3f48..bf92ce7ee0f679 100644 --- a/pandas/io/formats/html.py +++ b/pandas/io/formats/html.py @@ -25,8 +25,8 @@ class HTMLFormatter(TableFormatter): indent_delta = 2 - def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, - notebook=False, border=None, table_id=None): + def __init__(self, formatter, classes=None, notebook=False, border=None, + table_id=None): self.fmt = formatter self.classes = classes @@ -35,18 +35,21 @@ def __init__(self, formatter, classes=None, max_rows=None, max_cols=None, self.elements = [] self.bold_rows = self.fmt.kwds.get('bold_rows', False) self.escape = self.fmt.kwds.get('escape', True) - - self.max_rows = max_rows or len(self.fmt.frame) - self.max_cols = max_cols or len(self.fmt.columns) self.show_dimensions = self.fmt.show_dimensions - self.is_truncated = (self.max_rows < len(self.fmt.frame) or - self.max_cols < len(self.fmt.columns)) self.notebook = notebook if border is None: border = get_option('display.html.border') self.border = border self.table_id = table_id + @property + def is_truncated(self): + return self.fmt.is_truncated + + @property + def ncols(self): + return len(self.fmt.tr_frame.columns) + def write(self, s, indent=0): rs = pprint_thing(s) self.elements.append(' ' * indent + rs) @@ -301,12 +304,8 @@ def _write_header(self, indent): if all((self.fmt.has_index_names, self.fmt.index, self.fmt.show_index_names)): - row = ([x if x is not None else '' - for x in self.frame.index.names] + - [''] * min(len(self.columns), self.max_cols)) - if truncate_h: - ins_col = row_levels + self.fmt.tr_col_num - row.insert(ins_col, '') + row = ([x if x is not None else '' for x in self.frame.index.names] + + [''] * (self.ncols + (1 if truncate_h else 0))) self.write_tr(row, indent, self.indent_delta, header=True) indent -= self.indent_delta @@ -318,9 +317,7 @@ def _write_body(self, indent): self.write('', indent) indent += self.indent_delta - fmt_values = {} - for i in range(min(len(self.columns), self.max_cols)): - fmt_values[i] = self.fmt._format_col(i) + fmt_values = {i: self.fmt._format_col(i) for i in range(self.ncols)} # write values if self.fmt.index and isinstance(self.frame.index, ABCMultiIndex): @@ -338,7 +335,6 @@ def _write_regular_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v - ncols = len(self.fmt.tr_frame.columns) nrows = len(self.fmt.tr_frame) if self.fmt.index: @@ -362,7 +358,7 @@ def _write_regular_rows(self, fmt_values, indent): row = [] if self.fmt.index: row.append(index_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) + row.extend(fmt_values[j][i] for j 
in range(self.ncols)) if truncate_h: dot_col_ix = self.fmt.tr_col_num + row_levels @@ -376,7 +372,6 @@ def _write_hierarchical_rows(self, fmt_values, indent): truncate_h = self.fmt.truncate_h truncate_v = self.fmt.truncate_v frame = self.fmt.tr_frame - ncols = len(frame.columns) nrows = len(frame) row_levels = self.frame.index.nlevels @@ -454,7 +449,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): j += 1 row.append(v) - row.extend(fmt_values[j][i] for j in range(ncols)) + row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: row.insert(row_levels - sparse_offset + self.fmt.tr_col_num, '...') @@ -466,7 +461,7 @@ def _write_hierarchical_rows(self, fmt_values, indent): sparsify=False, adjoin=False, names=False))) row = [] row.extend(idx_values[i]) - row.extend(fmt_values[j][i] for j in range(ncols)) + row.extend(fmt_values[j][i] for j in range(self.ncols)) if truncate_h: row.insert(row_levels + self.fmt.tr_col_num, '...') self.write_tr(row, indent, self.indent_delta, tags=None, diff --git a/pandas/io/formats/printing.py b/pandas/io/formats/printing.py index f814bf965a1e99..e671571560b192 100644 --- a/pandas/io/formats/printing.py +++ b/pandas/io/formats/printing.py @@ -110,10 +110,10 @@ def _pprint_seq(seq, _nest_lvl=0, max_seq_items=None, **kwds): nitems = max_seq_items or get_option("max_seq_items") or len(seq) s = iter(seq) - r = [] - for i in range(min(nitems, len(seq))): # handle sets, no slicing - r.append(pprint_thing( - next(s), _nest_lvl + 1, max_seq_items=max_seq_items, **kwds)) + # handle sets, no slicing + r = [pprint_thing(next(s), + _nest_lvl + 1, max_seq_items=max_seq_items, **kwds) + for i in range(min(nitems, len(seq)))] body = ", ".join(r) if nitems < len(seq): diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 2c018598a6a6e1..4fdcb978b4695a 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -53,8 +53,7 @@ def _mpl(func): class Styler(object): """ - Helps style a DataFrame or Series according to the - data with HTML and CSS. + Helps style a DataFrame or Series according to the data with HTML and CSS. Parameters ---------- @@ -157,7 +156,9 @@ def default_display_func(x): self._display_funcs = defaultdict(lambda: default_display_func) def _repr_html_(self): - """Hooks into Jupyter notebook rich display system.""" + """ + Hooks into Jupyter notebook rich display system. + """ return self.render() @Appender(_shared_docs['to_excel'] % dict( @@ -187,7 +188,7 @@ def to_excel(self, excel_writer, sheet_name='Sheet1', na_rep='', def _translate(self): """ Convert the DataFrame in `self.data` and the attrs from `_build_styles` - into a dictionary of {head, body, uuid, cellstyle} + into a dictionary of {head, body, uuid, cellstyle}. """ table_styles = self.table_styles or [] caption = self.caption @@ -417,7 +418,8 @@ def format(self, formatter, subset=None): return self def render(self, **kwargs): - """Render the built up styles to HTML + """ + Render the built up styles to HTML. Parameters ---------- @@ -467,8 +469,9 @@ def render(self, **kwargs): def _update_ctx(self, attrs): """ - update the state of the Styler. Collects a mapping - of {index_label: [': ']} + Update the state of the Styler. + + Collects a mapping of {index_label: [': ']}. attrs : Series or DataFrame should contain strings of ': ;: ' @@ -504,7 +507,8 @@ def __deepcopy__(self, memo): return self._copy(deepcopy=True) def clear(self): - """"Reset" the styler, removing any previously applied styles. 
+ """ + Reset the styler, removing any previously applied styles. Returns None. """ self.ctx.clear() @@ -696,9 +700,10 @@ def set_precision(self, precision): def set_table_attributes(self, attributes): """ - Set the table attributes. These are the items - that show up in the opening ```` tag in addition - to to automatic (by default) id. + Set the table attributes. + + These are the items that show up in the opening ``
<table>`` tag
+        in addition to to automatic (by default) id.

         Parameters
         ----------
@@ -720,6 +725,7 @@ def set_table_attributes(self, attributes):

     def export(self):
         """
         Export the styles to applied to the current Styler.
+
         Can be applied to a second style with ``Styler.use``.

         Returns
@@ -785,8 +791,9 @@ def set_caption(self, caption):

     def set_table_styles(self, table_styles):
         """
-        Set the table styles on a Styler. These are placed in a
-        ``