diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md deleted file mode 100644 index 765c1b8bff62e..0000000000000 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ /dev/null @@ -1,39 +0,0 @@ ---- - -name: Bug Report -about: Create a bug report to help us improve pandas -title: "BUG:" -labels: "Bug, Needs Triage" - ---- - -- [ ] I have checked that this issue has not already been reported. - -- [ ] I have confirmed this bug exists on the latest version of pandas. - -- [ ] (optional) I have confirmed this bug exists on the master branch of pandas. - ---- - -**Note**: Please read [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) detailing how to provide the necessary information for us to reproduce your bug. - -#### Code Sample, a copy-pastable example - -```python -# Your code here - -``` - -#### Problem description - -[this should explain **why** the current behaviour is a problem and why the expected output is a better solution] - -#### Expected Output - -#### Output of ``pd.show_versions()`` - -
- -[paste the output of ``pd.show_versions()`` here leaving a blank line after the details tag] - -
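The YAML form that replaces this markdown template (next file in the diff) asks reporters for a minimal, copy-pastable example plus the output of ``pd.show_versions()``. A rough sketch of what a filled-in report might contain — the DataFrame and the groupby call here are hypothetical, purely for illustration:

```python
# Sketch of a minimal, copy-pastable report (the DataFrame and the groupby
# call are hypothetical; substitute the code that actually triggers the bug).
import pandas as pd

df = pd.DataFrame({"key": ["a", "a", "b"], "val": [1, 2, 3]})

# Show the smallest snippet that reproduces the behaviour being reported...
print(df.groupby("key")["val"].sum())

# ...and paste the environment details requested by the form.
pd.show_versions()
```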
diff --git a/.github/ISSUE_TEMPLATE/bug_report.yaml b/.github/ISSUE_TEMPLATE/bug_report.yaml new file mode 100644 index 0000000000000..0f50eb47607cd --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.yaml @@ -0,0 +1,67 @@ +name: Bug Report +description: Report incorrect behavior in the pandas library +title: "BUG: " +labels: [Bug, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this bug exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this bug exists on the master branch of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please follow [this guide](https://matthewrocklin.com/blog/work/2018/02/28/minimal-bug-reports) on how to + provide a minimal, copy-pastable example. + placeholder: > + import pandas as pd + + df = pd.DataFrame(range(5)) + + ... + render: python + validations: + required: true + - type: textarea + id: problem + attributes: + label: Issue Description + description: > + Please provide a description of the issue shown in the reproducible example. + validations: + required: true + - type: textarea + id: expected-behavior + attributes: + label: Expected Behavior + description: > + Please describe or show a code example of the expected behavior. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true diff --git a/.github/ISSUE_TEMPLATE/installation_issue.yaml b/.github/ISSUE_TEMPLATE/installation_issue.yaml new file mode 100644 index 0000000000000..d5db0d1c83a41 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/installation_issue.yaml @@ -0,0 +1,65 @@ +name: Installation Issue +description: Report issues installing the pandas library on your system +title: "BUILD: " +labels: [Build, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + options: + - label: > + I have read the [installation guide](https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html#installing-pandas). + required: true + - type: input + id: platform + attributes: + label: Platform + description: > + Please provide the output of ``import platform; print(platform.platform())`` + validations: + required: true + - type: dropdown + id: method + attributes: + label: Installation Method + description: > + Please provide how you tried to install pandas from a clean environment. + options: + - pip install + - conda install + - apt-get install + - Built from source + - Other + validations: + required: true + - type: input + id: pandas + attributes: + label: pandas Version + description: > + Please provide the version of pandas you are trying to install. + validations: + required: true + - type: input + id: python + attributes: + label: Python Version + description: > + Please provide the installed version of Python. + validations: + required: true + - type: textarea + id: logs + attributes: + label: Installation Logs + description: > + If possible, please copy and paste the installation logs when attempting to install pandas. + value: > +
+ + + Replace this line with the installation logs. + + +
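The installation form above asks for the platform string, the installed Python version, and the pandas version being installed. A small sketch of how those three fields could be collected in one session — standard-library calls only, with a ``try``/``except`` to cover the case where pandas itself failed to install:

```python
# Gather the fields requested by the installation-issue form.
import platform
import sys

print(platform.platform())  # "Platform" field
print(sys.version)          # "Python Version" field

try:
    import pandas as pd
    print(pd.__version__)   # "pandas Version" field, if the install succeeded
except ImportError as exc:
    print(f"pandas could not be imported: {exc}")
```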
diff --git a/.github/ISSUE_TEMPLATE/performance_issue.yaml b/.github/ISSUE_TEMPLATE/performance_issue.yaml new file mode 100644 index 0000000000000..2dcfc94f4a604 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/performance_issue.yaml @@ -0,0 +1,52 @@ +name: Performance Issue +description: Report slow performance or memory issues when running pandas code +title: "PERF: " +labels: [Performance, Needs Triage] + +body: + - type: checkboxes + id: checks + attributes: + options: + - label: > + I have checked that this issue has not already been reported. + required: true + - label: > + I have confirmed this issue exists on the + [latest version](https://pandas.pydata.org/docs/whatsnew/index.html) of pandas. + required: true + - label: > + I have confirmed this issue exists on the master branch of pandas. + - type: textarea + id: example + attributes: + label: Reproducible Example + description: > + Please provide a minimal, copy-pastable example that quantifies + [slow runtime](https://docs.python.org/3/library/timeit.html) or + [memory](https://pypi.org/project/memory-profiler/) issues. + validations: + required: true + - type: textarea + id: version + attributes: + label: Installed Versions + description: > + Please paste the output of ``pd.show_versions()`` + value: > +
+ + + Replace this line with the output of pd.show_versions() + + +
+ validations: + required: true + - type: textarea + id: prior-performance + attributes: + label: Prior Performance + description: > + If applicable, please provide the prior version of pandas and output + of the same reproducible example where the performance issue did not exist. diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md index 7fb5a6ddf2024..42017db8a05b1 100644 --- a/.github/PULL_REQUEST_TEMPLATE.md +++ b/.github/PULL_REQUEST_TEMPLATE.md @@ -1,4 +1,4 @@ - [ ] closes #xxxx - [ ] tests added / passed -- [ ] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing.html#code-standards) for how to run them +- [ ] Ensure all linting tests pass, see [here](https://pandas.pydata.org/pandas-docs/dev/development/contributing_codebase.html#pre-commit) for how to run them - [ ] whatsnew entry diff --git a/.github/workflows/asv-bot.yml b/.github/workflows/asv-bot.yml new file mode 100644 index 0000000000000..c2a49dd96c1c1 --- /dev/null +++ b/.github/workflows/asv-bot.yml @@ -0,0 +1,81 @@ +name: "ASV Bot" + +on: + issue_comment: # Pull requests are issues + types: + - created + +env: + ENV_FILE: environment.yml + COMMENT: ${{github.event.comment.body}} + +jobs: + autotune: + name: "Run benchmarks" + # TODO: Support more benchmarking options later, against different branches, against self, etc + if: startsWith(github.event.comment.body, '@github-actions benchmark') + runs-on: ubuntu-latest + defaults: + run: + shell: bash -l {0} + + concurrency: + # Set concurrency to prevent abuse(full runs are ~5.5 hours !!!) + # each user can only run one concurrent benchmark bot at a time + # We don't cancel in progress jobs, but if you want to benchmark multiple PRs, you're gonna have + # to wait + group: ${{ github.actor }}-asv + cancel-in-progress: false + + steps: + - name: Checkout + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Cache conda + uses: actions/cache@v2 + with: + path: ~/conda_pkgs_dir + key: ${{ runner.os }}-conda-${{ hashFiles('${{ env.ENV_FILE }}') }} + + # Although asv sets up its own env, deps are still needed + # during discovery process + - uses: conda-incubator/setup-miniconda@v2 + with: + activate-environment: pandas-dev + channel-priority: strict + environment-file: ${{ env.ENV_FILE }} + use-only-tar-bz2: true + + - name: Run benchmarks + id: bench + continue-on-error: true # This is a fake failure, asv will exit code 1 for regressions + run: | + # extracting the regex, see https://stackoverflow.com/a/36798723 + REGEX=$(echo "$COMMENT" | sed -n "s/^.*-b\s*\(\S*\).*$/\1/p") + cd asv_bench + asv check -E existing + git remote add upstream https://github.com/pandas-dev/pandas.git + git fetch upstream + asv machine --yes + asv continuous -f 1.1 -b $REGEX upstream/master HEAD + echo 'BENCH_OUTPUT<> $GITHUB_ENV + asv compare -f 1.1 upstream/master HEAD >> $GITHUB_ENV + echo 'EOF' >> $GITHUB_ENV + echo "REGEX=$REGEX" >> $GITHUB_ENV + + - uses: actions/github-script@v4 + env: + BENCH_OUTPUT: ${{env.BENCH_OUTPUT}} + REGEX: ${{env.REGEX}} + with: + script: | + const ENV_VARS = process.env + const run_url = `https://github.com/${context.repo.owner}/${context.repo.repo}/actions/runs/${context.runId}` + github.issues.createComment({ + issue_number: context.issue.number, + owner: context.repo.owner, + repo: context.repo.repo, + body: '\nBenchmarks completed. View runner logs here.' 
+ run_url + '\nRegex used: '+ 'regex ' + ENV_VARS["REGEX"] + '\n' + ENV_VARS["BENCH_OUTPUT"] + }) diff --git a/.github/workflows/autoupdate-pre-commit-config.yml b/.github/workflows/autoupdate-pre-commit-config.yml index 801e063f72726..3696cba8cf2e6 100644 --- a/.github/workflows/autoupdate-pre-commit-config.yml +++ b/.github/workflows/autoupdate-pre-commit-config.yml @@ -2,7 +2,7 @@ name: "Update pre-commit config" on: schedule: - - cron: "0 7 * * 1" # At 07:00 on each Monday. + - cron: "0 7 1 * *" # At 07:00 on 1st of every month. workflow_dispatch: jobs: diff --git a/asv_bench/benchmarks/dtypes.py b/asv_bench/benchmarks/dtypes.py index c45d5a0814544..55f6be848aa13 100644 --- a/asv_bench/benchmarks/dtypes.py +++ b/asv_bench/benchmarks/dtypes.py @@ -50,15 +50,26 @@ def time_pandas_dtype_invalid(self, dtype): class SelectDtypes: - params = [ - tm.ALL_INT_NUMPY_DTYPES - + tm.ALL_INT_EA_DTYPES - + tm.FLOAT_NUMPY_DTYPES - + tm.COMPLEX_DTYPES - + tm.DATETIME64_DTYPES - + tm.TIMEDELTA64_DTYPES - + tm.BOOL_DTYPES - ] + try: + params = [ + tm.ALL_INT_NUMPY_DTYPES + + tm.ALL_INT_EA_DTYPES + + tm.FLOAT_NUMPY_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] + except AttributeError: + params = [ + tm.ALL_INT_DTYPES + + tm.ALL_EA_INT_DTYPES + + tm.FLOAT_DTYPES + + tm.COMPLEX_DTYPES + + tm.DATETIME64_DTYPES + + tm.TIMEDELTA64_DTYPES + + tm.BOOL_DTYPES + ] param_names = ["dtype"] def setup(self, dtype): diff --git a/asv_bench/benchmarks/indexing.py b/asv_bench/benchmarks/indexing.py index 75ef8a276da5e..58f2a73d82842 100644 --- a/asv_bench/benchmarks/indexing.py +++ b/asv_bench/benchmarks/indexing.py @@ -399,12 +399,14 @@ class ChainIndexing: def setup(self, mode): self.N = 1000000 + self.df = DataFrame({"A": np.arange(self.N), "B": "foo"}) def time_chained_indexing(self, mode): + df = self.df + N = self.N with warnings.catch_warnings(record=True): with option_context("mode.chained_assignment", mode): - df = DataFrame({"A": np.arange(self.N), "B": "foo"}) - df2 = df[df.A > self.N // 2] + df2 = df[df.A > N // 2] df2["C"] = 1.0 diff --git a/asv_bench/benchmarks/inference.py b/asv_bench/benchmarks/inference.py index 0aa924dabd469..4cbaa184791b8 100644 --- a/asv_bench/benchmarks/inference.py +++ b/asv_bench/benchmarks/inference.py @@ -115,19 +115,27 @@ def time_maybe_convert_objects(self): class ToDatetimeFromIntsFloats: def setup(self): self.ts_sec = Series(range(1521080307, 1521685107), dtype="int64") + self.ts_sec_uint = Series(range(1521080307, 1521685107), dtype="uint64") self.ts_sec_float = self.ts_sec.astype("float64") self.ts_nanosec = 1_000_000 * self.ts_sec + self.ts_nanosec_uint = 1_000_000 * self.ts_sec_uint self.ts_nanosec_float = self.ts_nanosec.astype("float64") - # speed of int64 and float64 paths should be comparable + # speed of int64, uint64 and float64 paths should be comparable def time_nanosec_int64(self): to_datetime(self.ts_nanosec, unit="ns") + def time_nanosec_uint64(self): + to_datetime(self.ts_nanosec_uint, unit="ns") + def time_nanosec_float64(self): to_datetime(self.ts_nanosec_float, unit="ns") + def time_sec_uint64(self): + to_datetime(self.ts_sec_uint, unit="s") + def time_sec_int64(self): to_datetime(self.ts_sec, unit="s") diff --git a/asv_bench/benchmarks/sparse.py b/asv_bench/benchmarks/sparse.py index 35e5818cd3b2b..c8c1a962e6861 100644 --- a/asv_bench/benchmarks/sparse.py +++ b/asv_bench/benchmarks/sparse.py @@ -67,16 +67,28 @@ def time_sparse_series_from_coo(self): class ToCoo: - def setup(self): + params = [True, 
False] + param_names = ["sort_labels"] + + def setup(self, sort_labels): s = Series([np.nan] * 10000) s[0] = 3.0 s[100] = -1.0 s[999] = 12.1 - s.index = MultiIndex.from_product([range(10)] * 4) - self.ss = s.astype("Sparse") - def time_sparse_series_to_coo(self): - self.ss.sparse.to_coo(row_levels=[0, 1], column_levels=[2, 3], sort_labels=True) + s_mult_lvl = s.set_axis(MultiIndex.from_product([range(10)] * 4)) + self.ss_mult_lvl = s_mult_lvl.astype("Sparse") + + s_two_lvl = s.set_axis(MultiIndex.from_product([range(100)] * 2)) + self.ss_two_lvl = s_two_lvl.astype("Sparse") + + def time_sparse_series_to_coo(self, sort_labels): + self.ss_mult_lvl.sparse.to_coo( + row_levels=[0, 1], column_levels=[2, 3], sort_labels=sort_labels + ) + + def time_sparse_series_to_coo_single_level(self, sort_labels): + self.ss_two_lvl.sparse.to_coo(sort_labels=sort_labels) class Arithmetic: diff --git a/doc/source/conf.py b/doc/source/conf.py index 8df048ce65582..0096b3337e19a 100644 --- a/doc/source/conf.py +++ b/doc/source/conf.py @@ -461,7 +461,6 @@ # eg pandas.Series.str and pandas.Series.dt (see GH9322) import sphinx # isort:skip -from sphinx.util import rpartition # isort:skip from sphinx.ext.autodoc import ( # isort:skip AttributeDocumenter, Documenter, @@ -521,8 +520,8 @@ def resolve_name(self, modname, parents, path, base): # HACK: this is added in comparison to ClassLevelDocumenter # mod_cls still exists of class.accessor, so an extra # rpartition is needed - modname, accessor = rpartition(mod_cls, ".") - modname, cls = rpartition(modname, ".") + modname, _, accessor = mod_cls.rpartition(".") + modname, _, cls = modname.rpartition(".") parents = [cls, accessor] # if the module name is still missing, get it like above if not modname: diff --git a/doc/source/development/contributing.rst b/doc/source/development/contributing.rst index f4a09e0daa750..9b3d50069b077 100644 --- a/doc/source/development/contributing.rst +++ b/doc/source/development/contributing.rst @@ -331,7 +331,12 @@ can comment:: @github-actions pre-commit -on that pull request. This will trigger a workflow which will autofix formatting errors. +on that pull request. This will trigger a workflow which will autofix formatting +errors. + +To automatically fix formatting errors on each commit you make, you can +set up pre-commit yourself. First, create a Python :ref:`environment +` and then set up :ref:`pre-commit `. Delete your merged branch (optional) ------------------------------------ diff --git a/doc/source/development/contributing_environment.rst b/doc/source/development/contributing_environment.rst index 1f9b34c7a784e..4c3c12eb9da92 100644 --- a/doc/source/development/contributing_environment.rst +++ b/doc/source/development/contributing_environment.rst @@ -133,7 +133,6 @@ compiler installation instructions. Let us know if you have any difficulties by opening an issue or reaching out on `Gitter `_. - Creating a Python environment ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/doc/source/ecosystem.rst b/doc/source/ecosystem.rst index e58779c090d8f..78caf360519b6 100644 --- a/doc/source/ecosystem.rst +++ b/doc/source/ecosystem.rst @@ -575,3 +575,17 @@ Library Accessor Classes Description .. _composeml: https://github.com/alteryx/compose .. _datatest: https://datatest.readthedocs.io/ .. 
_woodwork: https://github.com/alteryx/woodwork + +Development tools +---------------------------- + +`pandas-stubs `__ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +While pandas repository is partially typed, the package itself doesn't expose this information for external use. +Install pandas-stubs to enable basic type coverage of pandas API. + +Learn more by reading through these issues `14468 `_, +`26766 `_, `28142 `_. + +See installation and usage instructions on the `github page `__. diff --git a/doc/source/getting_started/comparison/includes/nth_word.rst b/doc/source/getting_started/comparison/includes/nth_word.rst index 7af0285005d5b..20e2ec47a8c9d 100644 --- a/doc/source/getting_started/comparison/includes/nth_word.rst +++ b/doc/source/getting_started/comparison/includes/nth_word.rst @@ -5,5 +5,5 @@ word by index. Note there are more powerful approaches should you need them. firstlast = pd.DataFrame({"String": ["John Smith", "Jane Cook"]}) firstlast["First_Name"] = firstlast["String"].str.split(" ", expand=True)[0] - firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[0] + firstlast["Last_Name"] = firstlast["String"].str.rsplit(" ", expand=True)[1] firstlast diff --git a/doc/source/reference/series.rst b/doc/source/reference/series.rst index 3ff3b2bb53fda..a60dab549e66d 100644 --- a/doc/source/reference/series.rst +++ b/doc/source/reference/series.rst @@ -427,6 +427,8 @@ strings and apply several methods to it. These can be accessed like Series.str.normalize Series.str.pad Series.str.partition + Series.str.removeprefix + Series.str.removesuffix Series.str.repeat Series.str.replace Series.str.rfind diff --git a/doc/source/user_guide/advanced.rst b/doc/source/user_guide/advanced.rst index 535b503e4372c..fd89e4e896178 100644 --- a/doc/source/user_guide/advanced.rst +++ b/doc/source/user_guide/advanced.rst @@ -851,12 +851,12 @@ values **not** in the categories, similarly to how you can reindex **any** panda Int64Index and RangeIndex ~~~~~~~~~~~~~~~~~~~~~~~~~ -.. note:: - +.. deprecated:: 1.4.0 In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types - will be removed. See :ref:`here ` for more. - ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + are therefore deprecated and will be removed in a futire version. + See :ref:`here ` for more. + ``RangeIndex`` will not be removed, as it represents an optimized version of an integer index. :class:`Int64Index` is a fundamental basic index in pandas. This is an immutable array implementing an ordered, sliceable set. @@ -869,12 +869,12 @@ implementing an ordered, sliceable set. Float64Index ~~~~~~~~~~~~ -.. note:: - - In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types +.. deprecated:: 1.4.0 + :class:`NumericIndex` will become the default index type for numeric types in the future instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types - will be removed. See :ref:`here ` for more. - ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + are therefore deprecated and will be removed in a future version of Pandas. + See :ref:`here ` for more. + ``RangeIndex`` will not be removed as it represents an optimized version of an integer index. 
By default a :class:`Float64Index` will be automatically created when passing floating, or mixed-integer-floating values in index creation. This enables a pure label-based slicing paradigm that makes ``[],ix,loc`` for scalar indexing and slicing work exactly the @@ -981,9 +981,9 @@ NumericIndex .. note:: In pandas 2.0, :class:`NumericIndex` will become the default index type for numeric types - instead of ``Int64Index``, ``Float64Index`` and ``UInt64Index`` and those index types - will be removed. - ``RangeIndex`` however, will not be removed, as it represents an optimized version of an integer index. + instead of :class:`Int64Index`, :class:`Float64Index` and :class:`UInt64Index` and those index types + are therefore deprecated and will be removed in a future version. + :class:`RangeIndex` will not be removed as it represents an optimized version of an integer index. :class:`NumericIndex` is an index type that can hold data of any numpy int/uint/float dtype. For example: @@ -998,7 +998,7 @@ NumericIndex ``UInt64Index`` except that it can hold any numpy int, uint or float dtype. Until Pandas 2.0, you will have to call ``NumericIndex`` explicitly in order to use it, like in the example above. -In Pandas 2.0, ``NumericIndex`` will become the default pandas numeric index type and will automatically be used where appropriate. +In the future, ``NumericIndex`` will become the default pandas numeric index type and will automatically be used where appropriate. Please notice that ``NumericIndex`` *can not* hold Pandas numeric dtypes (:class:`Int64Dtype`, :class:`Int32Dtype` etc.). diff --git a/doc/source/user_guide/io.rst b/doc/source/user_guide/io.rst index 1f1556123db17..4c7b13bcf989f 100644 --- a/doc/source/user_guide/io.rst +++ b/doc/source/user_guide/io.rst @@ -2502,14 +2502,16 @@ Read a URL with no options: .. ipython:: python - url = ( - "https://raw.githubusercontent.com/pandas-dev/pandas/master/" - "pandas/tests/io/data/html/spam.html" - ) + url = "https://www.fdic.gov/resources/resolutions/bank-failures/failed-bank-list" dfs = pd.read_html(url) dfs -Read in the content of the "banklist.html" file and pass it to ``read_html`` +.. note:: + + The data from the above URL changes every Monday so the resulting data above + and the data below may be slightly different. + +Read in the content of the file from the above URL and pass it to ``read_html`` as a string: .. ipython:: python diff --git a/doc/source/user_guide/options.rst b/doc/source/user_guide/options.rst index 62a347acdaa34..1193dff4361b4 100644 --- a/doc/source/user_guide/options.rst +++ b/doc/source/user_guide/options.rst @@ -138,7 +138,7 @@ More information can be found in the `IPython documentation import pandas as pd pd.set_option("display.max_rows", 999) - pd.set_option("precision", 5) + pd.set_option("display.precision", 5) .. _options.frequently_used: @@ -253,9 +253,9 @@ This is only a suggestion. .. ipython:: python df = pd.DataFrame(np.random.randn(5, 5)) - pd.set_option("precision", 7) + pd.set_option("display.precision", 7) df - pd.set_option("precision", 4) + pd.set_option("display.precision", 4) df ``display.chop_threshold`` sets at what level pandas rounds to zero when @@ -487,8 +487,27 @@ styler.sparse.index True "Sparsify" MultiIndex displ elements in outer levels within groups). styler.sparse.columns True "Sparsify" MultiIndex display for columns in Styler output. +styler.render.repr html Standard output format for Styler rendered in Jupyter Notebook. + Should be one of "html" or "latex". 
styler.render.max_elements 262144 Maximum number of datapoints that Styler will render trimming either rows, columns or both to fit. +styler.render.encoding utf-8 Default encoding for output HTML or LaTeX files. +styler.format.formatter None Object to specify formatting functions to ``Styler.format``. +styler.format.na_rep None String representation for missing data. +styler.format.precision 6 Precision to display floating point and complex numbers. +styler.format.decimal . String representation for decimal point separator for floating + point and complex numbers. +styler.format.thousands None String representation for thousands separator for + integers, and floating point and complex numbers. +styler.format.escape None Whether to escape "html" or "latex" special + characters in the display representation. +styler.html.mathjax True If set to False will render specific CSS classes to + table attributes that will prevent Mathjax from rendering + in Jupyter Notebook. +styler.latex.multicol_align r Alignment of headers in a merged column due to sparsification. Can be in {"r", "c", "l"}. +styler.latex.multirow_align c Alignment of index labels in a merged row due to sparsification. Can be in {"c", "t", "b"}. +styler.latex.environment None If given will replace the default ``\\begin{table}`` environment. If "longtable" is specified + this will render with a specific "longtable" template with longtable features. ======================================= ============ ================================== diff --git a/doc/source/user_guide/text.rst b/doc/source/user_guide/text.rst index db9485f3f2348..d350351075cb6 100644 --- a/doc/source/user_guide/text.rst +++ b/doc/source/user_guide/text.rst @@ -335,6 +335,19 @@ regular expression object will raise a ``ValueError``. --------------------------------------------------------------------------- ValueError: case and flags cannot be set when pat is a compiled regex +``removeprefix`` and ``removesuffix`` have the same effect as ``str.removeprefix`` and ``str.removesuffix`` added in Python 3.9 +`__: + +.. versionadded:: 1.4.0 + +.. ipython:: python + + s = pd.Series(["str_foo", "str_bar", "no_prefix"]) + s.str.removeprefix("str_") + + s = pd.Series(["foo_str", "bar_str", "no_suffix"]) + s.str.removesuffix("_str") + .. _text.concatenate: Concatenation @@ -742,6 +755,8 @@ Method summary :meth:`~Series.str.get_dummies`;Split strings on the delimiter returning DataFrame of dummy variables :meth:`~Series.str.contains`;Return boolean array if each string contains pattern/regex :meth:`~Series.str.replace`;Replace occurrences of pattern/regex/string with some other string or the return value of a callable given the occurrence + :meth:`~Series.str.removeprefix`;Remove prefix from string, i.e. only remove if string starts with prefix. + :meth:`~Series.str.removesuffix`;Remove suffix from string, i.e. only remove if string ends with suffix. :meth:`~Series.str.repeat`;Duplicate values (``s.str.repeat(3)`` equivalent to ``x * 3``) :meth:`~Series.str.pad`;"Add whitespace to left, right, or both sides of strings" :meth:`~Series.str.center`;Equivalent to ``str.center`` diff --git a/doc/source/whatsnew/v0.20.0.rst b/doc/source/whatsnew/v0.20.0.rst index 733995cc718dd..8d9821e53e30c 100644 --- a/doc/source/whatsnew/v0.20.0.rst +++ b/doc/source/whatsnew/v0.20.0.rst @@ -248,11 +248,12 @@ or purely non-negative, integers. Previously, handling these integers would result in improper rounding or data-type casting, leading to incorrect results. 
Notably, a new numerical index, ``UInt64Index``, has been created (:issue:`14937`) -.. ipython:: python +.. code-block:: ipython - idx = pd.UInt64Index([1, 2, 3]) - df = pd.DataFrame({'A': ['a', 'b', 'c']}, index=idx) - df.index + In [1]: idx = pd.UInt64Index([1, 2, 3]) + In [2]: df = pd.DataFrame({'A': ['a', 'b', 'c']}, index=idx) + In [3]: df.index + Out[3]: UInt64Index([1, 2, 3], dtype='uint64') - Bug in converting object elements of array-like objects to unsigned 64-bit integers (:issue:`4471`, :issue:`14982`) - Bug in ``Series.unique()`` in which unsigned 64-bit integers were causing overflow (:issue:`14721`) diff --git a/doc/source/whatsnew/v0.23.0.rst b/doc/source/whatsnew/v0.23.0.rst index f4caea9d363eb..be84c562b3c32 100644 --- a/doc/source/whatsnew/v0.23.0.rst +++ b/doc/source/whatsnew/v0.23.0.rst @@ -861,21 +861,21 @@ Previous behavior: Current behavior: -.. ipython:: python +.. code-block:: ipython - index = pd.Int64Index([-1, 0, 1]) + In [12]: index = pd.Int64Index([-1, 0, 1]) # division by zero gives -infinity where negative, # +infinity where positive, and NaN for 0 / 0 - index / 0 + In [13]: index / 0 # The result of division by zero should not depend on # whether the zero is int or float - index / 0.0 + In [14]: index / 0.0 - index = pd.UInt64Index([0, 1]) - index / np.array([0, 0], dtype=np.uint64) + In [15]: index = pd.UInt64Index([0, 1]) + In [16]: index / np.array([0, 0], dtype=np.uint64) - pd.RangeIndex(1, 5) / 0 + In [17]: pd.RangeIndex(1, 5) / 0 .. _whatsnew_0230.api_breaking.extract: diff --git a/doc/source/whatsnew/v0.25.0.rst b/doc/source/whatsnew/v0.25.0.rst index 89c003f34f0cc..9cbfa49cc8c5c 100644 --- a/doc/source/whatsnew/v0.25.0.rst +++ b/doc/source/whatsnew/v0.25.0.rst @@ -473,10 +473,12 @@ considered commutative, such that ``A.union(B) == B.union(A)`` (:issue:`23525`). *New behavior*: -.. ipython:: python +.. code-block:: python - pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) - pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) + In [3]: pd.period_range('19910905', periods=2).union(pd.Int64Index([1, 2, 3])) + Out[3]: Index([1991-09-05, 1991-09-06, 1, 2, 3], dtype='object') + In [4]: pd.Index([], dtype=object).union(pd.Index([1, 2, 3])) + Out[4]: Index([1, 2, 3], dtype='object') Note that integer- and floating-dtype indexes are considered "compatible". The integer values are coerced to floating point, which may result in loss of precision. See diff --git a/doc/source/whatsnew/v1.3.2.rst b/doc/source/whatsnew/v1.3.2.rst index 7a9549affef00..e3c6268547dd2 100644 --- a/doc/source/whatsnew/v1.3.2.rst +++ b/doc/source/whatsnew/v1.3.2.rst @@ -26,6 +26,7 @@ Fixed regressions - Fixed regression in :func:`concat` where ``copy=False`` was not honored in ``axis=1`` Series concatenation (:issue:`42501`) - Regression in :meth:`Series.nlargest` and :meth:`Series.nsmallest` with nullable integer or float dtype (:issue:`42816`) - Fixed regression in :meth:`Series.quantile` with :class:`Int64Dtype` (:issue:`42626`) +- Fixed regression in :meth:`Series.groupby` and :meth:`DataFrame.groupby` where supplying the ``by`` argument with a Series named with a tuple would incorrectly raise (:issue:`42731`) .. 
--------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.3.3.rst b/doc/source/whatsnew/v1.3.3.rst index 9aac0a9ad9681..5ffc1a20b382f 100644 --- a/doc/source/whatsnew/v1.3.3.rst +++ b/doc/source/whatsnew/v1.3.3.rst @@ -17,7 +17,25 @@ Fixed regressions - Fixed regression in :class:`DataFrame` constructor failing to broadcast for defined :class:`Index` and len one list of :class:`Timestamp` (:issue:`42810`) - Performance regression in :meth:`core.window.ewm.ExponentialMovingWindow.mean` (:issue:`42333`) - Fixed regression in :meth:`.GroupBy.agg` incorrectly raising in some cases (:issue:`42390`) +- Fixed regression in :meth:`.GroupBy.apply` where ``nan`` values were dropped even with ``dropna=False`` (:issue:`43205`) +- Fixed regression in :meth:`.GroupBy.quantile` which was failing with ``pandas.NA`` (:issue:`42849`) +- Fixed regression in :meth:`merge` where ``on`` columns with ``ExtensionDtype`` or ``bool`` data types were cast to ``object`` in ``right`` and ``outer`` merge (:issue:`40073`) - Fixed regression in :meth:`RangeIndex.where` and :meth:`RangeIndex.putmask` raising ``AssertionError`` when result did not represent a :class:`RangeIndex` (:issue:`43240`) +- Fixed regression in :meth:`read_parquet` where the ``fastparquet`` engine would not work properly with fastparquet 0.7.0 (:issue:`43075`) +- Fixed regression in :meth:`DataFrame.loc.__setitem__` raising ``ValueError`` when setting array as cell value (:issue:`43422`) +- Fixed regression in :func:`is_list_like` where objects with ``__iter__`` set to ``None`` would be identified as iterable (:issue:`43373`) +- Fixed regression in :meth:`.Resampler.aggregate` when used after column selection would raise if ``func`` is a list of aggregation functions (:issue:`42905`) +- Fixed regression in :meth:`DataFrame.corr` where Kendall correlation would produce incorrect results for columns with repeated values (:issue:`43401`) + +.. --------------------------------------------------------------------------- + +.. _whatsnew_133.performance: + +Performance improvements +~~~~~~~~~~~~~~~~~~~~~~~~ +- Performance improvement for :meth:`DataFrame.__setitem__` when the key or value is not a :class:`DataFrame`, or key is not list-like (:issue:`43274`) +- +- .. --------------------------------------------------------------------------- diff --git a/doc/source/whatsnew/v1.4.0.rst b/doc/source/whatsnew/v1.4.0.rst index 450ecc85c725b..7107e3eecb2f1 100644 --- a/doc/source/whatsnew/v1.4.0.rst +++ b/doc/source/whatsnew/v1.4.0.rst @@ -29,8 +29,8 @@ It is now possible to create an index of any numpy int/uint/float dtype using th pd.NumericIndex([1, 2, 3], dtype="uint32") pd.NumericIndex([1, 2, 3], dtype="float32") -In order to maintain backwards compatibility, calls to the base :class:`Index` will in -pandas 1.x. return :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index`. +In order to maintain backwards compatibility, calls to the base :class:`Index` will currently +return :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index`, where relevant. For example, the code below returns an ``Int64Index`` with dtype ``int64``: .. 
code-block:: ipython @@ -38,11 +38,12 @@ For example, the code below returns an ``Int64Index`` with dtype ``int64``: In [1]: pd.Index([1, 2, 3], dtype="int8") Int64Index([1, 2, 3], dtype='int64') -For the duration of Pandas 1.x, in order to maintain backwards compatibility, all -operations that until now have returned :class:`Int64Index`, :class:`UInt64Index` and -:class:`Float64Index` will continue to so. This means, that in order to use -``NumericIndex``, you will have to call ``NumericIndex`` explicitly. For example the below series -will have an ``Int64Index``: +but will in a future version return a :class:`NumericIndex` with dtype ``int8``. + +More generally, currently, all operations that until now have +returned :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index` will +continue to so. This means, that in order to use ``NumericIndex`` in the current version, you +will have to call ``NumericIndex`` explicitly. For example the below series will have an ``Int64Index``: .. code-block:: ipython @@ -50,7 +51,7 @@ will have an ``Int64Index``: In [3]: ser.index Int64Index([1, 2, 3], dtype='int64') -Instead if you want to use a ``NumericIndex``, you should do: +Instead, if you want to use a ``NumericIndex``, you should do: .. ipython:: python @@ -58,10 +59,11 @@ Instead if you want to use a ``NumericIndex``, you should do: ser = pd.Series([1, 2, 3], index=idx) ser.index -In Pandas 2.0, :class:`NumericIndex` will become the default numeric index type and -``Int64Index``, ``UInt64Index`` and ``Float64Index`` will be removed. +In a future version of Pandas, :class:`NumericIndex` will become the default numeric index type and +``Int64Index``, ``UInt64Index`` and ``Float64Index`` are therefore deprecated and will +be removed in the future, see :ref:`here ` for more. -See :ref:`here ` for more. +See :ref:`here ` for more about :class:`NumericIndex`. .. _whatsnew_140.enhancements.styler: @@ -73,11 +75,17 @@ Styler - Styling of indexing has been added, with :meth:`.Styler.apply_index` and :meth:`.Styler.applymap_index`. These mirror the signature of the methods already used to style data values, and work with both HTML and LaTeX format (:issue:`41893`). - :meth:`.Styler.bar` introduces additional arguments to control alignment and display (:issue:`26070`, :issue:`36419`), and it also validates the input arguments ``width`` and ``height`` (:issue:`42511`). - :meth:`.Styler.to_latex` introduces keyword argument ``environment``, which also allows a specific "longtable" entry through a separate jinja2 template (:issue:`41866`). - - :meth:`.Styler.to_html` introduces keyword arguments ``sparse_index`` and ``sparse_columns`` (:issue:`41946`) - - Keyword argument ``level`` is added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for optionally controlling hidden levels in a MultiIndex (:issue:`25475`) + - :meth:`.Styler.to_html` introduces keyword arguments ``sparse_index``, ``sparse_columns``, ``bold_headers``, ``caption`` (:issue:`41946`, :issue:`43149`). 
+ - Keyword arguments ``level`` and ``names`` added to :meth:`.Styler.hide_index` and :meth:`.Styler.hide_columns` for additional control of visibility of MultiIndexes and index names (:issue:`25475`, :issue:`43404`, :issue:`43346`) + - Global options have been extended to configure default ``Styler`` properties including formatting and encoding and mathjax options and LaTeX (:issue:`41395`) + - Naive sparsification is now possible for LaTeX without the multirow package (:issue:`43369`) + +Formerly Styler relied on ``display.html.use_mathjax``, which has now been replaced by ``styler.html.mathjax``. There are also bug fixes and deprecations listed below. +Validation now for ``caption`` arg (:issue:`43368`) + .. _whatsnew_140.enhancements.pyarrow_csv_engine: Multithreaded CSV reading with a new CSV Engine based on pyarrow @@ -95,7 +103,9 @@ Other enhancements - :meth:`Series.sample`, :meth:`DataFrame.sample`, and :meth:`.GroupBy.sample` now accept a ``np.random.Generator`` as input to ``random_state``. A generator will be more performant, especially with ``replace=False`` (:issue:`38100`) - :meth:`Series.ewm`, :meth:`DataFrame.ewm`, now support a ``method`` argument with a ``'table'`` option that performs the windowing operation over an entire :class:`DataFrame`. See :ref:`Window Overview ` for performance and functional benefits (:issue:`42273`) - :meth:`.GroupBy.cummin` and :meth:`.GroupBy.cummax` now support the argument ``skipna`` (:issue:`34047`) -- +- :meth:`read_table` now supports the argument ``storage_options`` (:issue:`39167`) +- Methods that relied on hashmap based algos such as :meth:`DataFrameGroupBy.value_counts`, :meth:`DataFrameGroupBy.count` and :func:`factorize` ignored imaginary component for complex numbers (:issue:`17927`) +- Add :meth:`Series.str.removeprefix` and :meth:`Series.str.removesuffix` introduced in Python 3.9 to remove pre-/suffixes from string-type :class:`Series` (:issue:`36944`) .. --------------------------------------------------------------------------- @@ -221,6 +231,41 @@ Other API changes Deprecations ~~~~~~~~~~~~ + +.. _whatsnew_140.deprecations.int64_uint64_float64index: + +Deprecated Int64Index, UInt64Index & Float64Index +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +:class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index` have been deprecated +in favor of the new :class:`NumericIndex` and will be removed in Pandas 2.0 (:issue:`43028`). + +Currently, in order to maintain backward compatibility, calls to +:class:`Index` will continue to return :class:`Int64Index`, :class:`UInt64Index` and :class:`Float64Index` +when given numeric data, but in the future, a :class:`NumericIndex` will be returned. + +*Current behavior*: + +.. code-block:: ipython + + In [1]: pd.Index([1, 2, 3], dtype="int32") + Out [1]: Int64Index([1, 2, 3], dtype='int64') + In [1]: pd.Index([1, 2, 3], dtype="uint64") + Out [1]: UInt64Index([1, 2, 3], dtype='uint64') + +*Future behavior*: + +.. code-block:: ipython + + In [3]: pd.Index([1, 2, 3], dtype="int32") + Out [3]: NumericIndex([1, 2, 3], dtype='int32') + In [4]: pd.Index([1, 2, 3], dtype="uint64") + Out [4]: NumericIndex([1, 2, 3], dtype='uint64') + + +.. 
_whatsnew_140.deprecations.other: + +Other Deprecations +^^^^^^^^^^^^^^^^^^ - Deprecated :meth:`Index.is_type_compatible` (:issue:`42113`) - Deprecated ``method`` argument in :meth:`Index.get_loc`, use ``index.get_indexer([label], method=...)`` instead (:issue:`42269`) - Deprecated treating integer keys in :meth:`Series.__setitem__` as positional when the index is a :class:`Float64Index` not containing the key, a :class:`IntervalIndex` with no entries containing the key, or a :class:`MultiIndex` with leading :class:`Float64Index` level not containing the key (:issue:`33469`) @@ -231,6 +276,7 @@ Deprecations - Deprecated dropping of nuisance columns in :class:`Rolling`, :class:`Expanding`, and :class:`EWM` aggregations (:issue:`42738`) - Deprecated :meth:`Index.reindex` with a non-unique index (:issue:`42568`) - Deprecated :meth:`.Styler.render` in favour of :meth:`.Styler.to_html` (:issue:`42140`) +- Deprecated passing in a string column label into ``times`` in :meth:`DataFrame.ewm` (:issue:`43265`) .. --------------------------------------------------------------------------- @@ -245,7 +291,9 @@ Performance improvements - Performance improvement in :meth:`DataFrame.corr` for ``method=pearson`` on data without missing values (:issue:`40956`) - Performance improvement in some :meth:`GroupBy.apply` operations (:issue:`42992`) - Performance improvement in :func:`read_stata` (:issue:`43059`) -- +- Performance improvement in :meth:`to_datetime` with ``uint`` dtypes (:issue:`42606`) +- Performance improvement in :meth:`Series.sparse.to_coo` (:issue:`42880`) +- Performance improvement in indexing with a :class:`MultiIndex` indexer on another :class:`MultiIndex` (:issue:43370`) .. --------------------------------------------------------------------------- @@ -276,7 +324,7 @@ Timedelta Timezones ^^^^^^^^^ -- +- Bug in :meth:`Series.dt.tz_convert` resetting index in a :class:`Series` with :class:`CategoricalIndex` (:issue:`43080`) - Numeric @@ -313,7 +361,8 @@ Indexing - Bug in :meth:`Index.get_indexer_non_unique` when index contains multiple ``np.nan`` (:issue:`35392`) - Bug in :meth:`DataFrame.query` did not handle the degree sign in a backticked column name, such as \`Temp(°C)\`, used in an expression to query a dataframe (:issue:`42826`) - Bug in :meth:`DataFrame.drop` where the error message did not show missing labels with commas when raising ``KeyError`` (:issue:`42881`) -- +- Bug in :meth:`DataFrame.nlargest` and :meth:`Series.nlargest` where sorted result did not count indexes containing ``np.nan`` (:issue:`28984`) + Missing ^^^^^^^ @@ -325,6 +374,7 @@ MultiIndex - Bug in :meth:`MultiIndex.get_loc` where the first level is a :class:`DatetimeIndex` and a string key is passed (:issue:`42465`) - Bug in :meth:`MultiIndex.reindex` when passing a ``level`` that corresponds to an ``ExtensionDtype`` level (:issue:`42043`) - Bug in :meth:`MultiIndex.get_loc` raising ``TypeError`` instead of ``KeyError`` on nested tuple (:issue:`42440`) +- Bug in :meth:`MultiIndex.putmask` where the other value was also a :class:`MultiIndex` (:issue:`43212`) - I/O @@ -332,8 +382,11 @@ I/O - Bug in :func:`read_excel` attempting to read chart sheets from .xlsx files (:issue:`41448`) - Bug in :func:`json_normalize` where ``errors=ignore`` could fail to ignore missing values of ``meta`` when ``record_path`` has a length greater than one (:issue:`41876`) - Bug in :func:`read_csv` with multi-header input and arguments referencing column names as tuples (:issue:`42446`) +- Bug in :func:`read_fwf`, where difference in 
lengths of ``colspecs`` and ``names`` was not raising ``ValueError`` (:issue:`40830`) - Bug in :func:`Series.to_json` and :func:`DataFrame.to_json` where some attributes were skipped when serialising plain Python objects to JSON (:issue:`42768`, :issue:`33043`) -- +- Column headers are dropped when constructing a :class:`DataFrame` from a sqlalchemy's ``Row`` object (:issue:`40682`) +- Bug in unpickling a :class:`Index` with object dtype incorrectly inferring numeric dtypes (:issue:`43188`) +- Bug in :func:`read_csv` where reading multi-header input with unequal lengths incorrectly raising uncontrolled ``IndexError`` (:issue:`43102`) Period ^^^^^^ @@ -349,19 +402,24 @@ Groupby/resample/rolling ^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed bug in :meth:`SeriesGroupBy.apply` where passing an unrecognized string argument failed to raise ``TypeError`` when the underlying ``Series`` is empty (:issue:`42021`) - Bug in :meth:`Series.rolling.apply`, :meth:`DataFrame.rolling.apply`, :meth:`Series.expanding.apply` and :meth:`DataFrame.expanding.apply` with ``engine="numba"`` where ``*args`` were being cached with the user passed function (:issue:`42287`) +- Bug in :meth:`GroupBy.max` and :meth:`GroupBy.min` with nullable integer dtypes losing precision (:issue:`41743`) - Bug in :meth:`DataFrame.groupby.rolling.var` would calculate the rolling variance only on the first group (:issue:`42442`) - Bug in :meth:`GroupBy.shift` that would return the grouping columns if ``fill_value`` was not None (:issue:`41556`) - Bug in :meth:`SeriesGroupBy.nlargest` and :meth:`SeriesGroupBy.nsmallest` would have an inconsistent index when the input Series was sorted and ``n`` was greater than or equal to all group sizes (:issue:`15272`, :issue:`16345`, :issue:`29129`) - Bug in :meth:`pandas.DataFrame.ewm`, where non-float64 dtypes were silently failing (:issue:`42452`) - Bug in :meth:`pandas.DataFrame.rolling` operation along rows (``axis=1``) incorrectly omits columns containing ``float16`` and ``float32`` (:issue:`41779`) - Bug in :meth:`Resampler.aggregate` did not allow the use of Named Aggregation (:issue:`32803`) -- +- Bug in :meth:`Series.rolling` when the :class:`Series` ``dtype`` was ``Int64`` (:issue:`43016`) +- Bug in :meth:`DataFrame.rolling.corr` when the :class:`DataFrame` columns was a :class:`MultiIndex` (:issue:`21157`) +- Bug in :meth:`DataFrame.groupby.rolling` when specifying ``on`` and calling ``__getitem__`` would subsequently return incorrect results (:issue:`43355`) Reshaping ^^^^^^^^^ - Improved error message when creating a :class:`DataFrame` column from a multi-dimensional :class:`numpy.ndarray` (:issue:`42463`) - :func:`concat` creating :class:`MultiIndex` with duplicate level entries when concatenating a :class:`DataFrame` with duplicates in :class:`Index` and multiple keys (:issue:`42651`) - Bug in :meth:`pandas.cut` on :class:`Series` with duplicate indices (:issue:`42185`) and non-exact :meth:`pandas.CategoricalIndex` (:issue:`42425`) +- Bug in :meth:`DataFrame.append` failing to retain dtypes when appended columns do not match (:issue:`43392`) +- Bug in :func:`concat` of ``bool`` and ``boolean`` dtypes resulting in ``object`` dtype instead of ``boolean`` dtype (:issue:`42800`) - Sparse @@ -380,7 +438,9 @@ Styler - Bug in :meth:`.Styler.to_html` where the ``Styler`` object was updated if the ``to_html`` method was called with some args (:issue:`43034`) - Bug in :meth:`.Styler.copy` where ``uuid`` was not previously copied (:issue:`40675`) - Bug in :meth:`Styler.apply` where functions which returned 
Series objects were not correctly handled in terms of aligning their index labels (:issue:`13657`, :issue:`42014`) -- +- Bug when rendering an empty DataFrame with a named index (:issue:`43305`). +- Bug when rendering a single level MultiIndex (:issue:`43383`). +- Bug when combining non-sparse rendering and :meth:`.Styler.hide_columns` (:issue:`43464`) Other ^^^^^ diff --git a/pandas/__init__.py b/pandas/__init__.py index 68ca20338e99b..294b092e33c58 100644 --- a/pandas/__init__.py +++ b/pandas/__init__.py @@ -70,10 +70,7 @@ # indexes Index, CategoricalIndex, - Int64Index, - UInt64Index, RangeIndex, - Float64Index, NumericIndex, MultiIndex, IntervalIndex, @@ -186,10 +183,35 @@ # GH 27101 +__deprecated_num_index_names = ["Float64Index", "Int64Index", "UInt64Index"] + + +def __dir__(): + # GH43028 + # Int64Index etc. are deprecated, but we still want them to be available in the dir. + # Remove in Pandas 2.0, when we remove Int64Index etc. from the code base. + return list(globals().keys()) + __deprecated_num_index_names + + def __getattr__(name): import warnings - if name == "datetime": + if name in __deprecated_num_index_names: + warnings.warn( + f"pandas.{name} is deprecated " + "and will be removed from pandas in a future version. " + "Use pandas.NumericIndex with the appropriate dtype instead.", + FutureWarning, + stacklevel=2, + ) + from pandas.core.api import Float64Index, Int64Index, UInt64Index + + return { + "Float64Index": Float64Index, + "Int64Index": Int64Index, + "UInt64Index": UInt64Index, + }[name] + elif name == "datetime": warnings.warn( "The pandas.datetime class is deprecated " "and will be removed from pandas in a future version. " diff --git a/pandas/_libs/algos.pyi b/pandas/_libs/algos.pyi index 9da5534c51321..fdec60a84a708 100644 --- a/pandas/_libs/algos.pyi +++ b/pandas/_libs/algos.pyi @@ -3,6 +3,8 @@ from typing import Any import numpy as np +from pandas._typing import npt + class Infinity: """ Provide a positive Infinity comparison method for ranking. @@ -30,7 +32,7 @@ class NegInfinity: def unique_deltas( arr: np.ndarray, # const int64_t[:] ) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] -def is_lexsorted(list_of_arrays: list[np.ndarray]) -> bool: ... +def is_lexsorted(list_of_arrays: list[npt.NDArray[np.int64]]) -> bool: ... def groupsort_indexer( index: np.ndarray, # const int64_t[:] ngroups: int, @@ -146,18 +148,20 @@ def diff_2d( axis: int, datetimelike: bool = ..., ) -> None: ... -def ensure_platform_int(arr: object) -> np.ndarray: ... -def ensure_object(arr: object) -> np.ndarray: ... -def ensure_float64(arr: object, copy=True) -> np.ndarray: ... -def ensure_float32(arr: object, copy=True) -> np.ndarray: ... -def ensure_int8(arr: object, copy=True) -> np.ndarray: ... -def ensure_int16(arr: object, copy=True) -> np.ndarray: ... -def ensure_int32(arr: object, copy=True) -> np.ndarray: ... -def ensure_int64(arr: object, copy=True) -> np.ndarray: ... -def ensure_uint8(arr: object, copy=True) -> np.ndarray: ... -def ensure_uint16(arr: object, copy=True) -> np.ndarray: ... -def ensure_uint32(arr: object, copy=True) -> np.ndarray: ... -def ensure_uint64(arr: object, copy=True) -> np.ndarray: ... +def ensure_platform_int(arr: object) -> npt.NDArray[np.intp]: ... +def ensure_object(arr: object) -> npt.NDArray[np.object_]: ... +def ensure_complex64(arr: object, copy=True) -> npt.NDArray[np.complex64]: ... +def ensure_complex128(arr: object, copy=True) -> npt.NDArray[np.complex128]: ... +def ensure_float64(arr: object, copy=True) -> npt.NDArray[np.float64]: ... 
+def ensure_float32(arr: object, copy=True) -> npt.NDArray[np.float32]: ... +def ensure_int8(arr: object, copy=True) -> npt.NDArray[np.int8]: ... +def ensure_int16(arr: object, copy=True) -> npt.NDArray[np.int16]: ... +def ensure_int32(arr: object, copy=True) -> npt.NDArray[np.int32]: ... +def ensure_int64(arr: object, copy=True) -> npt.NDArray[np.int64]: ... +def ensure_uint8(arr: object, copy=True) -> npt.NDArray[np.uint8]: ... +def ensure_uint16(arr: object, copy=True) -> npt.NDArray[np.uint16]: ... +def ensure_uint32(arr: object, copy=True) -> npt.NDArray[np.uint32]: ... +def ensure_uint64(arr: object, copy=True) -> npt.NDArray[np.uint64]: ... def take_1d_int8_int8( values: np.ndarray, indexer: np.ndarray, out: np.ndarray, fill_value=... ) -> None: ... diff --git a/pandas/_libs/algos.pyx b/pandas/_libs/algos.pyx index 2353c66f3378f..99929c36c0929 100644 --- a/pandas/_libs/algos.pyx +++ b/pandas/_libs/algos.pyx @@ -15,6 +15,8 @@ import numpy as np cimport numpy as cnp from numpy cimport ( + NPY_COMPLEX64, + NPY_COMPLEX128, NPY_FLOAT32, NPY_FLOAT64, NPY_INT8, @@ -122,7 +124,7 @@ cpdef ndarray[int64_t, ndim=1] unique_deltas(const int64_t[:] arr): Parameters ---------- - arr : ndarray[in64_t] + arr : ndarray[int64_t] Returns ------- @@ -516,97 +518,6 @@ def nancorr_spearman(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarr return result -# ---------------------------------------------------------------------- -# Kendall correlation -# Wikipedia article: https://en.wikipedia.org/wiki/Kendall_rank_correlation_coefficient - -@cython.boundscheck(False) -@cython.wraparound(False) -def nancorr_kendall(ndarray[float64_t, ndim=2] mat, Py_ssize_t minp=1) -> ndarray: - """ - Perform kendall correlation on a 2d array - - Parameters - ---------- - mat : np.ndarray[float64_t, ndim=2] - Array to compute kendall correlation on - minp : int, default 1 - Minimum number of observations required per pair of columns - to have a valid result. 
- - Returns - ------- - numpy.ndarray[float64_t, ndim=2] - Correlation matrix - """ - cdef: - Py_ssize_t i, j, k, xi, yi, N, K - ndarray[float64_t, ndim=2] result - ndarray[float64_t, ndim=2] ranked_mat - ndarray[uint8_t, ndim=2] mask - float64_t currj - ndarray[uint8_t, ndim=1] valid - ndarray[int64_t] sorted_idxs - ndarray[float64_t, ndim=1] col - int64_t n_concordant - int64_t total_concordant = 0 - int64_t total_discordant = 0 - float64_t kendall_tau - int64_t n_obs - - N, K = (mat).shape - - result = np.empty((K, K), dtype=np.float64) - mask = np.isfinite(mat) - - ranked_mat = np.empty((N, K), dtype=np.float64) - - for i in range(K): - ranked_mat[:, i] = rank_1d(mat[:, i]) - - for xi in range(K): - sorted_idxs = ranked_mat[:, xi].argsort() - ranked_mat = ranked_mat[sorted_idxs] - mask = mask[sorted_idxs] - for yi in range(xi + 1, K): - valid = mask[:, xi] & mask[:, yi] - if valid.sum() < minp: - result[xi, yi] = NaN - result[yi, xi] = NaN - else: - # Get columns and order second column using 1st column ranks - if not valid.all(): - col = ranked_mat[valid.nonzero()][:, yi] - else: - col = ranked_mat[:, yi] - n_obs = col.shape[0] - total_concordant = 0 - total_discordant = 0 - for j in range(n_obs - 1): - currj = col[j] - # Count num concordant and discordant pairs - n_concordant = 0 - for k in range(j, n_obs): - if col[k] > currj: - n_concordant += 1 - total_concordant += n_concordant - total_discordant += (n_obs - 1 - j - n_concordant) - # Note: we do total_concordant+total_discordant here which is - # equivalent to the C(n, 2), the total # of pairs, - # listed on wikipedia - kendall_tau = (total_concordant - total_discordant) / \ - (total_concordant + total_discordant) - result[xi, yi] = kendall_tau - result[yi, xi] = kendall_tau - - if mask[:, xi].sum() > minp: - result[xi, xi] = 1 - else: - result[xi, xi] = NaN - - return result - - # ---------------------------------------------------------------------- ctypedef fused algos_t: diff --git a/pandas/_libs/algos_common_helper.pxi.in b/pandas/_libs/algos_common_helper.pxi.in index 64e8bdea4672c..87130906ef28b 100644 --- a/pandas/_libs/algos_common_helper.pxi.in +++ b/pandas/_libs/algos_common_helper.pxi.in @@ -47,6 +47,8 @@ dtypes = [('float64', 'FLOAT64', 'float64'), ('uint16', 'UINT16', 'uint16'), ('uint32', 'UINT32', 'uint32'), ('uint64', 'UINT64', 'uint64'), + ('complex64', 'COMPLEX64', 'complex64'), + ('complex128', 'COMPLEX128', 'complex128') # ('platform_int', 'INT', 'int_'), # ('object', 'OBJECT', 'object_'), ] diff --git a/pandas/_libs/groupby.pyi b/pandas/_libs/groupby.pyi index 7b1dcbe562123..b363524e4e592 100644 --- a/pandas/_libs/groupby.pyi +++ b/pandas/_libs/groupby.pyi @@ -123,6 +123,8 @@ def group_max( values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... def group_min( out: np.ndarray, # groupby_t[:, ::1] @@ -130,6 +132,8 @@ def group_min( values: np.ndarray, # ndarray[groupby_t, ndim=2] labels: np.ndarray, # const int64_t[:] min_count: int = ..., + mask: np.ndarray | None = ..., + result_mask: np.ndarray | None = ..., ) -> None: ... 
def group_cummin( out: np.ndarray, # groupby_t[:, ::1] diff --git a/pandas/_libs/groupby.pyx b/pandas/_libs/groupby.pyx index 7123d0d543090..40e1049c39588 100644 --- a/pandas/_libs/groupby.pyx +++ b/pandas/_libs/groupby.pyx @@ -1182,7 +1182,9 @@ cdef group_min_max(groupby_t[:, ::1] out, const intp_t[::1] labels, Py_ssize_t min_count=-1, bint is_datetimelike=False, - bint compute_max=True): + bint compute_max=True, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None): """ Compute minimum/maximum of columns of `values`, in row groups `labels`. @@ -1203,6 +1205,12 @@ cdef group_min_max(groupby_t[:, ::1] out, True if `values` contains datetime-like entries. compute_max : bint, default True True to compute group-wise max, False to compute min + mask : ndarray[bool, ndim=2], optional + If not None, indices represent missing values, + otherwise the mask will not be used + result_mask : ndarray[bool, ndim=2], optional + If not None, these specify locations in the output that are NA. + Modified in-place. Notes ----- @@ -1215,6 +1223,8 @@ cdef group_min_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] group_min_or_max bint runtime_error = False int64_t[:, ::1] nobs + bint uses_mask = mask is not None + bint isna_entry # TODO(cython 3.0): # Instead of `labels.shape[0]` use `len(labels)` @@ -1249,7 +1259,12 @@ cdef group_min_max(groupby_t[:, ::1] out, for j in range(K): val = values[i, j] - if not _treat_as_na(val, is_datetimelike): + if uses_mask: + isna_entry = mask[i, j] + else: + isna_entry = _treat_as_na(val, is_datetimelike) + + if not isna_entry: nobs[lab, j] += 1 if compute_max: if val > group_min_or_max[lab, j]: @@ -1265,7 +1280,10 @@ cdef group_min_max(groupby_t[:, ::1] out, runtime_error = True break else: - out[i, j] = nan_val + if uses_mask: + result_mask[i, j] = True + else: + out[i, j] = nan_val else: out[i, j] = group_min_or_max[i, j] @@ -1282,7 +1300,9 @@ def group_max(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, - bint is_datetimelike=False) -> None: + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1292,6 +1312,8 @@ def group_max(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=True, + mask=mask, + result_mask=result_mask, ) @@ -1302,7 +1324,9 @@ def group_min(groupby_t[:, ::1] out, ndarray[groupby_t, ndim=2] values, const intp_t[::1] labels, Py_ssize_t min_count=-1, - bint is_datetimelike=False) -> None: + bint is_datetimelike=False, + const uint8_t[:, ::1] mask=None, + uint8_t[:, ::1] result_mask=None) -> None: """See group_min_max.__doc__""" group_min_max( out, @@ -1312,6 +1336,8 @@ def group_min(groupby_t[:, ::1] out, min_count=min_count, is_datetimelike=is_datetimelike, compute_max=False, + mask=mask, + result_mask=result_mask, ) diff --git a/pandas/_libs/hashing.pyi b/pandas/_libs/hashing.pyi index 2844ec9b06557..8361026e4a87d 100644 --- a/pandas/_libs/hashing.pyi +++ b/pandas/_libs/hashing.pyi @@ -1,7 +1,9 @@ import numpy as np +from pandas._typing import npt + def hash_object_array( - arr: np.ndarray, # np.ndarray[object] + arr: npt.NDArray[np.object_], key: str, encoding: str = ..., -) -> np.ndarray: ... # np.ndarray[np.uint64] +) -> npt.NDArray[np.uint64]: ... 
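The ``mask``/``result_mask`` arguments threaded through ``group_min_max`` above back the whatsnew entry about :meth:`GroupBy.min`/:meth:`GroupBy.max` losing precision with nullable integer dtypes (:issue:`41743`). A hedged, user-facing sketch of the behaviour this enables, with illustrative values chosen near ``2**53`` where a float64 round-trip would be lossy:

```python
# With the masked kernels, min/max on a nullable Int64 column can stay in the
# extension dtype instead of round-tripping through float64 (values illustrative).
import pandas as pd

df = pd.DataFrame(
    {
        "key": ["a", "a", "b"],
        "val": pd.array([2**53 + 1, 2**53 + 3, pd.NA], dtype="Int64"),
    }
)

print(df.groupby("key")["val"].max())
```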
diff --git a/pandas/_libs/index.pyi b/pandas/_libs/index.pyi index 6bb332435be63..c4a695acc2768 100644 --- a/pandas/_libs/index.pyi +++ b/pandas/_libs/index.pyi @@ -1,5 +1,9 @@ import numpy as np +from pandas._typing import npt + +from pandas import MultiIndex + class IndexEngine: over_size_threshold: bool def __init__(self, vgetter, n: int): ... @@ -16,21 +20,18 @@ class IndexEngine: def is_monotonic_decreasing(self) -> bool: ... def get_backfill_indexer( self, other: np.ndarray, limit: int | None = ... - ) -> np.ndarray: ... + ) -> npt.NDArray[np.intp]: ... def get_pad_indexer( self, other: np.ndarray, limit: int | None = ... - ) -> np.ndarray: ... + ) -> npt.NDArray[np.intp]: ... @property def is_mapping_populated(self) -> bool: ... def clear_mapping(self): ... - def get_indexer(self, values: np.ndarray) -> np.ndarray: ... # np.ndarray[np.intp] + def get_indexer(self, values: np.ndarray) -> npt.NDArray[np.intp]: ... def get_indexer_non_unique( self, targets: np.ndarray, - ) -> tuple[ - np.ndarray, # np.ndarray[np.intp] - np.ndarray, # np.ndarray[np.intp] - ]: ... + ) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: ... class Float64Engine(IndexEngine): ... class Float32Engine(IndexEngine): ... @@ -58,9 +59,9 @@ class BaseMultiIndexCodesEngine: ): ... def get_indexer( self, - target: np.ndarray, # np.ndarray[object] - ) -> np.ndarray: ... # np.ndarray[np.intp] - def _extract_level_codes(self, target: object): ... + target: npt.NDArray[np.object_], + ) -> npt.NDArray[np.intp]: ... + def _extract_level_codes(self, target: MultiIndex) -> np.ndarray: ... def get_indexer_with_fill( self, target: np.ndarray, # np.ndarray[object] of tuples diff --git a/pandas/_libs/index.pyx b/pandas/_libs/index.pyx index f2e2abd16b985..7aff683173855 100644 --- a/pandas/_libs/index.pyx +++ b/pandas/_libs/index.pyx @@ -603,26 +603,26 @@ cdef class BaseMultiIndexCodesEngine: def _codes_to_ints(self, ndarray[uint64_t] codes) -> np.ndarray: raise NotImplementedError("Implemented by subclass") - def _extract_level_codes(self, ndarray[object] target) -> np.ndarray: + def _extract_level_codes(self, target) -> np.ndarray: """ Map the requested list of (tuple) keys to their integer representations for searching in the underlying integer index. Parameters ---------- - target : ndarray[object] - Each key is a tuple, with a label for each level of the index. 
+ target : MultiIndex Returns ------ int_keys : 1-dimensional array of dtype uint64 or object Integers representing one combination each """ + zt = [target._get_level_values(i) for i in range(target.nlevels)] level_codes = [lev.get_indexer(codes) + 1 for lev, codes - in zip(self.levels, zip(*target))] + in zip(self.levels, zt)] return self._codes_to_ints(np.array(level_codes, dtype='uint64').T) - def get_indexer(self, ndarray[object] target) -> np.ndarray: + def get_indexer(self, target: np.ndarray) -> np.ndarray: """ Returns an array giving the positions of each value of `target` in `self.values`, where -1 represents a value in `target` which does not @@ -630,16 +630,14 @@ cdef class BaseMultiIndexCodesEngine: Parameters ---------- - target : ndarray[object] - Each key is a tuple, with a label for each level of the index + target : np.ndarray Returns ------- np.ndarray[intp_t, ndim=1] of the indexer of `target` into `self.values` """ - lab_ints = self._extract_level_codes(target) - return self._base.get_indexer(self, lab_ints) + return self._base.get_indexer(self, target) def get_indexer_with_fill(self, ndarray target, ndarray values, str method, object limit) -> np.ndarray: @@ -742,10 +740,9 @@ cdef class BaseMultiIndexCodesEngine: return self._base.get_loc(self, lab_int) - def get_indexer_non_unique(self, ndarray[object] target): - - lab_ints = self._extract_level_codes(target) - indexer = self._base.get_indexer_non_unique(self, lab_ints) + def get_indexer_non_unique(self, target: np.ndarray) -> np.ndarray: + # target: MultiIndex + indexer = self._base.get_indexer_non_unique(self, target) return indexer diff --git a/pandas/_libs/internals.pyi b/pandas/_libs/internals.pyi index 1791cbb85c355..6542b7a251644 100644 --- a/pandas/_libs/internals.pyi +++ b/pandas/_libs/internals.pyi @@ -82,3 +82,4 @@ class BlockManager: self, blocks: tuple[B, ...], axes: list[Index], verify_integrity=True ): ... def get_slice(self: T, slobj: slice, axis: int = ...) -> T: ... + def _rebuild_blknos_and_blklocs(self) -> None: ... diff --git a/pandas/_libs/internals.pyx b/pandas/_libs/internals.pyx index 559359bdf3353..18af600ef4046 100644 --- a/pandas/_libs/internals.pyx +++ b/pandas/_libs/internals.pyx @@ -669,6 +669,43 @@ cdef class BlockManager: self._blknos = None self._blklocs = None + # ------------------------------------------------------------------- + # Block Placement + + def _rebuild_blknos_and_blklocs(self) -> None: + """ + Update mgr._blknos / mgr._blklocs. + """ + cdef: + intp_t blkno, i, j + cnp.npy_intp length = self.shape[0] + SharedBlock blk + BlockPlacement bp + + # equiv: np.empty(length, dtype=np.intp) + new_blknos = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) + new_blklocs = cnp.PyArray_EMPTY(1, &length, cnp.NPY_INTP, 0) + new_blknos.fill(-1) + new_blklocs.fill(-1) + + for blkno, blk in enumerate(self.blocks): + bp = blk.mgr_locs + # Iterating over `bp` is a faster equivalent to + # new_blknos[bp.indexer] = blkno + # new_blklocs[bp.indexer] = np.arange(len(bp)) + for i, j in enumerate(bp): + new_blknos[j] = blkno + new_blklocs[j] = i + + for blkno in new_blknos: + # If there are any -1s remaining, this indicates that our mgr_locs + # are invalid. 
+ if blkno == -1: + raise AssertionError("Gaps in blk ref_locs") + + self._blknos = new_blknos + self._blklocs = new_blklocs + # ------------------------------------------------------------------- # Pickle diff --git a/pandas/_libs/lib.pyx b/pandas/_libs/lib.pyx index 76955a7e27679..c9548a7e05fc5 100644 --- a/pandas/_libs/lib.pyx +++ b/pandas/_libs/lib.pyx @@ -1092,7 +1092,7 @@ def is_list_like(obj: object, allow_sets: bool = True) -> bool: cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1: return ( # equiv: `isinstance(obj, abc.Iterable)` - hasattr(obj, "__iter__") and not isinstance(obj, type) + getattr(obj, "__iter__", None) is not None and not isinstance(obj, type) # we do not count strings/unicode/bytes as list-like and not isinstance(obj, (str, bytes)) # exclude zero-dimensional numpy arrays, effectively scalars diff --git a/pandas/_libs/reduction.pyx b/pandas/_libs/reduction.pyx index 4d3bdde357e88..77fd4d94d05ac 100644 --- a/pandas/_libs/reduction.pyx +++ b/pandas/_libs/reduction.pyx @@ -64,9 +64,6 @@ cdef class _BaseGrouper: cdef inline _update_cached_objs(self, object cached_series, object cached_index, Slider islider, Slider vslider): - # See the comment in indexes/base.py about _index_data. - # We need this for EA-backed indexes that have a reference - # to a 1-d ndarray like datetime / timedelta / period. cached_index._engine.clear_mapping() cached_index._cache.clear() # e.g. inferred_freq must go cached_series._mgr.set_values(vslider.buf) @@ -93,105 +90,6 @@ cdef class _BaseGrouper: return res, initialized -cdef class SeriesBinGrouper(_BaseGrouper): - """ - Performs grouping operation according to bin edges, rather than labels - """ - cdef: - Py_ssize_t nresults, ngroups - - cdef public: - ndarray bins # ndarray[int64_t] - ndarray arr, index, dummy_arr, dummy_index - object values, f, typ, ityp, name, idtype - - def __init__(self, object series, object f, ndarray[int64_t] bins): - - assert len(bins) > 0 # otherwise we get IndexError in get_result - - self.bins = bins - self.f = f - - values = series.values - if is_array(values) and not values.flags.c_contiguous: - # e.g. Categorical has no `flags` attribute - values = values.copy('C') - self.arr = values - self.typ = series._constructor - self.ityp = series.index._constructor - self.idtype = series.index.dtype - self.index = series.index.values - self.name = series.name - - dummy = series.iloc[:0] - self.dummy_arr, self.dummy_index = self._check_dummy(dummy) - - # kludge for #1688 - if len(bins) > 0 and bins[-1] == len(series): - self.ngroups = len(bins) - else: - # TODO: not reached except in test_series_bin_grouper directly - # constructing SeriesBinGrouper; can we rule this case out? 
- self.ngroups = len(bins) + 1 - - def get_result(self): - cdef: - ndarray arr, result - ndarray[int64_t] counts - Py_ssize_t i, n, group_size, start, end - object res - bint initialized = 0 - Slider vslider, islider - object cached_series = None, cached_index = None - - counts = np.zeros(self.ngroups, dtype=np.int64) - - if self.ngroups > 0: - counts[0] = self.bins[0] - for i in range(1, self.ngroups): - if i == self.ngroups - 1: - counts[i] = len(self.arr) - self.bins[i - 1] - else: - counts[i] = self.bins[i] - self.bins[i - 1] - - group_size = 0 - n = len(self.arr) - - vslider = Slider(self.arr, self.dummy_arr) - islider = Slider(self.index, self.dummy_index) - - result = np.empty(self.ngroups, dtype='O') - - cached_index, cached_series = self._init_dummy_series_and_index( - islider, vslider - ) - - start = 0 - try: - for i in range(self.ngroups): - group_size = counts[i] - end = start + group_size - - islider.move(start, end) - vslider.move(start, end) - - self._update_cached_objs( - cached_series, cached_index, islider, vslider) - - res, initialized = self._apply_to_group(cached_series, cached_index, - initialized) - start += group_size - - result[i] = res - - finally: - # so we don't free the wrong memory - islider.reset() - vslider.reset() - - return result, counts - - cdef class SeriesGrouper(_BaseGrouper): """ Performs generic grouping operation while avoiding ndarray construction diff --git a/pandas/_libs/tslib.pyx b/pandas/_libs/tslib.pyx index 6b1c0f851f8e7..6feb9ec768655 100644 --- a/pandas/_libs/tslib.pyx +++ b/pandas/_libs/tslib.pyx @@ -248,7 +248,7 @@ def array_with_unit_to_datetime( # if we have nulls that are not type-compat # then need to iterate - if values.dtype.kind == "i" or values.dtype.kind == "f": + if values.dtype.kind in ["i", "f", "u"]: iresult = values.astype("i8", copy=False) # fill missing values by comparing to NPY_NAT mask = iresult == NPY_NAT @@ -263,7 +263,7 @@ def array_with_unit_to_datetime( ): raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'") - if values.dtype.kind == "i": + if values.dtype.kind in ["i", "u"]: result = (iresult * m).astype("M8[ns]") elif values.dtype.kind == "f": diff --git a/pandas/_libs/tslibs/conversion.pyi b/pandas/_libs/tslibs/conversion.pyi index e74a56a519c5a..3d0288160e386 100644 --- a/pandas/_libs/tslibs/conversion.pyi +++ b/pandas/_libs/tslibs/conversion.pyi @@ -5,6 +5,8 @@ from datetime import ( import numpy as np +from pandas._typing import npt + DT64NS_DTYPE: np.dtype TD64NS_DTYPE: np.dtype @@ -22,6 +24,6 @@ def ensure_timedelta64ns( copy: bool = ..., ) -> np.ndarray: ... # np.ndarray[timedelta64ns] def datetime_to_datetime64( - values: np.ndarray, # np.ndarray[object] + values: npt.NDArray[np.object_], ) -> tuple[np.ndarray, tzinfo | None,]: ... # (np.ndarray[dt64ns], _) def localize_pydatetime(dt: datetime, tz: tzinfo | None) -> datetime: ... diff --git a/pandas/_libs/tslibs/fields.pyi b/pandas/_libs/tslibs/fields.pyi index 244af38e25da0..c6ebb1618f1f2 100644 --- a/pandas/_libs/tslibs/fields.pyi +++ b/pandas/_libs/tslibs/fields.pyi @@ -1,33 +1,35 @@ import numpy as np +from pandas._typing import npt + def build_field_sarray( - dtindex: np.ndarray, # const int64_t[:] + dtindex: npt.NDArray[np.int64], # const int64_t[:] ) -> np.ndarray: ... def month_position_check(fields, weekdays) -> str | None: ... def get_date_name_field( - dtindex: np.ndarray, # const int64_t[:] + dtindex: npt.NDArray[np.int64], # const int64_t[:] field: str, locale=..., -) -> np.ndarray: ... 
# np.ndarray[object] +) -> npt.NDArray[np.object_]: ... def get_start_end_field( - dtindex: np.ndarray, # const int64_t[:] + dtindex: npt.NDArray[np.int64], # const int64_t[:] field: str, freqstr: str | None = ..., month_kw: int = ..., -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def get_date_field( - dtindex: np.ndarray, # const int64_t[:] + dtindex: npt.NDArray[np.int64], # const int64_t[:] field: str, -) -> np.ndarray: ... # np.ndarray[in32] +) -> npt.NDArray[np.int32]: ... def get_timedelta_field( tdindex: np.ndarray, # const int64_t[:] field: str, -) -> np.ndarray: ... # np.ndarray[int32] +) -> npt.NDArray[np.int32]: ... def isleapyear_arr( years: np.ndarray, -) -> np.ndarray: ... # np.ndarray[bool] +) -> npt.NDArray[np.bool_]: ... def build_isocalendar_sarray( - dtindex: np.ndarray, # const int64_t[:] + dtindex: npt.NDArray[np.int64], # const int64_t[:] ) -> np.ndarray: ... def get_locale_names(name_type: str, locale: object = None): ... @@ -44,7 +46,7 @@ class RoundTo: def NEAREST_HALF_MINUS_INFTY(self) -> int: ... def round_nsint64( - values: np.ndarray, # np.ndarray[np.int64] + values: npt.NDArray[np.int64], mode: RoundTo, nanos: int, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... diff --git a/pandas/_libs/tslibs/parsing.pyi b/pandas/_libs/tslibs/parsing.pyi index fc08a48cee343..6a96b05d53c37 100644 --- a/pandas/_libs/tslibs/parsing.pyi +++ b/pandas/_libs/tslibs/parsing.pyi @@ -3,6 +3,7 @@ from datetime import datetime import numpy as np from pandas._libs.tslibs.offsets import BaseOffset +from pandas._typing import npt class DateParseError(ValueError): ... @@ -21,32 +22,32 @@ def parse_time_string( def _does_string_look_like_datetime(py_string: str) -> bool: ... def quarter_to_myear(year: int, quarter: int, freq: str) -> tuple[int, int]: ... def try_parse_dates( - values: np.ndarray, # object[:] + values: npt.NDArray[np.object_], # object[:] parser=..., dayfirst: bool = ..., default: datetime | None = ..., -) -> np.ndarray: ... # np.ndarray[object] +) -> npt.NDArray[np.object_]: ... def try_parse_date_and_time( - dates: np.ndarray, # object[:] - times: np.ndarray, # object[:] + dates: npt.NDArray[np.object_], # object[:] + times: npt.NDArray[np.object_], # object[:] date_parser=..., time_parser=..., dayfirst: bool = ..., default: datetime | None = ..., -) -> np.ndarray: ... # np.ndarray[object] +) -> npt.NDArray[np.object_]: ... def try_parse_year_month_day( - years: np.ndarray, # object[:] - months: np.ndarray, # object[:] - days: np.ndarray, # object[:] -) -> np.ndarray: ... # np.ndarray[object] + years: npt.NDArray[np.object_], # object[:] + months: npt.NDArray[np.object_], # object[:] + days: npt.NDArray[np.object_], # object[:] +) -> npt.NDArray[np.object_]: ... def try_parse_datetime_components( - years: np.ndarray, # object[:] - months: np.ndarray, # object[:] - days: np.ndarray, # object[:] - hours: np.ndarray, # object[:] - minutes: np.ndarray, # object[:] - seconds: np.ndarray, # object[:] -) -> np.ndarray: ... # np.ndarray[object] + years: npt.NDArray[np.object_], # object[:] + months: npt.NDArray[np.object_], # object[:] + days: npt.NDArray[np.object_], # object[:] + hours: npt.NDArray[np.object_], # object[:] + minutes: npt.NDArray[np.object_], # object[:] + seconds: npt.NDArray[np.object_], # object[:] +) -> npt.NDArray[np.object_]: ... def format_is_iso(f: str) -> bool: ... 
def guess_datetime_format( dt_str, @@ -57,5 +58,5 @@ def guess_datetime_format( def concat_date_cols( date_cols: tuple, keep_trivial_numbers: bool = ..., -) -> np.ndarray: ... # np.ndarray[object] +) -> npt.NDArray[np.object_]: ... def get_rule_month(source: str) -> str: ... diff --git a/pandas/_libs/tslibs/period.pyi b/pandas/_libs/tslibs/period.pyi index 97738d51b5a0e..e5455fa55c5ef 100644 --- a/pandas/_libs/tslibs/period.pyi +++ b/pandas/_libs/tslibs/period.pyi @@ -8,6 +8,7 @@ from pandas._libs.tslibs.timestamps import Timestamp from pandas._typing import ( Frequency, Timezone, + npt, ) INVALID_FREQ_ERR_MSG: str @@ -16,30 +17,30 @@ DIFFERENT_FREQ: str class IncompatibleFrequency(ValueError): ... def periodarr_to_dt64arr( - periodarr: np.ndarray, # const int64_t[:] + periodarr: npt.NDArray[np.int64], # const int64_t[:] freq: int, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def period_asfreq_arr( - arr: np.ndarray, # ndarray[int64_t] arr, + arr: npt.NDArray[np.int64], freq1: int, freq2: int, end: bool, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def get_period_field_arr( field: str, - arr: np.ndarray, # const int64_t[:] + arr: npt.NDArray[np.int64], # const int64_t[:] freq: int, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def from_ordinals( - values: np.ndarray, # const int64_t[:] + values: npt.NDArray[np.int64], # const int64_t[:] freq: Frequency, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def extract_ordinals( - values: np.ndarray, # np.ndarray[object] + values: npt.NDArray[np.object_], freq: Frequency | int, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def extract_freq( - values: np.ndarray, # np.ndarray[object] + values: npt.NDArray[np.object_], ) -> BaseOffset: ... # exposed for tests diff --git a/pandas/_libs/tslibs/strptime.pyi b/pandas/_libs/tslibs/strptime.pyi index 891e257bcbcb4..cf7ae8508a45f 100644 --- a/pandas/_libs/tslibs/strptime.pyi +++ b/pandas/_libs/tslibs/strptime.pyi @@ -1,7 +1,9 @@ import numpy as np +from pandas._typing import npt + def array_strptime( - values: np.ndarray, # np.ndarray[object] + values: npt.NDArray[np.object_], fmt: str | None, exact: bool = True, errors: str = "raise", diff --git a/pandas/_libs/tslibs/timedeltas.pyi b/pandas/_libs/tslibs/timedeltas.pyi index 31a836b2c2079..8de02aa566456 100644 --- a/pandas/_libs/tslibs/timedeltas.pyi +++ b/pandas/_libs/tslibs/timedeltas.pyi @@ -7,6 +7,7 @@ from typing import ( ) import numpy as np +from pandas._typing import npt from pandas._libs.tslibs import ( NaTType, @@ -16,11 +17,11 @@ from pandas._libs.tslibs import ( _S = TypeVar("_S") def ints_to_pytimedelta( - arr: np.ndarray, # const int64_t[:] + arr: npt.NDArray[np.int64], # const int64_t[:] box: bool = ..., -) -> np.ndarray: ... # np.ndarray[object] +) -> npt.NDArray[np.object_]: ... def array_to_timedelta64( - values: np.ndarray, # ndarray[object] + values: npt.NDArray[np.object_], unit: str | None = ..., errors: str = ..., ) -> np.ndarray: ...
# np.ndarray[m8ns] diff --git a/pandas/_libs/tslibs/timezones.pyi b/pandas/_libs/tslibs/timezones.pyi index a631191f8b005..20c403e93b149 100644 --- a/pandas/_libs/tslibs/timezones.pyi +++ b/pandas/_libs/tslibs/timezones.pyi @@ -6,6 +6,8 @@ from typing import Callable import numpy as np +from pandas._typing import npt + # imported from dateutil.tz dateutil_gettz: Callable[[str], tzinfo] @@ -15,9 +17,9 @@ def infer_tzinfo( start: datetime | None, end: datetime | None, ) -> tzinfo | None: ... - -# ndarrays returned are both int64_t -def get_dst_info(tz: tzinfo) -> tuple[np.ndarray, np.ndarray, str]: ... +def get_dst_info( + tz: tzinfo, +) -> tuple[npt.NDArray[np.int64], npt.NDArray[np.int64], str]: ... def maybe_get_tz(tz: str | int | np.int64 | tzinfo | None) -> tzinfo | None: ... def get_timezone(tz: tzinfo) -> tzinfo | str: ... def is_utc(tz: tzinfo | None) -> bool: ... diff --git a/pandas/_libs/tslibs/tzconversion.pyi b/pandas/_libs/tslibs/tzconversion.pyi index 1cbe55320099b..614c97a1ae0cc 100644 --- a/pandas/_libs/tslibs/tzconversion.pyi +++ b/pandas/_libs/tslibs/tzconversion.pyi @@ -6,14 +6,16 @@ from typing import Iterable import numpy as np +from pandas._typing import npt + def tz_convert_from_utc( - vals: np.ndarray, # const int64_t[:] + vals: npt.NDArray[np.int64], # const int64_t[:] tz: tzinfo, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def tz_convert_from_utc_single(val: np.int64, tz: tzinfo) -> np.int64: ... def tz_localize_to_utc( - vals: np.ndarray, # np.ndarray[np.int64] + vals: npt.NDArray[np.int64], tz: tzinfo | None, ambiguous: str | bool | Iterable[bool] | None = None, nonexistent: str | timedelta | np.timedelta64 | None = None, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... diff --git a/pandas/_libs/tslibs/vectorized.pyi b/pandas/_libs/tslibs/vectorized.pyi index 2a23289cdf61b..a53bab26ff42b 100644 --- a/pandas/_libs/tslibs/vectorized.pyi +++ b/pandas/_libs/tslibs/vectorized.pyi @@ -8,28 +8,29 @@ import numpy as np from pandas._libs.tslibs.dtypes import Resolution from pandas._libs.tslibs.offsets import BaseOffset +from pandas._typing import npt def dt64arr_to_periodarr( - stamps: np.ndarray, # const int64_t[:] + stamps: npt.NDArray[np.int64], # const int64_t[:] freq: int, tz: tzinfo | None, -) -> np.ndarray: ... # np.ndarray[np.int64, ndim=1] +) -> npt.NDArray[np.int64]: ... # np.ndarray[np.int64, ndim=1] def is_date_array_normalized( - stamps: np.ndarray, # const int64_t[:] + stamps: npt.NDArray[np.int64], # const int64_t[:] tz: tzinfo | None = None, ) -> bool: ... def normalize_i8_timestamps( - stamps: np.ndarray, # const int64_t[:] + stamps: npt.NDArray[np.int64], # const int64_t[:] tz: tzinfo | None, -) -> np.ndarray: ... # np.ndarray[np.int64] +) -> npt.NDArray[np.int64]: ... def get_resolution( - stamps: np.ndarray, # const int64_t[:] + stamps: npt.NDArray[np.int64], # const int64_t[:] tz: tzinfo | None = None, ) -> Resolution: ... def ints_to_pydatetime( - arr: np.ndarray, # const int64_t[:}] + arr: npt.NDArray[np.int64], # const int64_t[:}] tz: tzinfo | None = None, freq: str | BaseOffset | None = None, fold: bool = False, box: str = "datetime", -) -> np.ndarray: ... # np.ndarray[object] +) -> npt.NDArray[np.object_]: ... 
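The stub changes above (fields, parsing, period, strptime, timedeltas, timezones, tzconversion, vectorized) mostly replace comment-style dtype notes such as `-> np.ndarray: ...  # np.ndarray[np.int64]` with `numpy.typing.NDArray`, so the element dtype becomes part of the checked type instead of a comment. A small standalone illustration of the style, with a hypothetical helper:

```python
import numpy as np
import numpy.typing as npt

def normalize_day(stamps: npt.NDArray[np.int64]) -> npt.NDArray[np.int64]:
    # The element dtype lives in the annotation, so a type checker can verify
    # both the argument and the returned array rather than trusting a comment.
    return stamps - (stamps % 86_400_000_000_000)

normalize_day(np.array([1, 2, 3], dtype=np.int64))   # OK
# normalize_day(np.array([1.5, 2.5]))                # flagged by mypy: float64 is not int64
```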
diff --git a/pandas/_testing/__init__.py b/pandas/_testing/__init__.py index fd74409e39a82..c54185e324646 100644 --- a/pandas/_testing/__init__.py +++ b/pandas/_testing/__init__.py @@ -45,15 +45,12 @@ CategoricalIndex, DataFrame, DatetimeIndex, - Float64Index, Index, - Int64Index, IntervalIndex, MultiIndex, NumericIndex, RangeIndex, Series, - UInt64Index, bdate_range, ) from pandas._testing._io import ( # noqa:F401 @@ -106,6 +103,11 @@ use_numexpr, with_csv_dialect, ) +from pandas.core.api import ( + Float64Index, + Int64Index, + UInt64Index, +) from pandas.core.arrays import ( DatetimeArray, PandasArray, diff --git a/pandas/_typing.py b/pandas/_typing.py index ef9f38bbf5168..5077e659410e3 100644 --- a/pandas/_typing.py +++ b/pandas/_typing.py @@ -69,6 +69,11 @@ from pandas.io.formats.format import EngFormatter from pandas.tseries.offsets import DateOffset + + # numpy compatible types + NumpyValueArrayLike = Union[npt._ScalarLike_co, npt.ArrayLike] + NumpySorter = Optional[npt._ArrayLikeInt_co] + else: npt: Any = None @@ -85,6 +90,7 @@ PandasScalar = Union["Period", "Timestamp", "Timedelta", "Interval"] Scalar = Union[PythonScalar, PandasScalar] + # timestamp and timedelta convertible types TimestampConvertibleTypes = Union[ @@ -120,10 +126,9 @@ ] # dtypes -NpDtype = Union[str, np.dtype] -Dtype = Union[ - "ExtensionDtype", NpDtype, type_t[Union[str, float, int, complex, bool, object]] -] +NpDtype = Union[str, np.dtype, type_t[Union[str, float, int, complex, bool, object]]] +Dtype = Union["ExtensionDtype", NpDtype] +AstypeArg = Union["ExtensionDtype", "npt.DTypeLike"] # DtypeArg specifies all allowable dtypes in a functions its dtype argument DtypeArg = Union[Dtype, Dict[Hashable, Dtype]] DtypeObj = Union[np.dtype, "ExtensionDtype"] @@ -201,10 +206,16 @@ # indexing # PositionalIndexer -> valid 1D positional indexer, e.g. can pass # to ndarray.__getitem__ +# ScalarIndexer is for a single value as the index +# SequenceIndexer is for list-like or slices (but not tuples) +# PositionalIndexerTuple extends the PositionalIndexer for 2D arrays +# These are used in various __getitem__ overloads # TODO: add Ellipsis, see # https://github.com/python/typing/issues/684#issuecomment-548203158 # https://bugs.python.org/issue41810 -PositionalIndexer = Union[int, np.integer, slice, Sequence[int], np.ndarray] -PositionalIndexer2D = Union[ - PositionalIndexer, Tuple[PositionalIndexer, PositionalIndexer] -] +# Using List[int] here rather than Sequence[int] to disallow tuples. +ScalarIndexer = Union[int, np.integer] +SequenceIndexer = Union[slice, List[int], np.ndarray] +PositionalIndexer = Union[ScalarIndexer, SequenceIndexer] +PositionalIndexerTuple = Tuple[PositionalIndexer, PositionalIndexer] +PositionalIndexer2D = Union[PositionalIndexer, PositionalIndexerTuple] diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 4f9dd61b8e0da..bb4e0dff0f4c7 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -82,6 +82,11 @@ if TYPE_CHECKING: + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import ( Categorical, DataFrame, @@ -433,7 +438,7 @@ def unique(values): unique1d = unique -def isin(comps: AnyArrayLike, values: AnyArrayLike) -> np.ndarray: +def isin(comps: AnyArrayLike, values: AnyArrayLike) -> npt.NDArray[np.bool_]: """ Compute the isin boolean array.
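The new `NumpyValueArrayLike` / `NumpySorter` aliases introduced above, together with the `npt.NDArray[np.intp] | np.intp` return type used for `searchsorted` in the hunks below, encode NumPy's own convention: a scalar `value` yields a single position, an array-like yields an array of positions. A quick reminder of that convention using plain NumPy, nothing pandas-specific:

```python
import numpy as np

arr = np.array([1, 3, 5, 7])

pos = np.searchsorted(arr, 4)             # scalar value     -> single integer (np.intp)
positions = np.searchsorted(arr, [0, 6])  # array-like value -> ndarray of np.intp

print(pos, positions)   # 2 [0 3]
```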
@@ -903,7 +908,7 @@ def value_counts_arraylike(values, dropna: bool): def duplicated( values: ArrayLike, keep: Literal["first", "last", False] = "first" -) -> np.ndarray: +) -> npt.NDArray[np.bool_]: """ Return boolean ndarray denoting duplicate values. @@ -1032,8 +1037,8 @@ def rank( def checked_add_with_arr( arr: np.ndarray, b, - arr_mask: np.ndarray | None = None, - b_mask: np.ndarray | None = None, + arr_mask: npt.NDArray[np.bool_] | None = None, + b_mask: npt.NDArray[np.bool_] | None = None, ) -> np.ndarray: """ Perform array addition that checks for underflow and overflow. @@ -1247,6 +1252,8 @@ class SelectNSeries(SelectN): def compute(self, method: str) -> Series: + from pandas.core.reshape.concat import concat + n = self.n dtype = self.obj.dtype if not self.is_valid_dtype_n_method(dtype): @@ -1256,6 +1263,7 @@ def compute(self, method: str) -> Series: return self.obj[[]] dropped = self.obj.dropna() + nan_index = self.obj.drop(dropped.index) if is_extension_array_dtype(dropped.dtype): # GH#41816 bc we have dropped NAs above, MaskedArrays can use the @@ -1272,7 +1280,7 @@ def compute(self, method: str) -> Series: # slow method if n >= len(self.obj): ascending = method == "nsmallest" - return dropped.sort_values(ascending=ascending).head(n) + return self.obj.sort_values(ascending=ascending).head(n) # fast method new_dtype = dropped.dtype @@ -1290,6 +1298,8 @@ def compute(self, method: str) -> Series: if self.keep == "last": arr = arr[::-1] + nbase = n + findex = len(self.obj) narr = len(arr) n = min(n, narr) @@ -1301,12 +1311,13 @@ def compute(self, method: str) -> Series: if self.keep != "all": inds = inds[:n] + findex = nbase if self.keep == "last": # reverse indices inds = narr - 1 - inds - return dropped.iloc[inds] + return concat([dropped.iloc[inds], nan_index]).iloc[:findex] class SelectNFrame(SelectN): @@ -1334,7 +1345,7 @@ def __init__(self, obj, n: int, keep: str, columns): def compute(self, method: str) -> DataFrame: - from pandas import Int64Index + from pandas.core.api import Int64Index n = self.n frame = self.obj @@ -1517,7 +1528,12 @@ def take( # ------------ # -def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: +def searchsorted( + arr: ArrayLike, + value: NumpyValueArrayLike, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, +) -> npt.NDArray[np.intp] | np.intp: """ Find indices where elements should be inserted to maintain order. @@ -1554,8 +1570,9 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: Returns ------- - array of ints - Array of insertion points with the same shape as `value`. + array of ints or int + If value is array-like, array of insertion points. + If value is scalar, a single integer. 
See Also -------- @@ -1583,9 +1600,10 @@ def searchsorted(arr, value, side="left", sorter=None) -> np.ndarray: dtype = value_arr.dtype if is_scalar(value): - value = dtype.type(value) + # We know that value is int + value = cast(int, dtype.type(value)) else: - value = pd_array(value, dtype=dtype) + value = pd_array(cast(ArrayLike, value), dtype=dtype) elif not ( is_object_dtype(arr) or is_numeric_dtype(arr) or is_categorical_dtype(arr) ): diff --git a/pandas/core/arrays/_mixins.py b/pandas/core/arrays/_mixins.py index 0e8097cf1fc78..4c7ccc2f16477 100644 --- a/pandas/core/arrays/_mixins.py +++ b/pandas/core/arrays/_mixins.py @@ -2,10 +2,13 @@ from functools import wraps from typing import ( + TYPE_CHECKING, Any, + Literal, Sequence, TypeVar, cast, + overload, ) import numpy as np @@ -15,7 +18,11 @@ from pandas._typing import ( F, PositionalIndexer2D, + PositionalIndexerTuple, + ScalarIndexer, + SequenceIndexer, Shape, + npt, type_t, ) from pandas.errors import AbstractMethodError @@ -45,6 +52,13 @@ "NDArrayBackedExtensionArrayT", bound="NDArrayBackedExtensionArray" ) +if TYPE_CHECKING: + + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + def ravel_compat(meth: F) -> F: """ @@ -157,12 +171,22 @@ def _concat_same_type( return to_concat[0]._from_backing_data(new_values) # type: ignore[arg-type] @doc(ExtensionArray.searchsorted) - def searchsorted(self, value, side="left", sorter=None): - value = self._validate_searchsorted_value(value) - return self._ndarray.searchsorted(value, side=side, sorter=sorter) - - def _validate_searchsorted_value(self, value): - return value + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + npvalue = self._validate_searchsorted_value(value) + return self._ndarray.searchsorted(npvalue, side=side, sorter=sorter) + + def _validate_searchsorted_value( + self, value: NumpyValueArrayLike | ExtensionArray + ) -> NumpyValueArrayLike: + if isinstance(value, ExtensionArray): + return value.to_numpy() + else: + return value @doc(ExtensionArray.shift) def shift(self, periods=1, fill_value=None, axis=0): @@ -185,6 +209,17 @@ def __setitem__(self, key, value): def _validate_setitem_value(self, value): return value + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self: NDArrayBackedExtensionArrayT, + key: SequenceIndexer | PositionalIndexerTuple, + ) -> NDArrayBackedExtensionArrayT: + ... 
+ def __getitem__( self: NDArrayBackedExtensionArrayT, key: PositionalIndexer2D, diff --git a/pandas/core/arrays/base.py b/pandas/core/arrays/base.py index b362769f50fa8..40837ccad6ac8 100644 --- a/pandas/core/arrays/base.py +++ b/pandas/core/arrays/base.py @@ -18,6 +18,7 @@ Sequence, TypeVar, cast, + overload, ) import numpy as np @@ -25,10 +26,14 @@ from pandas._libs import lib from pandas._typing import ( ArrayLike, + AstypeArg, Dtype, FillnaOptions, PositionalIndexer, + ScalarIndexer, + SequenceIndexer, Shape, + npt, ) from pandas.compat import set_function_name from pandas.compat.numpy import function as nv @@ -81,6 +86,11 @@ def any(self, *, skipna: bool = True) -> bool: def all(self, *, skipna: bool = True) -> bool: pass + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + _extension_array_shared_docs: dict[str, str] = {} @@ -290,8 +300,17 @@ def _from_factorized(cls, values, original): # ------------------------------------------------------------------------ # Must be a Sequence # ------------------------------------------------------------------------ + @overload + def __getitem__(self, item: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__(self: ExtensionArrayT, item: SequenceIndexer) -> ExtensionArrayT: + ... - def __getitem__(self, item: PositionalIndexer) -> ExtensionArray | Any: + def __getitem__( + self: ExtensionArrayT, item: PositionalIndexer + ) -> ExtensionArrayT | Any: """ Select a subset of self. @@ -305,6 +324,8 @@ def __getitem__(self, item: PositionalIndexer) -> ExtensionArray | Any: * ndarray: A 1-d boolean NumPy ndarray the same length as 'self' + * list[int]: A list of int + Returns ------- item : scalar or ExtensionArray @@ -514,9 +535,21 @@ def nbytes(self) -> int: # Additional Methods # ------------------------------------------------------------------------ - def astype(self, dtype, copy=True): + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ - Cast to a NumPy array with 'dtype'. + Cast to a NumPy array or ExtensionArray with 'dtype'. Parameters ---------- @@ -529,8 +562,10 @@ def astype(self, dtype, copy=True): Returns ------- - array : ndarray - NumPy ndarray with 'dtype' for its dtype. + array : np.ndarray or ExtensionArray + An ExtensionArray if dtype is StringDtype, + or same as that of underlying array. + Otherwise a NumPy ndarray with 'dtype' for its dtype. """ from pandas.core.arrays.string_ import StringDtype @@ -546,7 +581,11 @@ def astype(self, dtype, copy=True): # allow conversion to StringArrays return dtype.construct_array_type()._from_sequence(self, copy=False) - return np.array(self, dtype=dtype, copy=copy) + # error: Argument "dtype" to "array" has incompatible type + # "Union[ExtensionDtype, dtype[Any]]"; expected "Union[dtype[Any], None, type, + # _SupportsDType, str, Union[Tuple[Any, int], Tuple[Any, Union[int, + # Sequence[int]]], List[Any], _DTypeDict, Tuple[Any, Any]]]" + return np.array(self, dtype=dtype, copy=copy) # type: ignore[arg-type] def isna(self) -> np.ndarray | ExtensionArraySupportsAnyAll: """ @@ -735,7 +774,7 @@ def fillna( new_values = self.copy() return new_values - def dropna(self): + def dropna(self: ExtensionArrayT) -> ExtensionArrayT: """ Return ExtensionArray without NA values. 
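The `@overload` pairs being added to `__getitem__` throughout this patch (`_mixins.py` and `ExtensionArray` above, `Categorical`, the datetime-like arrays, `IntervalArray`, masked and sparse arrays below) all encode the same contract: a scalar indexer returns an element, while a sequence indexer (slice, list of ints, ndarray) returns the same array type. A toy, self-contained version of that pattern; the class and the alias definitions here are illustrative stand-ins, not the pandas types:

```python
from __future__ import annotations

from typing import Any, List, Union, overload

import numpy as np

# Rough stand-ins for the aliases added to pandas/_typing.py above.
ScalarIndexer = Union[int, np.integer]
SequenceIndexer = Union[slice, List[int], np.ndarray]


class ToyArray:
    def __init__(self, data: np.ndarray) -> None:
        self._data = data

    @overload
    def __getitem__(self, key: ScalarIndexer) -> Any: ...
    @overload
    def __getitem__(self, key: SequenceIndexer) -> ToyArray: ...

    def __getitem__(
        self, key: Union[ScalarIndexer, SequenceIndexer]
    ) -> Union[ToyArray, Any]:
        result = self._data[key]
        if np.ndim(result) == 0:      # integer indexing -> a scalar element
            return result.item()
        return ToyArray(result)       # slice / list / ndarray -> the same array type


ta = ToyArray(np.arange(5))
element = ta[2]       # the checker sees a scalar
sub = ta[1:4]         # the checker sees ToyArray
```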
@@ -807,7 +846,12 @@ def unique(self: ExtensionArrayT) -> ExtensionArrayT: uniques = unique(self.astype(object)) return self._from_sequence(uniques, dtype=self.dtype) - def searchsorted(self, value, side="left", sorter=None): + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: """ Find indices where elements should be inserted to maintain order. @@ -838,8 +882,9 @@ def searchsorted(self, value, side="left", sorter=None): Returns ------- - array of ints - Array of insertion points with the same shape as `value`. + array of ints or int + If value is array-like, array of insertion points. + If value is scalar, a single integer. See Also -------- @@ -851,6 +896,8 @@ def searchsorted(self, value, side="left", sorter=None): # 2. Values between the values in the `data_for_sorting` fixture # 3. Missing values. arr = self.astype(object) + if isinstance(value, ExtensionArray): + value = value.astype(object) return arr.searchsorted(value, side=side, sorter=sorter) def equals(self, other: object) -> bool: @@ -1304,7 +1351,7 @@ def _reduce(self, name: str, *, skipna: bool = True, **kwargs): # ------------------------------------------------------------------------ # Non-Optimized Default Methods - def delete(self: ExtensionArrayT, loc) -> ExtensionArrayT: + def delete(self: ExtensionArrayT, loc: PositionalIndexer) -> ExtensionArrayT: indexer = np.delete(np.arange(len(self)), loc) return self.take(indexer) diff --git a/pandas/core/arrays/boolean.py b/pandas/core/arrays/boolean.py index 14d059c04b7c0..9769183700f27 100644 --- a/pandas/core/arrays/boolean.py +++ b/pandas/core/arrays/boolean.py @@ -1,7 +1,10 @@ from __future__ import annotations import numbers -from typing import TYPE_CHECKING +from typing import ( + TYPE_CHECKING, + overload, +) import warnings import numpy as np @@ -12,7 +15,10 @@ ) from pandas._typing import ( ArrayLike, + AstypeArg, Dtype, + DtypeObj, + npt, type_t, ) from pandas.compat.numpy import function as nv @@ -33,6 +39,7 @@ from pandas.core.dtypes.missing import isna from pandas.core import ops +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -147,6 +154,18 @@ def __from_arrow__( else: return BooleanArray._concat_same_type(results) + def _get_common_dtype(self, dtypes: list[DtypeObj]) -> DtypeObj | None: + # Handle only boolean + np.bool_ -> boolean, since other cases like + # Int64 + boolean -> Int64 will be handled by the other type + if all( + isinstance(t, BooleanDtype) + or (isinstance(t, np.dtype) and (np.issubdtype(t, np.bool_))) + for t in dtypes + ): + return BooleanDtype() + else: + return None + def coerce_to_array( values, mask=None, copy: bool = False @@ -392,7 +411,20 @@ def reconstruct(x): def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: return coerce_to_array(value) - def astype(self, dtype, copy: bool = True) -> ArrayLike: + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: + """ Cast to a NumPy array or ExtensionArray with 'dtype'. 
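The three-way `astype` overloads added to `ExtensionArray` and repeated for `BooleanArray` above (and for `Categorical`, the masked and numeric arrays, and `SparseArray` below) follow one shape: a NumPy dtype gives back `np.ndarray`, an `ExtensionDtype` gives back an `ExtensionArray`, and the combined `AstypeArg` union falls back to `ArrayLike`. A condensed sketch of that shape on a hypothetical subclass; real implementations also short-circuit on equal dtypes and honour `copy`:

```python
from typing import Union, overload

import numpy as np
import numpy.typing as npt

from pandas.api.extensions import ExtensionArray, ExtensionDtype

# Local stand-ins for the aliases from pandas/_typing.py.
ArrayLike = Union[ExtensionArray, np.ndarray]
AstypeArg = Union[ExtensionDtype, npt.DTypeLike]


class MyArray(ExtensionArray):
    @overload
    def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: ...
    @overload
    def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: ...
    @overload
    def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: ...

    def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike:
        # An ExtensionDtype round-trips through its own array type; anything
        # NumPy understands produces a plain ndarray.
        if isinstance(dtype, ExtensionDtype):
            return dtype.construct_array_type()._from_sequence(self, dtype=dtype)
        return np.asarray(self, dtype=dtype)
```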
diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index c6be9e5886a1d..543b018c07ea5 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -6,11 +6,13 @@ from shutil import get_terminal_size from typing import ( TYPE_CHECKING, + Any, Hashable, Sequence, TypeVar, Union, cast, + overload, ) from warnings import ( catch_warnings, @@ -32,10 +34,15 @@ from pandas._libs.lib import no_default from pandas._typing import ( ArrayLike, + AstypeArg, Dtype, NpDtype, Ordered, + PositionalIndexer2D, + PositionalIndexerTuple, Scalar, + ScalarIndexer, + SequenceIndexer, Shape, npt, type_t, @@ -482,7 +489,19 @@ def _constructor(self) -> type[Categorical]: def _from_sequence(cls, scalars, *, dtype: Dtype | None = None, copy=False): return Categorical(scalars, dtype=dtype, copy=copy) - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ Coerce this type to another dtype @@ -2003,7 +2022,18 @@ def __repr__(self) -> str: # ------------------------------------------------------------------ - def __getitem__(self, key): + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self: CategoricalT, + key: SequenceIndexer | PositionalIndexerTuple, + ) -> CategoricalT: + ... + + def __getitem__(self: CategoricalT, key: PositionalIndexer2D) -> CategoricalT | Any: """ Return an item. """ @@ -2458,11 +2488,7 @@ def _str_get_dummies(self, sep="|"): # sep may not be in categories. Just bail on this. from pandas.core.arrays import PandasArray - # error: Argument 1 to "PandasArray" has incompatible type - # "ExtensionArray"; expected "Union[ndarray, PandasArray]" - return PandasArray(self.astype(str))._str_get_dummies( # type: ignore[arg-type] - sep - ) + return PandasArray(self.astype(str))._str_get_dummies(sep) # The Series.cat accessor diff --git a/pandas/core/arrays/datetimelike.py b/pandas/core/arrays/datetimelike.py index 3402b9594e6dd..63ba9fdd59fc6 100644 --- a/pandas/core/arrays/datetimelike.py +++ b/pandas/core/arrays/datetimelike.py @@ -49,6 +49,10 @@ DtypeObj, NpDtype, PositionalIndexer2D, + PositionalIndexerTuple, + ScalarIndexer, + SequenceIndexer, + npt, ) from pandas.compat.numpy import function as nv from pandas.errors import ( @@ -274,7 +278,7 @@ def __iter__(self): return (self._box_func(v) for v in self.asi8) @property - def asi8(self) -> np.ndarray: + def asi8(self) -> npt.NDArray[np.int64]: """ Integer representation of the values. @@ -312,17 +316,33 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: return np.array(list(self), dtype=object) return self._ndarray + @overload + def __getitem__(self, item: ScalarIndexer) -> DTScalarOrNaT: + ... + + @overload + def __getitem__( + self: DatetimeLikeArrayT, + item: SequenceIndexer | PositionalIndexerTuple, + ) -> DatetimeLikeArrayT: + ... 
+ def __getitem__( - self, key: PositionalIndexer2D - ) -> DatetimeLikeArrayMixin | DTScalarOrNaT: + self: DatetimeLikeArrayT, key: PositionalIndexer2D + ) -> DatetimeLikeArrayT | DTScalarOrNaT: """ This getitem defers to the underlying array, which by-definition can only handle list-likes, slices, and integer scalars """ - result = super().__getitem__(key) + # Use cast as we know we will get back a DatetimeLikeArray or DTScalar + result = cast( + Union[DatetimeLikeArrayT, DTScalarOrNaT], super().__getitem__(key) + ) if lib.is_scalar(result): return result - + else: + # At this point we know the result is an array. + result = cast(DatetimeLikeArrayT, result) result._freq = self._get_getitem_freq(key) return result @@ -774,7 +794,7 @@ def map(self, mapper): return Index(self).map(mapper).array - def isin(self, values) -> np.ndarray: + def isin(self, values) -> npt.NDArray[np.bool_]: """ Compute boolean array of whether each value is found in the passed set of values. @@ -830,11 +850,11 @@ def isin(self, values) -> np.ndarray: # ------------------------------------------------------------------ # Null Handling - def isna(self) -> np.ndarray: + def isna(self) -> npt.NDArray[np.bool_]: return self._isnan @property # NB: override with cache_readonly in immutable subclasses - def _isnan(self) -> np.ndarray: + def _isnan(self) -> npt.NDArray[np.bool_]: """ return if each value is nan """ @@ -1535,7 +1555,7 @@ class DatelikeOps(DatetimeLikeArrayMixin): URL="https://docs.python.org/3/library/datetime.html" "#strftime-and-strptime-behavior" ) - def strftime(self, date_format: str) -> np.ndarray: + def strftime(self, date_format: str) -> npt.NDArray[np.object_]: """ Convert to Index using specified date_format. @@ -1551,7 +1571,7 @@ def strftime(self, date_format: str) -> np.ndarray: Returns ------- - ndarray + ndarray[object] NumPy ndarray of formatted strings. See Also @@ -1767,11 +1787,7 @@ def factorize(self, na_sentinel=-1, sort: bool = False): uniques = self.copy() # TODO: copy or view? 
if sort and self.freq.n < 0: codes = codes[::-1] - # TODO: overload __getitem__, a slice indexer returns same type as self - # error: Incompatible types in assignment (expression has type - # "Union[DatetimeLikeArrayMixin, Union[Any, Any]]", variable - # has type "TimelikeOps") - uniques = uniques[::-1] # type: ignore[assignment] + uniques = uniques[::-1] return codes, uniques # FIXME: shouldn't get here; we are ignoring sort return super().factorize(na_sentinel=na_sentinel) diff --git a/pandas/core/arrays/datetimes.py b/pandas/core/arrays/datetimes.py index 17461ad95866e..823103181bb82 100644 --- a/pandas/core/arrays/datetimes.py +++ b/pandas/core/arrays/datetimes.py @@ -9,7 +9,6 @@ from typing import ( TYPE_CHECKING, Literal, - cast, overload, ) import warnings @@ -38,6 +37,7 @@ to_offset, tzconversion, ) +from pandas._typing import npt from pandas.errors import PerformanceWarning from pandas.core.dtypes.cast import astype_dt64_to_dt64tz @@ -110,11 +110,13 @@ def tz_to_dtype(tz): return DatetimeTZDtype(tz=tz) -def _field_accessor(name, field, docstring=None): +def _field_accessor(name: str, field: str, docstring=None): def f(self): values = self._local_timestamps() if field in self._bool_ops: + result: np.ndarray + if field.endswith(("start", "end")): freq = self.freq month_kw = 12 @@ -475,11 +477,9 @@ def _generate_range( index = cls._simple_new(arr, freq=None, dtype=dtype) if not left_closed and len(index) and index[0] == start: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[1:]) + index = index[1:] if not right_closed and len(index) and index[-1] == end: - # TODO: overload DatetimeLikeArrayMixin.__getitem__ - index = cast(DatetimeArray, index[:-1]) + index = index[:-1] dtype = tz_to_dtype(tz) return cls._simple_new(index._ndarray, freq=freq, dtype=dtype) @@ -656,7 +656,7 @@ def astype(self, dtype, copy: bool = True): @dtl.ravel_compat def _format_native_types( self, na_rep="NaT", date_format=None, **kwargs - ) -> np.ndarray: + ) -> npt.NDArray[np.object_]: from pandas.io.formats.format import get_format_datetime64_from_values fmt = get_format_datetime64_from_values(self, date_format) @@ -1045,7 +1045,7 @@ def tz_localize(self, tz, ambiguous="raise", nonexistent="raise") -> DatetimeArr # ---------------------------------------------------------------- # Conversion Methods - Vectorized analogues of Timestamp methods - def to_pydatetime(self) -> np.ndarray: + def to_pydatetime(self) -> npt.NDArray[np.object_]: """ Return Datetime Array/Index as object ndarray of datetime.datetime objects. @@ -1262,7 +1262,7 @@ def day_name(self, locale=None): return result @property - def time(self) -> np.ndarray: + def time(self) -> npt.NDArray[np.object_]: """ Returns numpy array of datetime.time. The time part of the Timestamps. """ @@ -1274,7 +1274,7 @@ def time(self) -> np.ndarray: return ints_to_pydatetime(timestamps, box="time") @property - def timetz(self) -> np.ndarray: + def timetz(self) -> npt.NDArray[np.object_]: """ Returns numpy array of datetime.time also containing timezone information. The time part of the Timestamps. @@ -1282,7 +1282,7 @@ def timetz(self) -> np.ndarray: return ints_to_pydatetime(self.asi8, self.tz, box="time") @property - def date(self) -> np.ndarray: + def date(self) -> npt.NDArray[np.object_]: """ Returns numpy array of python datetime.date objects (namely, the date part of Timestamps without timezone information). 
diff --git a/pandas/core/arrays/floating.py b/pandas/core/arrays/floating.py index 1acbcf17dfffd..25b4076bd23c6 100644 --- a/pandas/core/arrays/floating.py +++ b/pandas/core/arrays/floating.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import overload import warnings import numpy as np @@ -10,7 +11,9 @@ ) from pandas._typing import ( ArrayLike, + AstypeArg, DtypeObj, + npt, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly @@ -31,6 +34,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.numeric import ( NumericArray, NumericDtype, @@ -271,7 +275,19 @@ def _from_sequence_of_strings( def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: return coerce_to_array(value, dtype=self.dtype) - def astype(self, dtype, copy: bool = True) -> ArrayLike: + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ Cast to a NumPy array or ExtensionArray with 'dtype'. diff --git a/pandas/core/arrays/integer.py b/pandas/core/arrays/integer.py index c9ba762a271bd..e62a2f95b0340 100644 --- a/pandas/core/arrays/integer.py +++ b/pandas/core/arrays/integer.py @@ -1,5 +1,6 @@ from __future__ import annotations +from typing import overload import warnings import numpy as np @@ -11,8 +12,10 @@ ) from pandas._typing import ( ArrayLike, + AstypeArg, Dtype, DtypeObj, + npt, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import cache_readonly @@ -33,6 +36,7 @@ ) from pandas.core.dtypes.missing import isna +from pandas.core.arrays import ExtensionArray from pandas.core.arrays.masked import ( BaseMaskedArray, BaseMaskedDtype, @@ -333,7 +337,19 @@ def _from_sequence_of_strings( def _coerce_to_array(self, value) -> tuple[np.ndarray, np.ndarray]: return coerce_to_array(value, dtype=self.dtype) - def astype(self, dtype, copy: bool = True) -> ArrayLike: + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... + + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: """ Cast to a NumPy array or ExtensionArray with 'dtype'. diff --git a/pandas/core/arrays/interval.py b/pandas/core/arrays/interval.py index 41998218acd7d..732bdb112b8c3 100644 --- a/pandas/core/arrays/interval.py +++ b/pandas/core/arrays/interval.py @@ -9,7 +9,9 @@ from typing import ( Sequence, TypeVar, + Union, cast, + overload, ) import numpy as np @@ -31,6 +33,9 @@ ArrayLike, Dtype, NpDtype, + PositionalIndexer, + ScalarIndexer, + SequenceIndexer, ) from pandas.compat.numpy import function as nv from pandas.util._decorators import Appender @@ -89,6 +94,7 @@ ) IntervalArrayT = TypeVar("IntervalArrayT", bound="IntervalArray") +IntervalOrNA = Union[Interval, float] _interval_shared_docs: dict[str, str] = {} @@ -635,7 +641,17 @@ def __iter__(self): def __len__(self) -> int: return len(self._left) - def __getitem__(self, key): + @overload + def __getitem__(self, key: ScalarIndexer) -> IntervalOrNA: + ... 
+ + @overload + def __getitem__(self: IntervalArrayT, key: SequenceIndexer) -> IntervalArrayT: + ... + + def __getitem__( + self: IntervalArrayT, key: PositionalIndexer + ) -> IntervalArrayT | IntervalOrNA: key = check_array_indexer(self, key) left = self._left[key] right = self._right[key] @@ -1633,10 +1649,11 @@ def _from_combined(self, combined: np.ndarray) -> IntervalArray: return self._shallow_copy(left=new_left, right=new_right) def unique(self) -> IntervalArray: - # Invalid index type "Tuple[slice, int]" for "Union[ExtensionArray, - # ndarray[Any, Any]]"; expected type "Union[int, integer[Any], slice, - # Sequence[int], ndarray[Any, Any]]" - nc = unique(self._combined.view("complex128")[:, 0]) # type: ignore[index] + # No overload variant of "__getitem__" of "ExtensionArray" matches argument + # type "Tuple[slice, int]" + nc = unique( + self._combined.view("complex128")[:, 0] # type: ignore[call-overload] + ) nc = nc[:, None] return self._from_combined(nc) diff --git a/pandas/core/arrays/masked.py b/pandas/core/arrays/masked.py index 1d78a74db98f0..877babe4f18e8 100644 --- a/pandas/core/arrays/masked.py +++ b/pandas/core/arrays/masked.py @@ -5,6 +5,7 @@ Any, Sequence, TypeVar, + overload, ) import numpy as np @@ -15,10 +16,13 @@ ) from pandas._typing import ( ArrayLike, - Dtype, + AstypeArg, NpDtype, PositionalIndexer, Scalar, + ScalarIndexer, + SequenceIndexer, + npt, type_t, ) from pandas.errors import AbstractMethodError @@ -123,6 +127,8 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): raise ValueError("values must be a 1D array") if mask.ndim != 1: raise ValueError("mask must be a 1D array") + if values.shape != mask.shape: + raise ValueError("values and mask must have same shape") if copy: values = values.copy() @@ -135,7 +141,17 @@ def __init__(self, values: np.ndarray, mask: np.ndarray, copy: bool = False): def dtype(self) -> BaseMaskedDtype: raise AbstractMethodError(self) - def __getitem__(self, item: PositionalIndexer) -> BaseMaskedArray | Any: + @overload + def __getitem__(self, item: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__(self: BaseMaskedArrayT, item: SequenceIndexer) -> BaseMaskedArrayT: + ... + + def __getitem__( + self: BaseMaskedArrayT, item: PositionalIndexer + ) -> BaseMaskedArrayT | Any: if is_integer(item): if self._mask[item]: return self.dtype.na_value @@ -280,9 +296,7 @@ def to_numpy( # type: ignore[override] if na_value is lib.no_default: na_value = libmissing.NA if dtype is None: - # error: Incompatible types in assignment (expression has type - # "Type[object]", variable has type "Union[str, dtype[Any], None]") - dtype = object # type: ignore[assignment] + dtype = object if self._hasna: if ( not is_object_dtype(dtype) @@ -301,7 +315,19 @@ def to_numpy( # type: ignore[override] data = self._data.astype(dtype, copy=copy) return data - def astype(self, dtype: Dtype, copy: bool = True) -> ArrayLike: + @overload + def astype(self, dtype: npt.DTypeLike, copy: bool = ...) -> np.ndarray: + ... + + @overload + def astype(self, dtype: ExtensionDtype, copy: bool = ...) -> ExtensionArray: + ... + + @overload + def astype(self, dtype: AstypeArg, copy: bool = ...) -> ArrayLike: + ... 
+ + def astype(self, dtype: AstypeArg, copy: bool = True) -> ArrayLike: dtype = pandas_dtype(dtype) if is_dtype_equal(dtype, self.dtype): diff --git a/pandas/core/arrays/period.py b/pandas/core/arrays/period.py index 471ee295ebd2f..84e611659b165 100644 --- a/pandas/core/arrays/period.py +++ b/pandas/core/arrays/period.py @@ -6,6 +6,7 @@ TYPE_CHECKING, Any, Callable, + Literal, Sequence, ) @@ -41,6 +42,7 @@ AnyArrayLike, Dtype, NpDtype, + npt, ) from pandas.util._decorators import ( cache_readonly, @@ -71,11 +73,19 @@ import pandas.core.algorithms as algos from pandas.core.arrays import datetimelike as dtl +from pandas.core.arrays.base import ExtensionArray import pandas.core.common as com if TYPE_CHECKING: + + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas.core.arrays import DatetimeArray + _shared_doc_kwargs = { "klass": "PeriodArray", } @@ -341,9 +351,7 @@ def freq(self) -> BaseOffset: def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: if dtype == "i8": return self.asi8 - # error: Non-overlapping equality check (left operand type: "Optional[Union[str, - # dtype[Any]]]", right operand type: "Type[bool]") - elif dtype == bool: # type: ignore[comparison-overlap] + elif dtype == bool: return ~self._isnan # This will raise TypeError for non-object dtypes @@ -644,12 +652,17 @@ def astype(self, dtype, copy: bool = True): return self.asfreq(dtype.freq) return super().astype(dtype, copy=copy) - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: - value = self._validate_searchsorted_value(value).view("M8[ns]") + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + npvalue = self._validate_searchsorted_value(value).view("M8[ns]") # Cast to M8 to get datetime-like NaT placement m8arr = self._ndarray.view("M8[ns]") - return m8arr.searchsorted(value, side=side, sorter=sorter) + return m8arr.searchsorted(npvalue, side=side, sorter=sorter) def fillna(self, value=None, method=None, limit=None) -> PeriodArray: if method is not None: diff --git a/pandas/core/arrays/sparse/accessor.py b/pandas/core/arrays/sparse/accessor.py index 8efdfb719bbfa..f3eccd6aad444 100644 --- a/pandas/core/arrays/sparse/accessor.py +++ b/pandas/core/arrays/sparse/accessor.py @@ -113,6 +113,8 @@ def to_coo(self, row_levels=(0,), column_levels=(1,), sort_labels=False): column_levels : tuple/list sort_labels : bool, default False Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. 
Returns ------- diff --git a/pandas/core/arrays/sparse/array.py b/pandas/core/arrays/sparse/array.py index 68c9e42ef8e08..77142ef450487 100644 --- a/pandas/core/arrays/sparse/array.py +++ b/pandas/core/arrays/sparse/array.py @@ -7,10 +7,14 @@ import numbers import operator from typing import ( + TYPE_CHECKING, Any, Callable, + Literal, Sequence, TypeVar, + cast, + overload, ) import warnings @@ -25,9 +29,15 @@ ) from pandas._libs.tslibs import NaT from pandas._typing import ( + ArrayLike, + AstypeArg, Dtype, NpDtype, + PositionalIndexer, Scalar, + ScalarIndexer, + SequenceIndexer, + npt, ) from pandas.compat.numpy import function as nv from pandas.errors import PerformanceWarning @@ -77,6 +87,21 @@ import pandas.io.formats.printing as printing +# See https://github.com/python/typing/issues/684 +if TYPE_CHECKING: + from enum import Enum + + class ellipsis(Enum): + Ellipsis = "..." + + Ellipsis = ellipsis.Ellipsis + + from pandas._typing import NumpySorter + +else: + ellipsis = type(Ellipsis) + + # ---------------------------------------------------------------------------- # Array @@ -519,9 +544,7 @@ def __array__(self, dtype: NpDtype | None = None) -> np.ndarray: try: dtype = np.result_type(self.sp_values.dtype, type(fill_value)) except TypeError: - # error: Incompatible types in assignment (expression has type - # "Type[object]", variable has type "Union[str, dtype[Any], None]") - dtype = object # type: ignore[assignment] + dtype = object out = np.full(self.shape, fill_value, dtype=dtype) out[self.sp_index.to_int_index().indices] = self.sp_values @@ -806,8 +829,21 @@ def value_counts(self, dropna: bool = True): # -------- # Indexing # -------- + @overload + def __getitem__(self, key: ScalarIndexer) -> Any: + ... + + @overload + def __getitem__( + self: SparseArrayT, + key: SequenceIndexer | tuple[int | ellipsis, ...], + ) -> SparseArrayT: + ... - def __getitem__(self, key): + def __getitem__( + self: SparseArrayT, + key: PositionalIndexer | tuple[int | ellipsis, ...], + ) -> SparseArrayT | Any: if isinstance(key, tuple): if len(key) > 1: @@ -817,6 +853,8 @@ def __getitem__(self, key): key = key[:-1] if len(key) > 1: raise IndexError("too many indices for array.") + if key[0] is Ellipsis: + raise ValueError("Cannot slice with Ellipsis") key = key[0] if is_integer(key): @@ -845,7 +883,8 @@ def __getitem__(self, key): key = check_array_indexer(self, key) if com.is_bool_indexer(key): - + # mypy doesn't know we have an array here + key = cast(np.ndarray, key) return self.take(np.arange(len(key), dtype=np.int32)[key]) elif hasattr(key, "__len__"): return self.take(key) @@ -992,7 +1031,13 @@ def _take_without_fill(self, indices) -> np.ndarray | SparseArray: return taken - def searchsorted(self, v, side="left", sorter=None): + def searchsorted( + self, + v: ArrayLike | object, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: + msg = "searchsorted requires high memory usage." warnings.warn(msg, PerformanceWarning, stacklevel=2) if not is_scalar(v): @@ -1058,7 +1103,7 @@ def _concat_same_type( return cls(data, sparse_index=sp_index, fill_value=fill_value) - def astype(self, dtype: Dtype | None = None, copy=True): + def astype(self, dtype: AstypeArg | None = None, copy=True): """ Change the dtype of a SparseArray. 
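`SparseArray.__getitem__` above needs `...` in its key type, and because there is no clean public annotation for the ellipsis type (see the typing issue referenced in that hunk), the patch declares a stand-in `ellipsis` Enum under `TYPE_CHECKING` and falls back to `type(Ellipsis)` at runtime. A condensed, standalone version of that workaround; the helper function is hypothetical:

```python
from __future__ import annotations

from typing import TYPE_CHECKING, Tuple, Union

if TYPE_CHECKING:
    from enum import Enum

    class ellipsis(Enum):          # lets "..." appear in annotations for the checker
        Ellipsis = "..."

    Ellipsis = ellipsis.Ellipsis
else:
    ellipsis = type(Ellipsis)      # at runtime this is the real ellipsis type


def first_axis_key(
    key: Union[int, ellipsis, Tuple[Union[int, ellipsis], ...]]
) -> object:
    # Hypothetical helper: accepts an int, a bare ..., or a tuple of either.
    if isinstance(key, tuple):
        return key[0]
    return key


print(first_axis_key((Ellipsis, 3)))   # prints: Ellipsis
```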
diff --git a/pandas/core/arrays/sparse/scipy_sparse.py b/pandas/core/arrays/sparse/scipy_sparse.py index f399d3230d897..3f69321ae98a6 100644 --- a/pandas/core/arrays/sparse/scipy_sparse.py +++ b/pandas/core/arrays/sparse/scipy_sparse.py @@ -3,14 +3,32 @@ Currently only includes to_coo helpers. """ -from pandas.core.indexes.api import ( - Index, - MultiIndex, +from __future__ import annotations + +from typing import ( + TYPE_CHECKING, + Iterable, +) + +import numpy as np + +from pandas._libs import lib +from pandas._typing import ( + IndexLabel, + npt, ) + +from pandas.core.dtypes.missing import notna + +from pandas.core.algorithms import factorize +from pandas.core.indexes.api import MultiIndex from pandas.core.series import Series +if TYPE_CHECKING: + import scipy.sparse + -def _check_is_partition(parts, whole): +def _check_is_partition(parts: Iterable, whole: Iterable): whole = set(whole) parts = [set(x) for x in parts] if set.intersection(*parts) != set(): @@ -19,76 +37,115 @@ def _check_is_partition(parts, whole): raise ValueError("Is not a partition because union is not the whole.") -def _to_ijv(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): - """ - For arbitrary (MultiIndexed) sparse Series return - (v, i, j, ilabels, jlabels) where (v, (i, j)) is suitable for - passing to scipy.sparse.coo constructor. +def _levels_to_axis( + ss, + levels: tuple[int] | list[int], + valid_ilocs: npt.NDArray[np.intp], + sort_labels: bool = False, +) -> tuple[npt.NDArray[np.intp], list[IndexLabel]]: """ - # index and column levels must be a partition of the index - _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) + For a MultiIndexed sparse Series `ss`, return `ax_coords` and `ax_labels`, + where `ax_coords` are the coordinates along one of the two axes of the + destination sparse matrix, and `ax_labels` are the labels from `ss`' Index + which correspond to these coordinates. + + Parameters + ---------- + ss : Series + levels : tuple/list + valid_ilocs : numpy.ndarray + Array of integer positions of valid values for the sparse matrix in ss. + sort_labels : bool, default False + Sort the axis labels before forming the sparse matrix. When `levels` + refers to a single level, set to True for a faster execution. - # from the sparse Series: get the labels and data for non-null entries - values = ss.array._valid_sp_values - - nonnull_labels = ss.dropna() - - def get_indexers(levels): - """Return sparse coords and dense labels for subset levels""" - # TODO: how to do this better? cleanly slice nonnull_labels given the - # coord - values_ilabels = [tuple(x[i] for i in levels) for x in nonnull_labels.index] - if len(levels) == 1: - values_ilabels = [x[0] for x in values_ilabels] - - # # performance issues with groupby ################################### - # TODO: these two lines can replace the code below but - # groupby is too slow (in some cases at least) - # labels_to_i = ss.groupby(level=levels, sort=sort_labels).first() - # labels_to_i[:] = np.arange(labels_to_i.shape[0]) - - def _get_label_to_i_dict(labels, sort_labels=False): - """ - Return dict of unique labels to number. - Optionally sort by label. 
- """ - labels = Index(map(tuple, labels)).unique().tolist() # squish - if sort_labels: - labels = sorted(labels) - return {k: i for i, k in enumerate(labels)} - - def _get_index_subset_to_coord_dict(index, subset, sort_labels=False): - ilabels = list(zip(*(index._get_level_values(i) for i in subset))) - labels_to_i = _get_label_to_i_dict(ilabels, sort_labels=sort_labels) - labels_to_i = Series(labels_to_i) - if len(subset) > 1: - labels_to_i.index = MultiIndex.from_tuples(labels_to_i.index) - labels_to_i.index.names = [index.names[i] for i in subset] - else: - labels_to_i.index = Index(x[0] for x in labels_to_i.index) - labels_to_i.index.name = index.names[subset[0]] - - labels_to_i.name = "value" - return labels_to_i - - labels_to_i = _get_index_subset_to_coord_dict( - ss.index, levels, sort_labels=sort_labels + Returns + ------- + ax_coords : numpy.ndarray (axis coordinates) + ax_labels : list (axis labels) + """ + # Since the labels are sorted in `Index.levels`, when we wish to sort and + # there is only one level of the MultiIndex for this axis, the desired + # output can be obtained in the following simpler, more efficient way. + if sort_labels and len(levels) == 1: + ax_coords = ss.index.codes[levels[0]][valid_ilocs] + ax_labels = ss.index.levels[levels[0]] + + else: + levels_values = lib.fast_zip( + [ss.index.get_level_values(lvl).values for lvl in levels] ) - # ##################################################################### - # ##################################################################### + codes, ax_labels = factorize(levels_values, sort=sort_labels) + ax_coords = codes[valid_ilocs] + + ax_labels = ax_labels.tolist() + return ax_coords, ax_labels + + +def _to_ijv( + ss, + row_levels: tuple[int] | list[int] = (0,), + column_levels: tuple[int] | list[int] = (1,), + sort_labels: bool = False, +) -> tuple[ + np.ndarray, + npt.NDArray[np.intp], + npt.NDArray[np.intp], + list[IndexLabel], + list[IndexLabel], +]: + """ + For an arbitrary MultiIndexed sparse Series return (v, i, j, ilabels, + jlabels) where (v, (i, j)) is suitable for passing to scipy.sparse.coo + constructor, and ilabels and jlabels are the row and column labels + respectively. - i_coord = labels_to_i[values_ilabels].tolist() - i_labels = labels_to_i.index.tolist() + Parameters + ---------- + ss : Series + row_levels : tuple/list + column_levels : tuple/list + sort_labels : bool, default False + Sort the row and column labels before forming the sparse matrix. + When `row_levels` and/or `column_levels` refer to a single level, + set to `True` for a faster execution. - return i_coord, i_labels + Returns + ------- + values : numpy.ndarray + Valid values to populate a sparse matrix, extracted from + ss. + i_coords : numpy.ndarray (row coordinates of the values) + j_coords : numpy.ndarray (column coordinates of the values) + i_labels : list (row labels) + j_labels : list (column labels) + """ + # index and column levels must be a partition of the index + _check_is_partition([row_levels, column_levels], range(ss.index.nlevels)) + # From the sparse Series, get the integer indices and data for valid sparse + # entries. 
+ sp_vals = ss.array.sp_values + na_mask = notna(sp_vals) + values = sp_vals[na_mask] + valid_ilocs = ss.array.sp_index.indices[na_mask] + + i_coords, i_labels = _levels_to_axis( + ss, row_levels, valid_ilocs, sort_labels=sort_labels + ) - i_coord, i_labels = get_indexers(row_levels) - j_coord, j_labels = get_indexers(column_levels) + j_coords, j_labels = _levels_to_axis( + ss, column_levels, valid_ilocs, sort_labels=sort_labels + ) - return values, i_coord, j_coord, i_labels, j_labels + return values, i_coords, j_coords, i_labels, j_labels -def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=False): +def sparse_series_to_coo( + ss: Series, + row_levels: Iterable[int] = (0,), + column_levels: Iterable[int] = (1,), + sort_labels: bool = False, +) -> tuple[scipy.sparse.coo_matrix, list[IndexLabel], list[IndexLabel]]: """ Convert a sparse Series to a scipy.sparse.coo_matrix using index levels row_levels, column_levels as the row and column @@ -97,7 +154,7 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa import scipy.sparse if ss.index.nlevels < 2: - raise ValueError("to_coo requires MultiIndex with nlevels > 2") + raise ValueError("to_coo requires MultiIndex with nlevels >= 2.") if not ss.index.is_unique: raise ValueError( "Duplicate index entries are not allowed in to_coo transformation." @@ -116,7 +173,9 @@ def sparse_series_to_coo(ss, row_levels=(0,), column_levels=(1,), sort_labels=Fa return sparse_matrix, rows, columns -def coo_to_sparse_series(A, dense_index: bool = False): +def coo_to_sparse_series( + A: scipy.sparse.coo_matrix, dense_index: bool = False +) -> Series: """ Convert a scipy.sparse.coo_matrix to a SparseSeries. diff --git a/pandas/core/arrays/string_arrow.py b/pandas/core/arrays/string_arrow.py index ab8599f0f05ba..4be7f4eb0c521 100644 --- a/pandas/core/arrays/string_arrow.py +++ b/pandas/core/arrays/string_arrow.py @@ -6,17 +6,24 @@ TYPE_CHECKING, Any, Sequence, + Union, cast, + overload, ) import numpy as np -from pandas._libs import lib +from pandas._libs import ( + lib, + missing as libmissing, +) from pandas._typing import ( Dtype, NpDtype, PositionalIndexer, Scalar, + ScalarIndexer, + SequenceIndexer, ) from pandas.compat import ( pa_version_under1p0, @@ -77,6 +84,8 @@ if TYPE_CHECKING: from pandas import Series +ArrowStringScalarOrNAT = Union[str, libmissing.NAType] + def _chk_pyarrow_available() -> None: if pa_version_under1p0: @@ -260,7 +269,17 @@ def _concat_same_type(cls, to_concat) -> ArrowStringArray: ) ) - def __getitem__(self, item: PositionalIndexer) -> Any: + @overload + def __getitem__(self, item: ScalarIndexer) -> ArrowStringScalarOrNAT: + ... + + @overload + def __getitem__(self: ArrowStringArray, item: SequenceIndexer) -> ArrowStringArray: + ... + + def __getitem__( + self: ArrowStringArray, item: PositionalIndexer + ) -> ArrowStringArray | ArrowStringScalarOrNAT: """Select a subset of self. 
Parameters diff --git a/pandas/core/base.py b/pandas/core/base.py index 57e015dc378c8..c7a707fd5cd6e 100644 --- a/pandas/core/base.py +++ b/pandas/core/base.py @@ -66,8 +66,14 @@ if TYPE_CHECKING: + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas import Categorical + _shared_docs: dict[str, str] = {} _indexops_doc_kwargs = { "klass": "IndexOpsMixin", @@ -1222,7 +1228,12 @@ def factorize(self, sort: bool = False, na_sentinel: int | None = -1): """ @doc(_shared_docs["searchsorted"], klass="Index") - def searchsorted(self, value, side="left", sorter=None) -> npt.NDArray[np.intp]: + def searchsorted( + self, + value: NumpyValueArrayLike, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) def drop_duplicates(self, keep="first"): diff --git a/pandas/core/common.py b/pandas/core/common.py index b32614577393d..2bf925466e176 100644 --- a/pandas/core/common.py +++ b/pandas/core/common.py @@ -232,12 +232,7 @@ def asarray_tuplesafe(values, dtype: NpDtype | None = None) -> np.ndarray: # expected "ndarray") return values._values # type: ignore[return-value] - # error: Non-overlapping container check (element type: "Union[str, dtype[Any], - # None]", container item type: "type") - if isinstance(values, list) and dtype in [ # type: ignore[comparison-overlap] - np.object_, - object, - ]: + if isinstance(values, list) and dtype in [np.object_, object]: return construct_1d_object_array_from_listlike(values) result = np.asarray(values, dtype=dtype) diff --git a/pandas/core/computation/pytables.py b/pandas/core/computation/pytables.py index ad76a76a954b1..3e041c088f566 100644 --- a/pandas/core/computation/pytables.py +++ b/pandas/core/computation/pytables.py @@ -11,6 +11,7 @@ Timedelta, Timestamp, ) +from pandas._typing import npt from pandas.compat.chainmap import DeepChainMap from pandas.core.dtypes.common import is_list_like @@ -223,6 +224,7 @@ def stringify(value): return TermValue(int(v), v, kind) elif meta == "category": metadata = extract_array(self.metadata, extract_numpy=True) + result: npt.NDArray[np.intp] | np.intp | int if v not in metadata: result = -1 else: diff --git a/pandas/core/config_init.py b/pandas/core/config_init.py index 27b898782fbef..09da9f04f8360 100644 --- a/pandas/core/config_init.py +++ b/pandas/core/config_init.py @@ -10,6 +10,7 @@ """ import os +from typing import Callable import warnings import pandas._config.config as cf @@ -20,6 +21,7 @@ is_int, is_nonnegative_int, is_one_of_factory, + is_str, is_text, ) @@ -756,17 +758,86 @@ def register_converter_cb(key): display each explicit level element in a hierarchical key for each column. """ +styler_render_repr = """ +: str + Determine which output to use in Jupyter Notebook in {"html", "latex"}. +""" + styler_max_elements = """ : int The maximum number of data-cell () elements that will be rendered before trimming will occur over columns, rows or both if needed. """ +styler_precision = """ +: int + The precision for floats and complex numbers. +""" + +styler_decimal = """ +: str + The character representation for the decimal separator for floats and complex. +""" + +styler_thousands = """ +: str, optional + The character representation for thousands separator for floats, int and complex. +""" + +styler_na_rep = """ +: str, optional + The string representation for values identified as missing. 
+""" + +styler_escape = """ +: str, optional + Whether to escape certain characters according to the given context; html or latex. +""" + +styler_formatter = """ +: str, callable, dict, optional + A formatter object to be used as default within ``Styler.format``. +""" + +styler_multirow_align = """ +: {"c", "t", "b"} + The specifier for vertical alignment of sparsified LaTeX multirows. +""" + +styler_multicol_align = """ +: {"r", "c", "l"} + The specifier for horizontal alignment of sparsified LaTeX multicolumns. +""" + +styler_environment = """ +: str + The environment to replace ``\\begin{table}``. If "longtable" is used results + in a specific longtable environment format. +""" + +styler_encoding = """ +: str + The encoding used for output HTML and LaTeX files. +""" + +styler_mathjax = """ +: bool + If False will render special CSS classes to table attributes that indicate Mathjax + will not be used in Jupyter Notebook. +""" + with cf.config_prefix("styler"): - cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=bool) + cf.register_option("sparse.index", True, styler_sparse_index_doc, validator=is_bool) + + cf.register_option( + "sparse.columns", True, styler_sparse_columns_doc, validator=is_bool + ) cf.register_option( - "sparse.columns", True, styler_sparse_columns_doc, validator=bool + "render.repr", + "html", + styler_render_repr, + validator=is_one_of_factory(["html", "latex"]), ) cf.register_option( @@ -775,3 +846,62 @@ def register_converter_cb(key): styler_max_elements, validator=is_nonnegative_int, ) + + cf.register_option("render.encoding", "utf-8", styler_encoding, validator=is_str) + + cf.register_option("format.decimal", ".", styler_decimal, validator=is_str) + + cf.register_option( + "format.precision", 6, styler_precision, validator=is_nonnegative_int + ) + + cf.register_option( + "format.thousands", + None, + styler_thousands, + validator=is_instance_factory([type(None), str]), + ) + + cf.register_option( + "format.na_rep", + None, + styler_na_rep, + validator=is_instance_factory([type(None), str]), + ) + + cf.register_option( + "format.escape", + None, + styler_escape, + validator=is_one_of_factory([None, "html", "latex"]), + ) + + cf.register_option( + "format.formatter", + None, + styler_formatter, + validator=is_instance_factory([type(None), dict, Callable, str]), + ) + + cf.register_option("html.mathjax", True, styler_mathjax, validator=is_bool) + + cf.register_option( + "latex.multirow_align", + "c", + styler_multirow_align, + validator=is_one_of_factory(["c", "t", "b", "naive"]), + ) + + cf.register_option( + "latex.multicol_align", + "r", + styler_multicol_align, + validator=is_one_of_factory(["r", "c", "l", "naive-l", "naive-r"]), + ) + + cf.register_option( + "latex.environment", + None, + styler_environment, + validator=is_instance_factory([type(None), str]), + ) diff --git a/pandas/core/dtypes/cast.py b/pandas/core/dtypes/cast.py index 6f10a490c7147..d6a8790afd998 100644 --- a/pandas/core/dtypes/cast.py +++ b/pandas/core/dtypes/cast.py @@ -2185,6 +2185,11 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: # ExtensionBlock._can_hold_element return True + # error: Non-overlapping equality check (left operand type: "dtype[Any]", right + # operand type: "Type[object]") + if dtype == object: # type: ignore[comparison-overlap] + return True + tipo = maybe_infer_dtype_type(element) if dtype.kind in ["i", "u"]: @@ -2232,11 +2237,6 @@ def can_hold_element(arr: ArrayLike, element: Any) -> bool: return tipo.kind == "b" return 
lib.is_bool(element) - # error: Non-overlapping equality check (left operand type: "dtype[Any]", right - # operand type: "Type[object]") - elif dtype == object: # type: ignore[comparison-overlap] - return True - elif dtype.kind == "S": # TODO: test tests.frame.methods.test_replace tests get here, # need more targeted tests. xref phofl has a PR about this diff --git a/pandas/core/dtypes/common.py b/pandas/core/dtypes/common.py index 393fe08f7277c..a9c2b31849425 100644 --- a/pandas/core/dtypes/common.py +++ b/pandas/core/dtypes/common.py @@ -97,6 +97,8 @@ def ensure_float(arr): ensure_int32 = algos.ensure_int32 ensure_int16 = algos.ensure_int16 ensure_int8 = algos.ensure_int8 +ensure_complex64 = algos.ensure_complex64 +ensure_complex128 = algos.ensure_complex128 ensure_platform_int = algos.ensure_platform_int ensure_object = algos.ensure_object diff --git a/pandas/core/dtypes/dtypes.py b/pandas/core/dtypes/dtypes.py index a214371bdf26e..6be2e803b5910 100644 --- a/pandas/core/dtypes/dtypes.py +++ b/pandas/core/dtypes/dtypes.py @@ -29,8 +29,8 @@ from pandas._typing import ( Dtype, DtypeObj, - NpDtype, Ordered, + npt, type_t, ) @@ -936,7 +936,18 @@ def __eq__(self, other: Any) -> bool: if isinstance(other, str): return other in [self.name, self.name.title()] - return isinstance(other, PeriodDtype) and self.freq == other.freq + elif isinstance(other, PeriodDtype): + + # For freqs that can be held by a PeriodDtype, this check is + # equivalent to (and much faster than) self.freq == other.freq + sfreq = self.freq + ofreq = other.freq + return ( + sfreq.n == ofreq.n + and sfreq._period_dtype_code == ofreq._period_dtype_code + ) + + return False def __ne__(self, other: Any) -> bool: return not self.__eq__(other) @@ -1283,7 +1294,7 @@ class PandasDtype(ExtensionDtype): _metadata = ("_dtype",) - def __init__(self, dtype: NpDtype | PandasDtype | None): + def __init__(self, dtype: npt.DTypeLike | PandasDtype | None): if isinstance(dtype, PandasDtype): # make constructor univalent dtype = dtype.numpy_dtype diff --git a/pandas/core/dtypes/inference.py b/pandas/core/dtypes/inference.py index 1360b66e77dc0..ae961e53d8b79 100644 --- a/pandas/core/dtypes/inference.py +++ b/pandas/core/dtypes/inference.py @@ -315,7 +315,7 @@ def is_named_tuple(obj) -> bool: >>> is_named_tuple((1, 2)) False """ - return isinstance(obj, tuple) and hasattr(obj, "_fields") + return isinstance(obj, abc.Sequence) and hasattr(obj, "_fields") def is_hashable(obj) -> bool: diff --git a/pandas/core/frame.py b/pandas/core/frame.py index e02a88aafcf34..304de0a00cded 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -469,7 +469,8 @@ class DataFrame(NDFrame, OpsMixin): ---------- data : ndarray (structured or homogeneous), Iterable, dict, or DataFrame Dict can contain Series, arrays, constants, dataclass or list-like objects. If - data is a dict, column order follows insertion-order. + data is a dict, column order follows insertion-order. If a dict contains Series + which have an index defined, it is aligned by its index. .. versionchanged:: 0.25.0 If data is a list of dicts, column order follows insertion-order. 
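Side note on the new ``styler.*`` options registered in ``config_init.py`` above: per their option docstrings they are intended as global defaults for ``Styler`` formatting, so they can be set once instead of calling ``Styler.format`` on every object. A hedged sketch of that usage (option names are taken from the diff; rendering requires jinja2):

```python
import pandas as pd

# Register global formatting defaults via the options added above.
pd.set_option("styler.format.precision", 4)
pd.set_option("styler.format.thousands", ",")
pd.set_option("styler.format.na_rep", "--")

df = pd.DataFrame({"x": [1234.56789, None]})
html = df.style.to_html()  # expected to pick up the defaults set above
```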
@@ -524,6 +525,16 @@ class DataFrame(NDFrame, OpsMixin): col2 int8 dtype: object + Constructing DataFrame from a dictionary including Series: + + >>> d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])} + >>> pd.DataFrame(data=d, index=[0, 1, 2, 3]) + col1 col2 + 0 0 NaN + 1 1 NaN + 2 2 2.0 + 3 3 3.0 + Constructing DataFrame from numpy ndarray: >>> df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]), @@ -1653,6 +1664,8 @@ def to_numpy( [2, 4.5, Timestamp('2000-01-02 00:00:00')]], dtype=object) """ self._consolidate_inplace() + if dtype is not None: + dtype = np.dtype(dtype) result = self._mgr.as_array( transpose=self._AXIS_REVERSED, dtype=dtype, copy=copy, na_value=na_value ) @@ -3620,9 +3633,11 @@ def __setitem__(self, key, value): self._setitem_array(key, value) elif isinstance(value, DataFrame): self._set_item_frame_value(key, value) - elif is_list_like(value) and 1 < len( - self.columns.get_indexer_for([key]) - ) == len(value): + elif ( + is_list_like(value) + and not self.columns.is_unique + and 1 < len(self.columns.get_indexer_for([key])) == len(value) + ): # Column to set is duplicated self._setitem_array([key], value) else: @@ -8962,6 +8977,7 @@ def append( 3 3 4 4 """ + combined_columns = None if isinstance(other, (Series, dict)): if isinstance(other, dict): if not ignore_index: @@ -8976,21 +8992,15 @@ def append( index = Index([other.name], name=self.index.name) idx_diff = other.index.difference(self.columns) combined_columns = self.columns.append(idx_diff) - other = ( - other.reindex(combined_columns, copy=False) - .to_frame() - .T.infer_objects() - .rename_axis(index.names, copy=False) - ) - if not self.columns.equals(combined_columns): - self = self.reindex(columns=combined_columns) + row_df = other.to_frame().T + # infer_objects is needed for + # test_append_empty_frame_to_series_with_dateutil_tz + other = row_df.infer_objects().rename_axis(index.names, copy=False) elif isinstance(other, list): if not other: pass elif not isinstance(other[0], DataFrame): other = DataFrame(other) - if (self.columns.get_indexer(other.columns) >= 0).all(): - other = other.reindex(columns=self.columns) from pandas.core.reshape.concat import concat @@ -8998,14 +9008,24 @@ def append( to_concat = [self, *other] else: to_concat = [self, other] - return ( - concat( - to_concat, - ignore_index=ignore_index, - verify_integrity=verify_integrity, - sort=sort, - ) - ).__finalize__(self, method="append") + + result = concat( + to_concat, + ignore_index=ignore_index, + verify_integrity=verify_integrity, + sort=sort, + ) + if ( + combined_columns is not None + and not sort + and not combined_columns.equals(result.columns) + ): + # TODO: reindexing here is a kludge bc union_indexes does not + # pass sort to index.union, xref #43375 + # combined_columns.equals check is necessary for preserving dtype + # in test_crosstab_normalize + result = result.reindex(combined_columns, axis=1) + return result.__finalize__(self, method="append") def join( self, @@ -9381,7 +9401,8 @@ def corr( regardless of the callable's behavior. min_periods : int, optional Minimum number of observations required per pair of columns - to have a valid result. + to have a valid result. Currently only available for Pearson + and Spearman correlation. 
Returns ------- @@ -9415,9 +9436,7 @@ def corr( correl = libalgos.nancorr(mat, minp=min_periods) elif method == "spearman": correl = libalgos.nancorr_spearman(mat, minp=min_periods) - elif method == "kendall": - correl = libalgos.nancorr_kendall(mat, minp=min_periods) - elif callable(method): + elif method == "kendall" or callable(method): if min_periods is None: min_periods = 1 mat = mat.T @@ -9824,26 +9843,28 @@ def _reduce( assert filter_type is None or filter_type == "bool", filter_type out_dtype = "bool" if filter_type == "bool" else None - own_dtypes = [arr.dtype for arr in self._iter_column_arrays()] + if numeric_only is None and name in ["mean", "median"]: + own_dtypes = [arr.dtype for arr in self._mgr.arrays] - dtype_is_dt = np.array( - [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], - dtype=bool, - ) - if numeric_only is None and name in ["mean", "median"] and dtype_is_dt.any(): - warnings.warn( - "DataFrame.mean and DataFrame.median with numeric_only=None " - "will include datetime64 and datetime64tz columns in a " - "future version.", - FutureWarning, - stacklevel=5, + dtype_is_dt = np.array( + [is_datetime64_any_dtype(dtype) for dtype in own_dtypes], + dtype=bool, ) - # Non-copy equivalent to - # cols = self.columns[~dtype_is_dt] - # self = self[cols] - predicate = lambda x: not is_datetime64_any_dtype(x.dtype) - mgr = self._mgr._get_data_subset(predicate) - self = type(self)(mgr) + if dtype_is_dt.any(): + warnings.warn( + "DataFrame.mean and DataFrame.median with numeric_only=None " + "will include datetime64 and datetime64tz columns in a " + "future version.", + FutureWarning, + stacklevel=5, + ) + # Non-copy equivalent to + # dt64_cols = self.dtypes.apply(is_datetime64_any_dtype) + # cols = self.columns[~dt64_cols] + # self = self[cols] + predicate = lambda x: not is_datetime64_any_dtype(x.dtype) + mgr = self._mgr._get_data_subset(predicate) + self = type(self)(mgr) # TODO: Make other agg func handle axis=None properly GH#21597 axis = self._get_axis_number(axis) diff --git a/pandas/core/generic.py b/pandas/core/generic.py index 1f51576cc6e90..48daf7c89fe64 100644 --- a/pandas/core/generic.py +++ b/pandas/core/generic.py @@ -8251,8 +8251,7 @@ def last(self: FrameOrSeries, offset) -> FrameOrSeries: start_date = self.index[-1] - offset start = self.index.searchsorted(start_date, side="right") - # error: Slice index must be an integer or None - return self.iloc[start:] # type: ignore[misc] + return self.iloc[start:] @final def rank( diff --git a/pandas/core/groupby/generic.py b/pandas/core/groupby/generic.py index 35cb247e96bc3..7af32d70c00bc 100644 --- a/pandas/core/groupby/generic.py +++ b/pandas/core/groupby/generic.py @@ -354,35 +354,17 @@ def array_func(values: ArrayLike) -> ArrayLike: ) return self._reindex_output(ser) - def _wrap_aggregated_output( - self, - output: Mapping[base.OutputKey, Series | ArrayLike], + def _indexed_output_to_ndframe( + self, output: Mapping[base.OutputKey, ArrayLike] ) -> Series: """ - Wraps the output of a SeriesGroupBy aggregation into the expected result. - - Parameters - ---------- - output : Mapping[base.OutputKey, Union[Series, ArrayLike]] - Data to wrap. - - Returns - ------- - Series - - Notes - ----- - In the vast majority of cases output will only contain one element. - The exception is operations that expand dimensions, like ohlc. + Wrap the dict result of a GroupBy aggregation into a Series. 
""" assert len(output) == 1 - - name = self.obj.name - index = self.grouper.result_index values = next(iter(output.values())) - - result = self.obj._constructor(values, index=index, name=name) - return self._reindex_output(result) + result = self.obj._constructor(values) + result.name = self.obj.name + return result def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Series | ArrayLike] @@ -449,16 +431,9 @@ def _wrap_applied_output( ) assert values is not None - def _get_index() -> Index: - if self.grouper.nkeys > 1: - index = MultiIndex.from_tuples(keys, names=self.grouper.names) - else: - index = Index._with_infer(keys, name=self.grouper.names[0]) - return index - if isinstance(values[0], dict): # GH #823 #24880 - index = _get_index() + index = self._group_keys_index res_df = self.obj._constructor_expanddim(values, index=index) res_df = self._reindex_output(res_df) # if self.observed is False, @@ -471,7 +446,7 @@ def _get_index() -> Index: else: # GH #6265 #24880 result = self.obj._constructor( - data=values, index=_get_index(), name=self.obj.name + data=values, index=self._group_keys_index, name=self.obj.name ) return self._reindex_output(result) @@ -1614,46 +1589,19 @@ def _insert_inaxis_grouper_inplace(self, result: DataFrame) -> None: if in_axis and name not in columns: result.insert(0, name, lev) - def _wrap_aggregated_output( - self, - output: Mapping[base.OutputKey, Series | ArrayLike], + def _indexed_output_to_ndframe( + self, output: Mapping[base.OutputKey, ArrayLike] ) -> DataFrame: """ - Wraps the output of DataFrameGroupBy aggregations into the expected result. - - Parameters - ---------- - output : Mapping[base.OutputKey, Union[Series, np.ndarray]] - Data to wrap. - - Returns - ------- - DataFrame + Wrap the dict result of a GroupBy aggregation into a DataFrame. """ - if isinstance(output, DataFrame): - result = output - else: - indexed_output = {key.position: val for key, val in output.items()} - columns = Index([key.label for key in output]) - columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) - - result = self.obj._constructor(indexed_output) - result.columns = columns - - if not self.as_index: - self._insert_inaxis_grouper_inplace(result) - result = result._consolidate() - else: - result.index = self.grouper.result_index - - if self.axis == 1: - result = result.T - if result.index.equals(self.obj.index): - # Retain e.g. DatetimeIndex/TimedeltaIndex freq - result.index = self.obj.index.copy() - # TODO: Do this more systematically + indexed_output = {key.position: val for key, val in output.items()} + columns = Index([key.label for key in output]) + columns._set_names(self._obj_with_exclusions._get_axis(1 - self.axis).names) - return self._reindex_output(result) + result = self.obj._constructor(indexed_output) + result.columns = columns + return result def _wrap_transformed_output( self, output: Mapping[base.OutputKey, Series | ArrayLike] diff --git a/pandas/core/groupby/groupby.py b/pandas/core/groupby/groupby.py index 8ce4c2cf0a4f4..8022d967a90d3 100644 --- a/pandas/core/groupby/groupby.py +++ b/pandas/core/groupby/groupby.py @@ -63,6 +63,7 @@ class providing the base-class of operations. 
from pandas.core.dtypes.common import ( is_bool_dtype, is_datetime64_dtype, + is_float_dtype, is_integer_dtype, is_numeric_dtype, is_object_dtype, @@ -1010,7 +1011,11 @@ def reset_identity(values): if not not_indexed_same: result = concat(values, axis=self.axis) - ax = self.filter(lambda x: True).axes[self.axis] + ax = ( + self.filter(lambda x: True).axes[self.axis] + if self.dropna + else self._selected_obj._get_axis(self.axis) + ) # this is a very unfortunate situation # we can't use reindex to restore the original order @@ -1090,9 +1095,54 @@ def _set_result_index_ordered( return result - def _wrap_aggregated_output(self, output: Mapping[base.OutputKey, ArrayLike]): + def _indexed_output_to_ndframe( + self, result: Mapping[base.OutputKey, ArrayLike] + ) -> Series | DataFrame: raise AbstractMethodError(self) + def _wrap_aggregated_output( + self, output: Series | DataFrame | Mapping[base.OutputKey, ArrayLike] + ): + """ + Wraps the output of GroupBy aggregations into the expected result. + + Parameters + ---------- + output : Series, DataFrame, or Mapping[base.OutputKey, ArrayLike] + Data to wrap. + + Returns + ------- + Series or DataFrame + """ + + if isinstance(output, (Series, DataFrame)): + # We get here (for DataFrameGroupBy) if we used Manager.grouped_reduce, + # in which case our columns are already set correctly. + # ATM we do not get here for SeriesGroupBy; when we do, we will + # need to require that result.name already match self.obj.name + result = output + else: + result = self._indexed_output_to_ndframe(output) + + if not self.as_index: + # `not self.as_index` is only relevant for DataFrameGroupBy, + # enforced in __init__ + self._insert_inaxis_grouper_inplace(result) + result = result._consolidate() + else: + result.index = self.grouper.result_index + + if self.axis == 1: + # Only relevant for DataFrameGroupBy, no-op for SeriesGroupBy + result = result.T + if result.index.equals(self.obj.index): + # Retain e.g. DatetimeIndex/TimedeltaIndex freq + result.index = self.obj.index.copy() + # TODO: Do this more systematically + + return self._reindex_output(result) + def _wrap_transformed_output(self, output: Mapping[base.OutputKey, ArrayLike]): raise AbstractMethodError(self) @@ -1126,6 +1176,18 @@ def _resolve_numeric_only(self, numeric_only: bool | lib.NoDefault) -> bool: numeric_only = False return numeric_only + @cache_readonly + def _group_keys_index(self) -> Index: + # The index to use for the result of Groupby Aggregations. + # This _may_ be redundant with self.grouper.result_index, but that + # has not been conclusively proven yet. + keys = self.grouper._get_group_keys() + if self.grouper.nkeys > 1: + index = MultiIndex.from_tuples(keys, names=self.grouper.names) + else: + index = Index._with_infer(keys, name=self.grouper.names[0]) + return index + # ----------------------------------------------------------------- # numba @@ -1193,7 +1255,7 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) data and indices into a Numba jitted function. 
""" starts, ends, sorted_index, sorted_data = self._numba_prep(func, data) - group_keys = self.grouper._get_group_keys() + index = self._group_keys_index numba_agg_func = numba_.generate_numba_agg_func(kwargs, func, engine_kwargs) result = numba_agg_func( @@ -1201,7 +1263,7 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) sorted_index, starts, ends, - len(group_keys), + len(index), len(data.columns), *args, ) @@ -1210,10 +1272,6 @@ def _aggregate_with_numba(self, data, func, *args, engine_kwargs=None, **kwargs) if cache_key not in NUMBA_FUNC_CACHE: NUMBA_FUNC_CACHE[cache_key] = numba_agg_func - if self.grouper.nkeys > 1: - index = MultiIndex.from_tuples(group_keys, names=self.grouper.names) - else: - index = Index(group_keys, name=self.grouper.names[0]) return result, index # ----------------------------------------------------------------- @@ -2452,6 +2510,9 @@ def pre_processor(vals: ArrayLike) -> tuple[np.ndarray, np.dtype | None]: elif is_timedelta64_dtype(vals.dtype): inference = np.dtype("timedelta64[ns]") out = np.asarray(vals).astype(float) + elif isinstance(vals, ExtensionArray) and is_float_dtype(vals): + inference = np.dtype(np.float64) + out = vals.to_numpy(dtype=float, na_value=np.nan) else: out = np.asarray(vals) @@ -2975,10 +3036,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: if real_2d and values.ndim == 1: assert result.shape[1] == 1, result.shape - # error: Invalid index type "Tuple[slice, int]" for - # "Union[ExtensionArray, ndarray[Any, Any]]"; expected type - # "Union[int, integer[Any], slice, Sequence[int], ndarray[Any, Any]]" - result = result[:, 0] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + result = result[:, 0] # type: ignore[call-overload] if needs_mask: mask = mask[:, 0] @@ -2992,11 +3052,9 @@ def blk_func(values: ArrayLike) -> ArrayLike: if needs_2d and not real_2d: if result.ndim == 2: assert result.shape[1] == 1 - # error: Invalid index type "Tuple[slice, int]" for - # "Union[ExtensionArray, Any, ndarray[Any, Any]]"; expected - # type "Union[int, integer[Any], slice, Sequence[int], - # ndarray[Any, Any]]" - result = result[:, 0] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[slice, int]" + result = result[:, 0] # type: ignore[call-overload] return result.T diff --git a/pandas/core/groupby/grouper.py b/pandas/core/groupby/grouper.py index 260416576d79e..c79dadcadc8cd 100644 --- a/pandas/core/groupby/grouper.py +++ b/pandas/core/groupby/grouper.py @@ -845,9 +845,11 @@ def is_in_obj(gpr) -> bool: return False try: return gpr is obj[gpr.name] - except (KeyError, IndexError): + except (KeyError, IndexError, InvalidIndexError): # IndexError reached in e.g. test_skip_group_keys when we pass # lambda here + # InvalidIndexError raised on key-types inappropriate for index, + # e.g. 
DatetimeIndex.get_loc(tuple()) return False for gpr, level in zip(keys, levels): diff --git a/pandas/core/groupby/ops.py b/pandas/core/groupby/ops.py index f9ba34e916a04..e35f5331195fa 100644 --- a/pandas/core/groupby/ops.py +++ b/pandas/core/groupby/ops.py @@ -138,7 +138,7 @@ def __init__(self, kind: str, how: str): }, } - _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax"} + _MASKED_CYTHON_FUNCTIONS = {"cummin", "cummax", "min", "max"} _cython_arity = {"ohlc": 4} # OHLC @@ -404,6 +404,7 @@ def _masked_ea_wrap_cython_operation( # Copy to ensure input and result masks don't end up shared mask = values._mask.copy() + result_mask = np.zeros(ngroups, dtype=bool) arr = values._data res_values = self._cython_op_ndim_compat( @@ -412,13 +413,18 @@ def _masked_ea_wrap_cython_operation( ngroups=ngroups, comp_ids=comp_ids, mask=mask, + result_mask=result_mask, **kwargs, ) + dtype = self._get_result_dtype(orig_values.dtype) assert isinstance(dtype, BaseMaskedDtype) cls = dtype.construct_array_type() - return cls(res_values.astype(dtype.type, copy=False), mask) + if self.kind != "aggregate": + return cls(res_values.astype(dtype.type, copy=False), mask) + else: + return cls(res_values.astype(dtype.type, copy=False), result_mask) @final def _cython_op_ndim_compat( @@ -428,7 +434,8 @@ def _cython_op_ndim_compat( min_count: int, ngroups: int, comp_ids: np.ndarray, - mask: np.ndarray | None, + mask: np.ndarray | None = None, + result_mask: np.ndarray | None = None, **kwargs, ) -> np.ndarray: if values.ndim == 1: @@ -436,12 +443,15 @@ def _cython_op_ndim_compat( values2d = values[None, :] if mask is not None: mask = mask[None, :] + if result_mask is not None: + result_mask = result_mask[None, :] res = self._call_cython_op( values2d, min_count=min_count, ngroups=ngroups, comp_ids=comp_ids, mask=mask, + result_mask=result_mask, **kwargs, ) if res.shape[0] == 1: @@ -456,6 +466,7 @@ def _cython_op_ndim_compat( ngroups=ngroups, comp_ids=comp_ids, mask=mask, + result_mask=result_mask, **kwargs, ) @@ -468,6 +479,7 @@ def _call_cython_op( ngroups: int, comp_ids: np.ndarray, mask: np.ndarray | None, + result_mask: np.ndarray | None, **kwargs, ) -> np.ndarray: # np.ndarray[ndim=2] orig_values = values @@ -493,6 +505,8 @@ def _call_cython_op( values = values.T if mask is not None: mask = mask.T + if result_mask is not None: + result_mask = result_mask.T out_shape = self._get_output_shape(ngroups, values) func, values = self.get_cython_func_and_vals(values, is_numeric) @@ -508,6 +522,8 @@ def _call_cython_op( values, comp_ids, min_count, + mask=mask, + result_mask=result_mask, is_datetimelike=is_datetimelike, ) else: @@ -932,6 +948,11 @@ def agg_series( # Preempt TypeError in _aggregate_series_fast result = self._aggregate_series_pure_python(obj, func) + elif isinstance(self, BinGrouper): + # Not yet able to remove the BaseGrouper aggregate_series_fast, + # as test_crosstab.test_categorical breaks without it + result = self._aggregate_series_pure_python(obj, func) + else: result = self._aggregate_series_fast(obj, func) @@ -942,9 +963,7 @@ def agg_series( out = npvalues return out - def _aggregate_series_fast(self, obj: Series, func: F) -> np.ndarray: - # -> np.ndarray[object] - + def _aggregate_series_fast(self, obj: Series, func: F) -> npt.NDArray[np.object_]: # At this point we have already checked that # - obj.index is not a MultiIndex # - obj is backed by an ndarray, not ExtensionArray @@ -962,8 +981,9 @@ def _aggregate_series_fast(self, obj: Series, func: F) -> np.ndarray: return result @final - def 
_aggregate_series_pure_python(self, obj: Series, func: F) -> np.ndarray: - # -> np.ndarray[object] + def _aggregate_series_pure_python( + self, obj: Series, func: F + ) -> npt.NDArray[np.object_]: ids, _, ngroups = self.group_info counts = np.zeros(ngroups, dtype=int) @@ -1149,15 +1169,9 @@ def groupings(self) -> list[grouper.Grouping]: def _aggregate_series_fast(self, obj: Series, func: F) -> np.ndarray: # -> np.ndarray[object] - - # At this point we have already checked that - # - obj.index is not a MultiIndex - # - obj is backed by an ndarray, not ExtensionArray - # - ngroups != 0 - # - len(self.bins) > 0 - sbg = libreduction.SeriesBinGrouper(obj, func, self.bins) - result, _ = sbg.get_result() - return result + raise NotImplementedError( + "This should not be reached; use _aggregate_series_pure_python" + ) def _is_indexed_like(obj, axes, axis: int) -> bool: @@ -1191,12 +1205,12 @@ def __init__( assert isinstance(axis, int), axis @cache_readonly - def slabels(self) -> np.ndarray: # np.ndarray[np.intp] + def slabels(self) -> npt.NDArray[np.intp]: # Sorted labels return self.labels.take(self._sort_idx) @cache_readonly - def _sort_idx(self) -> np.ndarray: # np.ndarray[np.intp] + def _sort_idx(self) -> npt.NDArray[np.intp]: # Counting sort indexer return get_group_index_sorter(self.labels, self.ngroups) diff --git a/pandas/core/indexes/accessors.py b/pandas/core/indexes/accessors.py index 516c63e0e3727..b8f4b5f9d3423 100644 --- a/pandas/core/indexes/accessors.py +++ b/pandas/core/indexes/accessors.py @@ -492,6 +492,7 @@ def __new__(cls, data: Series): name=orig.name, copy=False, dtype=orig._values.categories.dtype, + index=orig.index, ) if is_datetime64_dtype(data.dtype): diff --git a/pandas/core/indexes/base.py b/pandas/core/indexes/base.py index e5f12ec53a6d4..c73b3e99600d6 100644 --- a/pandas/core/indexes/base.py +++ b/pandas/core/indexes/base.py @@ -242,6 +242,10 @@ def _new_Index(cls, d): # GH#23752 "labels" kwarg has been replaced with "codes" d["codes"] = d.pop("labels") + elif "dtype" not in d and "data" in d: + # Prevent Index.__new__ from conducting inference; + # "data" key not in RangeIndex + d["dtype"] = d["data"].dtype return cls.__new__(cls, **d) @@ -276,9 +280,10 @@ class Index(IndexOpsMixin, PandasObject): DatetimeIndex : Index of datetime64 data. TimedeltaIndex : Index of timedelta64 data. PeriodIndex : Index of Period data. - Int64Index : A special case of :class:`Index` with purely integer labels. - UInt64Index : A special case of :class:`Index` with purely unsigned integer labels. - Float64Index : A special case of :class:`Index` with purely float labels. + NumericIndex : Index of numpy int/uint/float data. + Int64Index : Index of purely int64 labels (deprecated). + UInt64Index : Index of purely uint64 labels (deprecated). + Float64Index : Index of purely float64 labels (deprecated). Notes ----- @@ -571,15 +576,15 @@ def _dtype_to_subclass(cls, dtype: DtypeObj): return TimedeltaIndex elif is_float_dtype(dtype): - from pandas import Float64Index + from pandas.core.api import Float64Index return Float64Index elif is_unsigned_integer_dtype(dtype): - from pandas import UInt64Index + from pandas.core.api import UInt64Index return UInt64Index elif is_signed_integer_dtype(dtype): - from pandas import Int64Index + from pandas.core.api import Int64Index return Int64Index @@ -638,11 +643,6 @@ def _simple_new(cls: type[_IndexT], values, name: Hashable = None) -> _IndexT: result = object.__new__(cls) result._data = values - # _index_data is a (temporary?) 
fix to ensure that the direct data - # manipulation we do in `_libs/reduction.pyx` continues to work. - # We need access to the actual ndarray, since we're messing with - # data buffers and strides. - result._index_data = values result._name = name result._cache = {} result._reset_identity() @@ -3618,7 +3618,11 @@ def _get_indexer( elif method == "nearest": indexer = self._get_nearest_indexer(target, limit, tolerance) else: - indexer = self._engine.get_indexer(target._get_engine_target()) + tgt_values = target._get_engine_target() + if target._is_multi and self._is_multi: + tgt_values = self._engine._extract_level_codes(target) + + indexer = self._engine.get_indexer(tgt_values) return ensure_platform_int(indexer) @@ -3726,7 +3730,7 @@ def _get_fill_indexer_searchsorted( "if index and target are monotonic" ) - side = "left" if method == "pad" else "right" + side: Literal["left", "right"] = "left" if method == "pad" else "right" # find exact matches first (this simplifies the algorithm) indexer = self.get_indexer(target) @@ -4091,8 +4095,6 @@ def join( join_index, (left_indexer, right_indexer) """ other = ensure_index(other) - self_is_mi = isinstance(self, ABCMultiIndex) - other_is_mi = isinstance(other, ABCMultiIndex) if isinstance(self, ABCDatetimeIndex) and isinstance(other, ABCDatetimeIndex): if (self.tz is None) ^ (other.tz is None): @@ -4112,7 +4114,7 @@ def join( # try to figure out the join level # GH3662 - if level is None and (self_is_mi or other_is_mi): + if level is None and (self._is_multi or other._is_multi): # have the same levels/names so a simple join if self.names == other.names: @@ -4121,7 +4123,7 @@ def join( return self._join_multi(other, how=how) # join on the level - if level is not None and (self_is_mi or other_is_mi): + if level is not None and (self._is_multi or other._is_multi): return self._join_level(other, level, how=how) if len(other) == 0 and how in ("left", "outer"): @@ -4170,8 +4172,20 @@ def join( try: return self._join_monotonic(other, how=how) except TypeError: + # object dtype; non-comparable objects pass + return self._join_via_get_indexer(other, how, sort) + + @final + def _join_via_get_indexer( + self, other: Index, how: str_t, sort: bool + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + # Fallback if we do not have any fastpaths available based on + # uniqueness/monotonicity + + # Note: at this point we have checked matching dtypes + if how == "left": join_index = self elif how == "right": @@ -4281,22 +4295,25 @@ def _join_non_unique( # We only get here if dtypes match assert self.dtype == other.dtype - lvalues = self._get_join_target() - rvalues = other._get_join_target() - left_idx, right_idx = get_join_indexers( - [lvalues], [rvalues], how=how, sort=True + [self._values], [other._values], how=how, sort=True ) + mask = left_idx == -1 - left_idx = ensure_platform_int(left_idx) - right_idx = ensure_platform_int(right_idx) + # error: Argument 1 to "take" of "ExtensionArray" has incompatible + # type "ndarray[Any, dtype[signedinteger[Any]]]"; expected "Sequence[int]" + join_array = self._values.take(left_idx) # type: ignore[arg-type] + # error: Argument 1 to "take" of "ExtensionArray" has incompatible type + # "ndarray[Any, dtype[signedinteger[Any]]]"; expected "Sequence[int]" + right = other._values.take(right_idx) # type: ignore[arg-type] - join_array = np.asarray(lvalues.take(left_idx)) - mask = left_idx == -1 - np.putmask(join_array, mask, rvalues.take(right_idx)) + if isinstance(join_array, np.ndarray): + 
np.putmask(join_array, mask, right) + else: + # error: "ExtensionArray" has no attribute "putmask" + join_array.putmask(mask, right) # type: ignore[attr-defined] - join_arraylike = self._from_join_target(join_array) - join_index = self._wrap_joined_index(join_arraylike, other) + join_index = self._wrap_joined_index(join_array, other) return join_index, left_idx, right_idx @@ -4449,7 +4466,9 @@ def _get_leaf_sorter(labels: list[np.ndarray]) -> npt.NDArray[np.intp]: return join_index, left_indexer, right_indexer @final - def _join_monotonic(self, other: Index, how: str_t = "left"): + def _join_monotonic( + self, other: Index, how: str_t = "left" + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: # We only get here with matching dtypes assert other.dtype == self.dtype @@ -4784,11 +4803,7 @@ def __getitem__(self, key): result = getitem(key) if not is_scalar(result): - # error: Argument 1 to "ndim" has incompatible type "Union[ExtensionArray, - # Any]"; expected "Union[Union[int, float, complex, str, bytes, generic], - # Sequence[Union[int, float, complex, str, bytes, generic]], - # Sequence[Sequence[Any]], _SupportsArray]" - if np.ndim(result) > 1: # type: ignore[arg-type] + if np.ndim(result) > 1: deprecate_ndim_indexing(result) return result # NB: Using _constructor._simple_new would break if MultiIndex @@ -4857,6 +4872,7 @@ def _concat(self, to_concat: list[Index], name: Hashable) -> Index: result = concat_compat(to_concat_vals) return Index._with_infer(result, name=name) + @final def putmask(self, mask, value) -> Index: """ Return a new Index of the values set with the mask. @@ -4879,19 +4895,24 @@ def putmask(self, mask, value) -> Index: try: converted = self._validate_fill_value(value) except (ValueError, TypeError) as err: - if is_object_dtype(self): + if is_object_dtype(self): # pragma: no cover raise err dtype = self._find_common_type_compat(value) return self.astype(dtype).putmask(mask, value) values = self._values.copy() - # error: Argument 1 to "setitem_datetimelike_compat" has incompatible type - # "Union[ExtensionArray, ndarray]"; expected "ndarray" - converted = setitem_datetimelike_compat( - values, mask.sum(), converted # type: ignore[arg-type] - ) - np.putmask(values, mask, converted) + + if isinstance(values, np.ndarray): + converted = setitem_datetimelike_compat(values, mask.sum(), converted) + np.putmask(values, mask, converted) + + else: + # Note: we use the original value here, not converted, as + # _validate_fill_value is not idempotent + # error: "ExtensionArray" has no attribute "putmask" + values.putmask(mask, value) # type: ignore[attr-defined] + return self._shallow_copy(values) def equals(self, other: Any) -> bool: @@ -5097,13 +5118,17 @@ def asof_locs(self, where: Index, mask: np.ndarray) -> npt.NDArray[np.intp]: which correspond to the return values of the `asof` function for every element in `where`. 
""" - locs = self._values[mask].searchsorted(where._values, side="right") + # error: No overload variant of "searchsorted" of "ndarray" matches argument + # types "Union[ExtensionArray, ndarray[Any, Any]]", "str" + # TODO: will be fixed when ExtensionArray.searchsorted() is fixed + locs = self._values[mask].searchsorted( + where._values, side="right" # type: ignore[call-overload] + ) locs = np.where(locs > 0, locs - 1, 0) result = np.arange(len(self), dtype=np.intp)[mask].take(locs) - # TODO: overload return type of ExtensionArray.__getitem__ - first_value = cast(Any, self._values[mask.argmax()]) + first_value = self._values[mask.argmax()] result[(locs == 0) & (where._values < first_value)] = -1 return result @@ -5438,6 +5463,8 @@ def get_indexer_non_unique( # Note: _maybe_promote ensures we never get here with MultiIndex # self and non-Multi target tgt_values = target._get_engine_target() + if self._is_multi and target._is_multi: + tgt_values = self._engine._extract_level_codes(target) indexer, missing = self._engine.get_indexer_non_unique(tgt_values) return ensure_platform_int(indexer), ensure_platform_int(missing) @@ -6036,7 +6063,7 @@ def _maybe_cast_slice_bound(self, label, side: str_t, kind=no_default): return label - def _searchsorted_monotonic(self, label, side: str_t = "left"): + def _searchsorted_monotonic(self, label, side: Literal["left", "right"] = "left"): if self.is_monotonic_increasing: return self.searchsorted(label, side=side) elif self.is_monotonic_decreasing: @@ -6050,7 +6077,9 @@ def _searchsorted_monotonic(self, label, side: str_t = "left"): raise ValueError("index must be monotonic increasing or decreasing") - def get_slice_bound(self, label, side: str_t, kind=no_default) -> int: + def get_slice_bound( + self, label, side: Literal["left", "right"], kind=no_default + ) -> int: """ Calculate slice bound that corresponds to given label. diff --git a/pandas/core/indexes/datetimelike.py b/pandas/core/indexes/datetimelike.py index 5d778af954eef..d2f598261a776 100644 --- a/pandas/core/indexes/datetimelike.py +++ b/pandas/core/indexes/datetimelike.py @@ -674,8 +674,7 @@ def _fast_union(self: _T, other: _T, sort=None) -> _T: left, right = self, other left_start = left[0] loc = right.searchsorted(left_start, side="left") - # error: Slice index must be an integer or None - right_chunk = right._values[:loc] # type: ignore[misc] + right_chunk = right._values[:loc] dates = concat_compat((left._values, right_chunk)) # With sort being False, we can't infer that result.freq == self.freq # TODO: no tests rely on the _with_freq("infer"); needed? 
@@ -691,8 +690,7 @@ def _fast_union(self: _T, other: _T, sort=None) -> _T: # concatenate if left_end < right_end: loc = right.searchsorted(left_end, side="right") - # error: Slice index must be an integer or None - right_chunk = right._values[loc:] # type: ignore[misc] + right_chunk = right._values[loc:] dates = concat_compat([left._values, right_chunk]) # The can_fast_union check ensures that the result.freq # should match self.freq diff --git a/pandas/core/indexes/datetimes.py b/pandas/core/indexes/datetimes.py index 8a5811da4dd5a..fbbe6606ba522 100644 --- a/pandas/core/indexes/datetimes.py +++ b/pandas/core/indexes/datetimes.py @@ -11,6 +11,7 @@ from typing import ( TYPE_CHECKING, Hashable, + Literal, ) import warnings @@ -765,7 +766,9 @@ def check_str_or_none(point): return indexer @doc(Index.get_slice_bound) - def get_slice_bound(self, label, side: str, kind=lib.no_default) -> int: + def get_slice_bound( + self, label, side: Literal["left", "right"], kind=lib.no_default + ) -> int: # GH#42855 handle date here instead of _maybe_cast_slice_bound if isinstance(label, date) and not isinstance(label, datetime): label = Timestamp(label).to_pydatetime() diff --git a/pandas/core/indexes/extension.py b/pandas/core/indexes/extension.py index b835b79b1e3e2..b446dfe045e62 100644 --- a/pandas/core/indexes/extension.py +++ b/pandas/core/indexes/extension.py @@ -4,8 +4,11 @@ from __future__ import annotations from typing import ( + TYPE_CHECKING, Hashable, + Literal, TypeVar, + overload, ) import numpy as np @@ -31,7 +34,6 @@ ABCSeries, ) -from pandas.core.array_algos.putmask import validate_putmask from pandas.core.arrays import ( Categorical, DatetimeArray, @@ -40,10 +42,18 @@ TimedeltaArray, ) from pandas.core.arrays._mixins import NDArrayBackedExtensionArray +from pandas.core.arrays.base import ExtensionArray from pandas.core.indexers import deprecate_ndim_indexing from pandas.core.indexes.base import Index from pandas.core.ops import get_op_result_name +if TYPE_CHECKING: + + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + _T = TypeVar("_T", bound="NDArrayBackedExtensionIndex") @@ -319,25 +329,40 @@ def __getitem__(self, key): deprecate_ndim_indexing(result) return result - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + # This overload is needed so that the call to searchsorted in + # pandas.core.resample.TimeGrouper._get_period_bins picks the correct result + + @overload + # The following ignore is also present in numpy/__init__.pyi + # Possibly a mypy bug?? + # error: Overloaded function signatures 1 and 2 overlap with incompatible + # return types [misc] + def searchsorted( # type: ignore[misc] + self, + value: npt._ScalarLike_co, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> np.intp: + ... + + @overload + def searchsorted( + self, + value: npt.ArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp]: + ... 
+ + def searchsorted( + self, + value: NumpyValueArrayLike | ExtensionArray, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: # overriding IndexOpsMixin improves performance GH#38083 return self._data.searchsorted(value, side=side, sorter=sorter) - def putmask(self, mask, value) -> Index: - mask, noop = validate_putmask(self._data, mask) - if noop: - return self.copy() - - try: - self._validate_fill_value(value) - except (ValueError, TypeError): - dtype = self._find_common_type_compat(value) - return self.astype(dtype).putmask(mask, value) - - arr = self._data.copy() - arr.putmask(mask, value) - return type(self)._simple_new(arr, name=self.name) - # --------------------------------------------------------------------- def _get_engine_target(self) -> np.ndarray: @@ -464,19 +489,6 @@ class NDArrayBackedExtensionIndex(ExtensionIndex): _data: NDArrayBackedExtensionArray - @classmethod - def _simple_new( - cls, - values: NDArrayBackedExtensionArray, - name: Hashable = None, - ): - result = super()._simple_new(values, name) - - # For groupby perf. See note in indexes/base about _index_data - result._index_data = values._ndarray - - return result - def _get_engine_target(self) -> np.ndarray: return self._data._ndarray diff --git a/pandas/core/indexes/multi.py b/pandas/core/indexes/multi.py index 60ae71e8f888f..0c158d47cfa3b 100644 --- a/pandas/core/indexes/multi.py +++ b/pandas/core/indexes/multi.py @@ -3683,7 +3683,12 @@ def astype(self, dtype, copy: bool = True): return self def _validate_fill_value(self, item): - if not isinstance(item, tuple): + if isinstance(item, MultiIndex): + # GH#43212 + if item.nlevels != self.nlevels: + raise ValueError("Item must have length equal to number of levels.") + return item._values + elif not isinstance(item, tuple): # Pad the key with empty strings if lower levels of the key # aren't specified: item = (item,) + ("",) * (self.nlevels - 1) diff --git a/pandas/core/indexes/numeric.py b/pandas/core/indexes/numeric.py index 2d8948a64dde7..8e8ed294304c5 100644 --- a/pandas/core/indexes/numeric.py +++ b/pandas/core/indexes/numeric.py @@ -15,6 +15,7 @@ from pandas._typing import ( Dtype, DtypeObj, + npt, ) from pandas.util._decorators import ( cache_readonly, @@ -42,20 +43,19 @@ maybe_extract_name, ) -_num_index_shared_docs = {} - -_num_index_shared_docs[ - "class_descr" -] = """ +class NumericIndex(Index): + """ Immutable sequence used for indexing and alignment. The basic object - storing axis labels for all pandas objects. %(klass)s is a special case - of `Index` with purely %(ltype)s labels. %(extra)s. + storing axis labels for all pandas objects. NumericIndex is a special case + of `Index` with purely numpy int/uint/float labels. + + .. versionadded:: 1.4.0 Parameters ---------- data : array-like (1-dimensional) - dtype : NumPy dtype (default: %(dtype)s) + dtype : NumPy dtype (default: None) copy : bool Make a copy of input ndarray. name : object @@ -66,27 +66,29 @@ None Methods - ------- + ---------- None See Also -------- Index : The base pandas Index type. + Int64Index : Index of purely int64 labels (deprecated). + UInt64Index : Index of purely uint64 labels (deprecated). + Float64Index : Index of purely float64 labels (deprecated). Notes ----- - An Index instance can **only** contain hashable objects. -""" - + An NumericIndex instance can **only** contain numpy int64/32/16/8, uint64/32/16/8 or + float64/32/16 dtype. 
In particular, ``NumericIndex`` *can not* hold Pandas numeric + dtypes (:class:`Int64Dtype`, :class:`Int32Dtype` etc.). -class NumericIndex(Index): - _index_descr_args = { - "klass": "NumericIndex", - "ltype": "integer or float", - "dtype": "inferred", - "extra": "", - } - __doc__ = _num_index_shared_docs["class_descr"] % _index_descr_args + Examples + -------- + >>> pd.NumericIndex([1, 2, 3], dtype="int8") + NumericIndex([1, 2, 3], dtype='int8') + >>> pd.NumericIndex([1, 2, 3], dtype="float32") + NumericIndex([1.0, 2.0, 3.0], dtype='float32') + """ _typ = "numericindex" _values: np.ndarray @@ -360,6 +362,48 @@ def _format_native_types( ) +_num_index_shared_docs = {} + + +_num_index_shared_docs[ + "class_descr" +] = """ + Immutable sequence used for indexing and alignment. The basic object + storing axis labels for all pandas objects. %(klass)s is a special case + of `Index` with purely %(ltype)s labels. %(extra)s. + + .. deprecated:: 1.4.0 + In pandas v2.0 %(klass)s will be removed and :class:`NumericIndex` used instead. + %(klass)s will remain fully functional for the duration of pandas 1.x. + + Parameters + ---------- + data : array-like (1-dimensional) + dtype : NumPy dtype (default: %(dtype)s) + copy : bool + Make a copy of input ndarray. + name : object + Name to be stored in the index. + + Attributes + ---------- + None + + Methods + ---------- + None + + See Also + -------- + Index : The base pandas Index type. + NumericIndex : Index of numpy int/uint/float data. + + Notes + ----- + An Index instance can **only** contain hashable objects. +""" + + class IntegerIndex(NumericIndex): """ This is an abstract class for Int64Index, UInt64Index. @@ -368,7 +412,7 @@ class IntegerIndex(NumericIndex): _is_backward_compat_public_numeric_index: bool = False @property - def asi8(self) -> np.ndarray: + def asi8(self) -> npt.NDArray[np.int64]: # do not cache or you'll create a memory leak warnings.warn( "Index.asi8 is deprecated and will be removed in a future version.", diff --git a/pandas/core/internals/array_manager.py b/pandas/core/internals/array_manager.py index 475bfe958ea06..f645cc81e8171 100644 --- a/pandas/core/internals/array_manager.py +++ b/pandas/core/internals/array_manager.py @@ -315,10 +315,9 @@ def apply_with_block(self: T, f, align_keys=None, swap_axis=True, **kwargs) -> T if self.ndim == 2 and arr.ndim == 2: # 2D for np.ndarray or DatetimeArray/TimedeltaArray assert len(arr) == 1 - # error: Invalid index type "Tuple[int, slice]" for - # "Union[ndarray, ExtensionArray]"; expected type - # "Union[int, slice, ndarray]" - arr = arr[0, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[int, slice]" + arr = arr[0, :] # type: ignore[call-overload] result_arrays.append(arr) return type(self)(result_arrays, self._axes) @@ -841,10 +840,9 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): assert value.shape[0] == len(self._axes[0]) for value_idx, mgr_idx in enumerate(indices): - # error: Invalid index type "Tuple[slice, int]" for - # "Union[ExtensionArray, ndarray]"; expected type - # "Union[int, slice, ndarray]" - value_arr = value[:, value_idx] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[slice, int]" + value_arr = value[:, value_idx] # type: ignore[call-overload] self.arrays[mgr_idx] = value_arr return @@ -864,10 +862,9 @@ def insert(self, loc: int, item: Hashable, value: ArrayLike) -> None: value = 
extract_array(value, extract_numpy=True) if value.ndim == 2: if value.shape[0] == 1: - # error: Invalid index type "Tuple[int, slice]" for - # "Union[Any, ExtensionArray, ndarray]"; expected type - # "Union[int, slice, ndarray]" - value = value[0, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" + # matches argument type "Tuple[int, slice]" + value = value[0, :] # type: ignore[call-overload] else: raise ValueError( f"Expected a 1D array, got an array with shape {value.shape}" diff --git a/pandas/core/internals/blocks.py b/pandas/core/internals/blocks.py index e2680712483a4..e3fcff1557ca9 100644 --- a/pandas/core/internals/blocks.py +++ b/pandas/core/internals/blocks.py @@ -624,7 +624,11 @@ def should_store(self, value: ArrayLike) -> bool: ------- bool """ - return is_dtype_equal(value.dtype, self.dtype) + # faster equivalent to is_dtype_equal(value.dtype, self.dtype) + try: + return value.dtype == self.dtype + except TypeError: + return False @final def to_native_types(self, na_rep="nan", quoting=None, **kwargs): @@ -1080,6 +1084,23 @@ def interpolate( # If there are no NAs, then interpolate is a no-op return [self] if inplace else [self.copy()] + if self.is_object and self.ndim == 2 and self.shape[0] != 1 and axis == 0: + # split improves performance in ndarray.copy() + return self.split_and_operate( + type(self).interpolate, + method, + axis, + index, + inplace, + limit, + limit_direction, + limit_area, + fill_value, + coerce, + downcast, + **kwargs, + ) + try: m = missing.clean_fill_method(method) except ValueError: @@ -1281,6 +1302,10 @@ def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): mask = mask.any(0) # TODO: in all tests we have mask.all(); can we rely on that? + # Note: these next two lines ensure that + # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) + # which the calling function needs in order to pass verify_integrity=False + # to the BlockManager constructor new_values = new_values.T[mask] new_placement = new_placement[mask] @@ -1344,7 +1369,7 @@ def get_values(self, dtype: DtypeObj | None = None) -> np.ndarray: """ return object dtype as boxed values, such as Timestamps/Timedelta """ - values = self.values + values: ArrayLike = self.values if dtype == _dtype_obj: values = values.astype(object) # TODO(EA2D): reshape not needed with 2D EAs @@ -1645,20 +1670,25 @@ def _unstack(self, unstacker, fill_value, new_placement, allow_fill: bool): # converting to a 2-D ndarray of objects. # Instead, we unstack an ndarray of integer positions, followed by # a `take` on the actual values. - n_rows = self.shape[-1] - dummy_arr = np.arange(n_rows) - new_values, mask = unstacker.get_new_values(dummy_arr, fill_value=-1) - mask = mask.any(0) - # TODO: in all tests we have mask.all(); can we rely on that? + # Caller is responsible for ensuring self.shape[-1] == len(unstacker.index) + new_values, mask = unstacker.arange_result + + # Note: these next two lines ensure that + # mask.sum() == sum(len(nb.mgr_locs) for nb in blocks) + # which the calling function needs in order to pass verify_integrity=False + # to the BlockManager constructor + new_values = new_values.T[mask] + new_placement = new_placement[mask] blocks = [ # TODO: could cast to object depending on fill_value? 
- self.make_block_same_class( + type(self)( self.values.take(indices, allow_fill=allow_fill, fill_value=fill_value), BlockPlacement(place), + ndim=2, ) - for indices, place in zip(new_values.T, new_placement) + for indices, place in zip(new_values, new_placement) ] return blocks, mask diff --git a/pandas/core/internals/concat.py b/pandas/core/internals/concat.py index 0aed9e697ca66..d0e017b06ffbc 100644 --- a/pandas/core/internals/concat.py +++ b/pandas/core/internals/concat.py @@ -32,6 +32,7 @@ is_1d_only_ea_obj, is_datetime64tz_dtype, is_dtype_equal, + is_scalar, needs_i8_conversion, ) from pandas.core.dtypes.concat import ( @@ -41,6 +42,7 @@ from pandas.core.dtypes.dtypes import ExtensionDtype from pandas.core.dtypes.missing import ( is_valid_na_for_dtype, + isna, isna_all, ) @@ -63,6 +65,7 @@ if TYPE_CHECKING: from pandas import Index + from pandas.core.internals.blocks import Block def _concatenate_array_managers( @@ -196,6 +199,8 @@ def concatenate_managers( if isinstance(mgrs_indexers[0][0], ArrayManager): return _concatenate_array_managers(mgrs_indexers, axes, concat_axis, copy) + mgrs_indexers = _maybe_reindex_columns_na_proxy(axes, mgrs_indexers) + concat_plans = [ _get_mgr_concatenation_plan(mgr, indexers) for mgr, indexers in mgrs_indexers ] @@ -243,6 +248,38 @@ def concatenate_managers( return BlockManager(tuple(blocks), axes) +def _maybe_reindex_columns_na_proxy( + axes: list[Index], mgrs_indexers: list[tuple[BlockManager, dict[int, np.ndarray]]] +) -> list[tuple[BlockManager, dict[int, np.ndarray]]]: + """ + Reindex along columns so that all of the BlockManagers being concatenated + have matching columns. + + Columns added in this reindexing have dtype=np.void, indicating they + should be ignored when choosing a column's final dtype. + """ + new_mgrs_indexers = [] + for mgr, indexers in mgrs_indexers: + # We only reindex for axis=0 (i.e. columns), as this can be done cheaply + if 0 in indexers: + new_mgr = mgr.reindex_indexer( + axes[0], + indexers[0], + axis=0, + copy=False, + only_slice=True, + allow_dups=True, + use_na_proxy=True, + ) + new_indexers = indexers.copy() + del new_indexers[0] + new_mgrs_indexers.append((new_mgr, new_indexers)) + else: + new_mgrs_indexers.append((mgr, indexers)) + + return new_mgrs_indexers + + def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarray]): """ Construct concatenation plan for given block manager and indexers. 
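A small, standalone illustration of the `np.void` proxy idea introduced by `_maybe_reindex_columns_na_proxy` above: padding columns carry no information, so they are materialised as void arrays, and any dtype whose `kind` is `"V"` is simply skipped when the final column dtype is chosen. The surrounding names below are made up for the example, not pandas internals.

```python
import numpy as np

# Columns added only to pad a manager to the common column set get a
# zero-information void array ...
proxy = np.empty((3,), dtype=np.void)
print(proxy.dtype.kind)  # 'V'

# ... and dtype resolution ignores anything with kind "V".
candidate_dtypes = [np.dtype("int64"), proxy.dtype, np.dtype("float64")]
real_dtypes = [dt for dt in candidate_dtypes if dt.kind != "V"]
print(np.result_type(*real_dtypes))  # float64
```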
@@ -264,26 +301,20 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra mgr_shape_list[ax] = len(indexer) mgr_shape = tuple(mgr_shape_list) - has_column_indexer = False - - if 0 in indexers: - has_column_indexer = True - ax0_indexer = indexers.pop(0) - blknos = algos.take_nd(mgr.blknos, ax0_indexer, fill_value=-1) - blklocs = algos.take_nd(mgr.blklocs, ax0_indexer, fill_value=-1) - else: + assert 0 not in indexers - if mgr.is_single_block: - blk = mgr.blocks[0] - return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] + if mgr.is_single_block: + blk = mgr.blocks[0] + return [(blk.mgr_locs, JoinUnit(blk, mgr_shape, indexers))] - blknos = mgr.blknos - blklocs = mgr.blklocs + blknos = mgr.blknos + blklocs = mgr.blklocs plan = [] for blkno, placements in libinternals.get_blkno_placements(blknos, group=False): assert placements.is_slice_like + assert blkno != -1 join_unit_indexers = indexers.copy() @@ -291,41 +322,33 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra shape_list[0] = len(placements) shape = tuple(shape_list) - if blkno == -1: - # only reachable in the `0 in indexers` case - unit = JoinUnit(None, shape) - else: - blk = mgr.blocks[blkno] - ax0_blk_indexer = blklocs[placements.indexer] - - unit_no_ax0_reindexing = ( - len(placements) == len(blk.mgr_locs) - and - # Fastpath detection of join unit not - # needing to reindex its block: no ax0 - # reindexing took place and block - # placement was sequential before. - ( - ( - not has_column_indexer - and blk.mgr_locs.is_slice_like - and blk.mgr_locs.as_slice.step == 1 - ) - or - # Slow-ish detection: all indexer locs - # are sequential (and length match is - # checked above). - (np.diff(ax0_blk_indexer) == 1).all() - ) + blk = mgr.blocks[blkno] + ax0_blk_indexer = blklocs[placements.indexer] + + unit_no_ax0_reindexing = ( + len(placements) == len(blk.mgr_locs) + and + # Fastpath detection of join unit not + # needing to reindex its block: no ax0 + # reindexing took place and block + # placement was sequential before. + ( + (blk.mgr_locs.is_slice_like and blk.mgr_locs.as_slice.step == 1) + or + # Slow-ish detection: all indexer locs + # are sequential (and length match is + # checked above). + (np.diff(ax0_blk_indexer) == 1).all() ) + ) - # Omit indexer if no item reindexing is required. - if unit_no_ax0_reindexing: - join_unit_indexers.pop(0, None) - else: - join_unit_indexers[0] = ax0_blk_indexer + # Omit indexer if no item reindexing is required. + if unit_no_ax0_reindexing: + join_unit_indexers.pop(0, None) + else: + join_unit_indexers[0] = ax0_blk_indexer - unit = JoinUnit(blk, shape, join_unit_indexers) + unit = JoinUnit(blk, shape, join_unit_indexers) plan.append((placements, unit)) @@ -333,7 +356,7 @@ def _get_mgr_concatenation_plan(mgr: BlockManager, indexers: dict[int, np.ndarra class JoinUnit: - def __init__(self, block, shape: Shape, indexers=None): + def __init__(self, block: Block, shape: Shape, indexers=None): # Passing shape explicitly is required for cases when block is None. 
# Note: block is None implies indexers is None, but not vice-versa if indexers is None: @@ -357,7 +380,7 @@ def needs_filling(self) -> bool: @cache_readonly def dtype(self): blk = self.block - if blk is None: + if blk.values.dtype.kind == "V": raise AssertionError("Block is None, no dtype") if not self.needs_filling: @@ -371,7 +394,7 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool: """ if not self.is_na: return False - if self.block is None: + if self.block.dtype.kind == "V": return True if self.dtype == object: @@ -396,25 +419,37 @@ def _is_valid_na_for(self, dtype: DtypeObj) -> bool: @cache_readonly def is_na(self) -> bool: - if self.block is None: + blk = self.block + if blk.dtype.kind == "V": return True - if not self.block._can_hold_na: + if not blk._can_hold_na: return False - values = self.block.values - if isinstance(self.block.values.dtype, SparseDtype): + values = blk.values + if values.size == 0: + return True + if isinstance(values.dtype, SparseDtype): return False - elif self.block.is_extension: + + if values.ndim == 1: # TODO(EA2D): no need for special case with 2D EAs - values_flat = values + val = values[0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return isna_all(values) else: - values_flat = values.ravel(order="K") - - return isna_all(values_flat) + val = values[0][0] + if not is_scalar(val) or not isna(val): + # ideally isna_all would do this short-circuiting + return False + return all(isna_all(row) for row in values) def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: - if upcasted_na is None: + values: ArrayLike + + if upcasted_na is None and self.block.dtype.kind != "V": # No upcasting is necessary fill_value = self.block.fill_value values = self.block.get_values() @@ -422,8 +457,8 @@ def get_reindexed_values(self, empty_dtype: DtypeObj, upcasted_na) -> ArrayLike: fill_value = upcasted_na if self._is_valid_na_for(empty_dtype): - # note: always holds when self.block is None - blk_dtype = getattr(self.block, "dtype", None) + # note: always holds when self.block.dtype.kind == "V" + blk_dtype = self.block.dtype if blk_dtype == np.dtype("object"): # we want to avoid filling with np.nan if we are @@ -500,7 +535,7 @@ def _concatenate_join_units( empty_dtype = _get_empty_dtype(join_units) - has_none_blocks = any(unit.block is None for unit in join_units) + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) upcasted_na = _dtype_to_na_value(empty_dtype, has_none_blocks) to_concat = [ @@ -526,10 +561,10 @@ def _concatenate_join_units( # concatting with at least one EA means we are concatting a single column # the non-EA values are 2D arrays with shape (1, n) - # error: Invalid index type "Tuple[int, slice]" for - # "Union[ExtensionArray, ndarray]"; expected type "Union[int, slice, ndarray]" + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" to_concat = [ - t if is_1d_only_ea_obj(t) else t[0, :] # type: ignore[index] + t if is_1d_only_ea_obj(t) else t[0, :] # type: ignore[call-overload] for t in to_concat ] concat_values = concat_compat(to_concat, axis=0, ea_compat_axis=True) @@ -576,21 +611,18 @@ def _get_empty_dtype(join_units: Sequence[JoinUnit]) -> DtypeObj: """ if len(join_units) == 1: blk = join_units[0].block - if blk is None: - return np.dtype(np.float64) + return blk.dtype if _is_uniform_reindex(join_units): # FIXME: integrate property empty_dtype = join_units[0].block.dtype return 
empty_dtype - has_none_blocks = any(unit.block is None for unit in join_units) + has_none_blocks = any(unit.block.dtype.kind == "V" for unit in join_units) - dtypes = [ - unit.dtype for unit in join_units if unit.block is not None and not unit.is_na - ] + dtypes = [unit.dtype for unit in join_units if not unit.is_na] if not len(dtypes): - dtypes = [unit.dtype for unit in join_units if unit.block is not None] + dtypes = [unit.dtype for unit in join_units if unit.block.dtype.kind != "V"] dtype = find_common_type(dtypes) if has_none_blocks: @@ -606,7 +638,7 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: """ first = join_units[0].block - if first is None: + if first.dtype.kind == "V": return False return ( # exclude cases where a) ju.block is None or b) we have e.g. Int64+int64 @@ -636,7 +668,7 @@ def _is_uniform_join_units(join_units: list[JoinUnit]) -> bool: def _is_uniform_reindex(join_units) -> bool: return ( # TODO: should this be ju.block._can_hold_na? - all(ju.block and ju.block.is_extension for ju in join_units) + all(ju.block.is_extension for ju in join_units) and len({ju.block.dtype.name for ju in join_units}) == 1 ) diff --git a/pandas/core/internals/construction.py b/pandas/core/internals/construction.py index 86834a8dccf40..63d84ab39fa96 100644 --- a/pandas/core/internals/construction.py +++ b/pandas/core/internals/construction.py @@ -386,7 +386,9 @@ def ndarray_to_mgr( if len(columns) == 0: block_values = [] - return create_block_manager_from_blocks(block_values, [columns, index]) + return create_block_manager_from_blocks( + block_values, [columns, index], verify_integrity=False + ) def _check_values_indices_shape_match( diff --git a/pandas/core/internals/managers.py b/pandas/core/internals/managers.py index 7213c03aa3a9c..a9894ab5acf23 100644 --- a/pandas/core/internals/managers.py +++ b/pandas/core/internals/managers.py @@ -66,6 +66,7 @@ from pandas.core.internals.blocks import ( Block, DatetimeTZBlock, + NumpyBlock, ensure_block_shape, extend_blocks, get_block_type, @@ -217,27 +218,6 @@ def is_single_block(self) -> bool: # Assumes we are 2D; overridden by SingleBlockManager return len(self.blocks) == 1 - def _rebuild_blknos_and_blklocs(self) -> None: - """ - Update mgr._blknos / mgr._blklocs. - """ - new_blknos = np.empty(self.shape[0], dtype=np.intp) - new_blklocs = np.empty(self.shape[0], dtype=np.intp) - new_blknos.fill(-1) - new_blklocs.fill(-1) - - for blkno, blk in enumerate(self.blocks): - rl = blk.mgr_locs - new_blknos[rl.indexer] = blkno - new_blklocs[rl.indexer] = np.arange(len(rl)) - - if (new_blknos == -1).any(): - # TODO: can we avoid this? 
it isn't cheap - raise AssertionError("Gaps in blk ref_locs") - - self._blknos = new_blknos - self._blklocs = new_blklocs - @property def items(self) -> Index: return self.axes[0] @@ -330,7 +310,8 @@ def apply( if ignore_failures: return self._combine(result_blocks) - return type(self).from_blocks(result_blocks, self.axes) + out = type(self).from_blocks(result_blocks, self.axes) + return out def where(self: T, other, cond, align: bool, errors: str) -> T: if align: @@ -526,7 +507,13 @@ def get_numeric_data(self: T, copy: bool = False) -> T: copy : bool, default False Whether to copy the blocks """ - return self._combine([b for b in self.blocks if b.is_numeric], copy) + numeric_blocks = [blk for blk in self.blocks if blk.is_numeric] + if len(numeric_blocks) == len(self.blocks): + # Avoid somewhat expensive _combine + if copy: + return self.copy(deep=True) + return self + return self._combine(numeric_blocks, copy) def _combine( self: T, blocks: list[Block], copy: bool = True, index: Index | None = None @@ -589,8 +576,14 @@ def copy_func(ax): new_axes = list(self.axes) res = self.apply("copy", deep=deep) + res.axes = new_axes + if self.ndim > 1: + # Avoid needing to re-compute these + res._blknos = self.blknos.copy() + res._blklocs = self.blklocs.copy() + if deep: res._consolidate_inplace() return res @@ -621,6 +614,8 @@ def reindex_indexer( copy: bool = True, consolidate: bool = True, only_slice: bool = False, + *, + use_na_proxy: bool = False, ) -> T: """ Parameters @@ -635,6 +630,8 @@ def reindex_indexer( Whether to consolidate inplace before reindexing. only_slice : bool, default False Whether to take views, not copies, along columns. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. pandas-indexer with -1's only. """ @@ -659,7 +656,10 @@ def reindex_indexer( if axis == 0: new_blocks = self._slice_take_blocks_ax0( - indexer, fill_value=fill_value, only_slice=only_slice + indexer, + fill_value=fill_value, + only_slice=only_slice, + use_na_proxy=use_na_proxy, ) else: new_blocks = [ @@ -683,6 +683,8 @@ def _slice_take_blocks_ax0( slice_or_indexer: slice | np.ndarray, fill_value=lib.no_default, only_slice: bool = False, + *, + use_na_proxy: bool = False, ) -> list[Block]: """ Slice/take blocks along axis=0. @@ -696,6 +698,8 @@ def _slice_take_blocks_ax0( only_slice : bool, default False If True, we always return views on existing arrays, never copies. This is used when called from ops.blockwise.operate_blockwise. + use_na_proxy : bool, default False + Whether to use a np.void ndarray for newly introduced columns. 
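The `BlockManager.copy` change above carries the already-computed `_blknos`/`_blklocs` over to the copy rather than recomputing them. The snippet below is a generic sketch of that caching pattern with hypothetical names, not the pandas internals themselves.

```python
from __future__ import annotations

import numpy as np


class Container:
    """Toy container with a lazily built lookup table (illustrative only)."""

    def __init__(self, arrays: list[np.ndarray]):
        self.arrays = arrays
        self._locs: np.ndarray | None = None  # built on first access

    @property
    def locs(self) -> np.ndarray:
        if self._locs is None:
            # stand-in for an expensive scan over the arrays
            self._locs = np.arange(len(self.arrays))
        return self._locs

    def copy(self) -> Container:
        new = Container([arr.copy() for arr in self.arrays])
        if self._locs is not None:
            # reuse the cached table instead of forcing the copy to rebuild it
            new._locs = self._locs.copy()
        return new
```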
Returns ------- @@ -764,7 +768,11 @@ def _slice_take_blocks_ax0( # If we've got here, fill_value was not lib.no_default blocks.append( - self._make_na_block(placement=mgr_locs, fill_value=fill_value) + self._make_na_block( + placement=mgr_locs, + fill_value=fill_value, + use_na_proxy=use_na_proxy, + ) ) else: blk = self.blocks[blkno] @@ -806,7 +814,16 @@ def _slice_take_blocks_ax0( return blocks - def _make_na_block(self, placement: BlockPlacement, fill_value=None) -> Block: + def _make_na_block( + self, placement: BlockPlacement, fill_value=None, use_na_proxy: bool = False + ) -> Block: + + if use_na_proxy: + assert fill_value is None + shape = (len(placement), self.shape[1]) + vals = np.empty(shape, dtype=np.void) + nb = NumpyBlock(vals, placement, ndim=2) + return nb if fill_value is None: fill_value = np.nan @@ -941,13 +958,11 @@ def fast_xs(self, loc: int) -> ArrayLike: n = len(self) if isinstance(dtype, ExtensionDtype): - # we'll eventually construct an ExtensionArray. - result = np.empty(n, dtype=object) - # TODO: let's just use dtype.empty? + cls = dtype.construct_array_type() + result = cls._empty((n,), dtype=dtype) else: result = np.empty(n, dtype=dtype) - - result = ensure_wrapped_if_datetimelike(result) + result = ensure_wrapped_if_datetimelike(result) for blk in self.blocks: # Such assignment may incorrectly coerce NaT to None @@ -955,9 +970,6 @@ def fast_xs(self, loc: int) -> ArrayLike: for i, rl in enumerate(blk.mgr_locs): result[rl] = blk.iget((i, loc)) - if isinstance(dtype, ExtensionDtype): - result = dtype.construct_array_type()._from_sequence(result, dtype=dtype) - return result def iget(self, i: int) -> SingleBlockManager: @@ -1010,7 +1022,7 @@ def iset(self, loc: int | slice | np.ndarray, value: ArrayLike): Set new item in-place. Does not consolidate. Adds new Block if not contained in the current set of items """ - value = extract_array(value, extract_numpy=True) + # FIXME: refactor, clearly separate broadcasting & zip-like assignment # can prob also fix the various if tests for sparse/categorical if self._blklocs is None and self.ndim > 1: @@ -1397,9 +1409,15 @@ def unstack(self, unstacker, fill_value) -> BlockManager: new_blocks.extend(blocks) columns_mask.extend(mask) + # Block._unstack should ensure this holds, + assert mask.sum() == sum(len(nb._mgr_locs) for nb in blocks) + # In turn this ensures that in the BlockManager call below + # we have len(new_columns) == sum(x.shape[0] for x in new_blocks) + # which suffices to allow us to pass verify_inegrity=False + new_columns = new_columns[columns_mask] - bm = BlockManager(new_blocks, [new_columns, new_index]) + bm = BlockManager(new_blocks, [new_columns, new_index], verify_integrity=False) return bm def to_dict(self, copy: bool = True): @@ -1425,7 +1443,7 @@ def to_dict(self, copy: bool = True): def as_array( self, transpose: bool = False, - dtype: npt.DTypeLike | None = None, + dtype: np.dtype | None = None, copy: bool = False, na_value=lib.no_default, ) -> np.ndarray: @@ -1436,7 +1454,7 @@ def as_array( ---------- transpose : bool, default False If True, transpose the return array. - dtype : object, default None + dtype : np.dtype or None, default None Data type of the return array. copy : bool, default False If True then guarantee that a copy is returned. 
A value of @@ -1465,15 +1483,7 @@ def as_array( # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" arr = blk.values.to_numpy( # type: ignore[union-attr] - # pandas/core/internals/managers.py:1428: error: Argument "dtype" to - # "to_numpy" of "ExtensionArray" has incompatible type - # "Optional[Union[dtype[Any], None, type, _SupportsDType, str, - # Union[Tuple[Any, int], Tuple[Any, Union[SupportsIndex, - # Sequence[SupportsIndex]]], List[Any], _DTypeDict, Tuple[Any, - # Any]]]]"; expected "Optional[Union[ExtensionDtype, Union[str, - # dtype[Any]], Type[str], Type[float], Type[int], Type[complex], - # Type[bool], Type[object]]]" - dtype=dtype, # type: ignore[arg-type] + dtype=dtype, na_value=na_value, ).reshape(blk.shape) else: @@ -1495,7 +1505,7 @@ def as_array( def _interleave( self, - dtype: npt.DTypeLike | ExtensionDtype | None = None, + dtype: np.dtype | None = None, na_value=lib.no_default, ) -> np.ndarray: """ @@ -1503,26 +1513,36 @@ def _interleave( Items must be contained in the blocks """ if not dtype: - dtype = interleaved_dtype([blk.dtype for blk in self.blocks]) + # Incompatible types in assignment (expression has type + # "Optional[Union[dtype[Any], ExtensionDtype]]", variable has + # type "Optional[dtype[Any]]") + dtype = interleaved_dtype( # type: ignore[assignment] + [blk.dtype for blk in self.blocks] + ) # TODO: https://github.com/pandas-dev/pandas/issues/22791 # Give EAs some input on what happens here. Sparse needs this. if isinstance(dtype, SparseDtype): dtype = dtype.subtype + dtype = cast(np.dtype, dtype) elif isinstance(dtype, ExtensionDtype): dtype = np.dtype("object") elif is_dtype_equal(dtype, str): dtype = np.dtype("object") - # error: Argument "dtype" to "empty" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], None, type, _SupportsDType, str, Union[Tuple[Any, int], - # Tuple[Any, Union[int, Sequence[int]]], List[Any], _DTypeDict, - # Tuple[Any, Any]]]" - result = np.empty(self.shape, dtype=dtype) # type: ignore[arg-type] + result = np.empty(self.shape, dtype=dtype) itemmask = np.zeros(self.shape[0]) + if dtype == np.dtype("object") and na_value is lib.no_default: + # much more performant than using to_numpy below + for blk in self.blocks: + rl = blk.mgr_locs + arr = blk.get_values(dtype) + result[rl.indexer] = arr + itemmask[rl.indexer] = 1 + return result + for blk in self.blocks: rl = blk.mgr_locs if blk.is_extension: @@ -1531,22 +1551,11 @@ def _interleave( # error: Item "ndarray" of "Union[ndarray, ExtensionArray]" has no # attribute "to_numpy" arr = blk.values.to_numpy( # type: ignore[union-attr] - # pandas/core/internals/managers.py:1485: error: Argument "dtype" to - # "to_numpy" of "ExtensionArray" has incompatible type - # "Union[dtype[Any], None, type, _SupportsDType, str, Tuple[Any, - # Union[SupportsIndex, Sequence[SupportsIndex]]], List[Any], - # _DTypeDict, Tuple[Any, Any], ExtensionDtype]"; expected - # "Optional[Union[ExtensionDtype, Union[str, dtype[Any]], Type[str], - # Type[float], Type[int], Type[complex], Type[bool], Type[object]]]" - # [arg-type] - dtype=dtype, # type: ignore[arg-type] + dtype=dtype, na_value=na_value, ) else: - # error: Argument 1 to "get_values" of "Block" has incompatible type - # "Union[ExtensionDtype, str, dtype[Any], Type[object], None]"; expected - # "Union[dtype[Any], ExtensionDtype, None]" - arr = blk.get_values(dtype) # type: ignore[arg-type] + arr = blk.get_values(dtype) result[rl.indexer] = arr 
itemmask[rl.indexer] = 1 @@ -1567,6 +1576,11 @@ def is_consolidated(self) -> bool: return self._is_consolidated def _consolidate_check(self) -> None: + if len(self.blocks) == 1: + # fastpath + self._is_consolidated = True + self._known_consolidated = True + return dtypes = [blk.dtype for blk in self.blocks if blk._can_consolidate] self._is_consolidated = len(dtypes) == len(set(dtypes)) self._known_consolidated = True @@ -1706,7 +1720,7 @@ def get_slice(self, slobj: slice, axis: int = 0) -> SingleBlockManager: blk = self._block array = blk._slice(slobj) bp = BlockPlacement(slice(0, len(array))) - block = blk.make_block_same_class(array, placement=bp) + block = type(blk)(array, placement=bp, ndim=1) new_index = self.index._getitem_slice(slobj) return type(self)(block, new_index) @@ -1789,10 +1803,20 @@ def _equal_values(self: T, other: T) -> bool: def create_block_manager_from_blocks( - blocks: list[Block], axes: list[Index], consolidate: bool = True + blocks: list[Block], + axes: list[Index], + consolidate: bool = True, + verify_integrity: bool = True, ) -> BlockManager: + # If verify_integrity=False, then caller is responsible for checking + # all(x.shape[-1] == len(axes[1]) for x in blocks) + # sum(x.shape[0] for x in blocks) == len(axes[0]) + # set(x for for blk in blocks for x in blk.mgr_locs) == set(range(len(axes[0]))) + # all(blk.ndim == 2 for blk in blocks) + # This allows us to safely pass verify_integrity=False + try: - mgr = BlockManager(blocks, axes) + mgr = BlockManager(blocks, axes, verify_integrity=verify_integrity) except ValueError as err: arrays = [blk.values for blk in blocks] diff --git a/pandas/core/internals/ops.py b/pandas/core/internals/ops.py index 5f03d6709dfa4..35caeea9b9067 100644 --- a/pandas/core/internals/ops.py +++ b/pandas/core/internals/ops.py @@ -106,28 +106,28 @@ def _get_same_shape_values( # TODO(EA2D): with 2D EAs only this first clause would be needed if not (left_ea or right_ea): - # error: Invalid index type "Tuple[Any, slice]" for "Union[ndarray, - # ExtensionArray]"; expected type "Union[int, slice, ndarray]" - lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[Union[ndarray, slice], slice]" + lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload] assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) elif left_ea and right_ea: assert lvals.shape == rvals.shape, (lvals.shape, rvals.shape) elif right_ea: # lvals are 2D, rvals are 1D - # error: Invalid index type "Tuple[Any, slice]" for "Union[ndarray, - # ExtensionArray]"; expected type "Union[int, slice, ndarray]" - lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[Union[ndarray, slice], slice]" + lvals = lvals[rblk.mgr_locs.indexer, :] # type: ignore[call-overload] assert lvals.shape[0] == 1, lvals.shape - # error: Invalid index type "Tuple[int, slice]" for "Union[Any, - # ExtensionArray]"; expected type "Union[int, slice, ndarray]" - lvals = lvals[0, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" + lvals = lvals[0, :] # type: ignore[call-overload] else: # lvals are 1D, rvals are 2D assert rvals.shape[0] == 1, rvals.shape - # error: Invalid index type "Tuple[int, slice]" for "Union[ndarray, - # ExtensionArray]"; expected type "Union[int, slice, ndarray]" - rvals = 
rvals[0, :] # type: ignore[index] + # error: No overload variant of "__getitem__" of "ExtensionArray" matches + # argument type "Tuple[int, slice]" + rvals = rvals[0, :] # type: ignore[call-overload] return lvals, rvals diff --git a/pandas/core/missing.py b/pandas/core/missing.py index 1a07b5614eb38..9e85cbec0f299 100644 --- a/pandas/core/missing.py +++ b/pandas/core/missing.py @@ -23,6 +23,7 @@ ArrayLike, Axis, F, + npt, ) from pandas.compat._optional import import_optional_dependency @@ -57,7 +58,7 @@ def check_value_size(value, mask: np.ndarray, length: int): return value -def mask_missing(arr: ArrayLike, values_to_mask) -> np.ndarray: +def mask_missing(arr: ArrayLike, values_to_mask) -> npt.NDArray[np.bool_]: """ Return a masking array of same size/shape as arr with entries equaling any member of values_to_mask set to True diff --git a/pandas/core/nanops.py b/pandas/core/nanops.py index a80bd8ba76dac..5d96e9bb6cd19 100644 --- a/pandas/core/nanops.py +++ b/pandas/core/nanops.py @@ -449,6 +449,35 @@ def _na_for_min_count(values: np.ndarray, axis: int | None) -> Scalar | np.ndarr return np.full(result_shape, fill_value, dtype=values.dtype) +def maybe_operate_rowwise(func): + """ + NumPy operations on C-contiguous ndarrays with axis=1 can be + very slow. Operate row-by-row and concatenate the results. + """ + + @functools.wraps(func) + def newfunc(values: np.ndarray, *, axis: int | None = None, **kwargs): + if ( + axis == 1 + and values.ndim == 2 + and values.flags["C_CONTIGUOUS"] + and values.dtype != object + ): + arrs = list(values) + if kwargs.get("mask") is not None: + mask = kwargs.pop("mask") + results = [ + func(arrs[i], mask=mask[i], **kwargs) for i in range(len(arrs)) + ] + else: + results = [func(x, **kwargs) for x in arrs] + return np.array(results) + + return func(values, axis=axis, **kwargs) + + return newfunc + + def nanany( values: np.ndarray, *, @@ -543,6 +572,7 @@ def nanall( @disallow("M8") @_datetimelike_compat +@maybe_operate_rowwise def nansum( values: np.ndarray, *, @@ -1111,6 +1141,7 @@ def nanargmin( @disallow("M8", "m8") +@maybe_operate_rowwise def nanskew( values: np.ndarray, *, @@ -1198,6 +1229,7 @@ def nanskew( @disallow("M8", "m8") +@maybe_operate_rowwise def nankurt( values: np.ndarray, *, @@ -1294,6 +1326,7 @@ def nankurt( @disallow("M8", "m8") +@maybe_operate_rowwise def nanprod( values: np.ndarray, *, diff --git a/pandas/core/resample.py b/pandas/core/resample.py index 88bab20cd5168..c7c2fbc2deaf5 100644 --- a/pandas/core/resample.py +++ b/pandas/core/resample.py @@ -25,6 +25,7 @@ ) from pandas._typing import ( FrameOrSeries, + IndexLabel, T, TimedeltaConvertibleTypes, TimestampConvertibleTypes, @@ -1032,6 +1033,7 @@ class _GroupByMixin(PandasObject): """ _attributes: list[str] # in practice the same as Resampler._attributes + _selection: IndexLabel | None = None def __init__(self, obj, parent=None, groupby=None, **kwargs): # reached via ._gotitem and _get_resampler_for_grouping @@ -1043,6 +1045,7 @@ def __init__(self, obj, parent=None, groupby=None, **kwargs): # the resampler attributes for attr in self._attributes: setattr(self, attr, kwargs.get(attr, getattr(parent, attr))) + self._selection = kwargs.get("selection") self.binner = parent.binner diff --git a/pandas/core/reshape/merge.py b/pandas/core/reshape/merge.py index 501ad383168a0..bdba1249ffafe 100644 --- a/pandas/core/reshape/merge.py +++ b/pandas/core/reshape/merge.py @@ -70,6 +70,7 @@ Categorical, Index, MultiIndex, + Series, ) from pandas.core import groupby import pandas.core.algorithms as 
algos @@ -81,10 +82,7 @@ from pandas.core.sorting import is_int64_overflow_possible if TYPE_CHECKING: - from pandas import ( - DataFrame, - Series, - ) + from pandas import DataFrame from pandas.core.arrays import DatetimeArray @@ -748,7 +746,7 @@ def _maybe_drop_cross_column( self, result: DataFrame, cross_col: str | None ) -> None: if cross_col is not None: - result.drop(columns=cross_col, inplace=True) + del result[cross_col] def _indicator_pre_merge( self, left: DataFrame, right: DataFrame @@ -904,17 +902,22 @@ def _maybe_add_join_keys( # error: Item "bool" of "Union[Any, bool]" has no attribute "all" if mask_left.all(): # type: ignore[union-attr] key_col = Index(rvals) + result_dtype = rvals.dtype # error: Item "bool" of "Union[Any, bool]" has no attribute "all" elif ( right_indexer is not None and mask_right.all() # type: ignore[union-attr] ): key_col = Index(lvals) + result_dtype = lvals.dtype else: key_col = Index(lvals).where(~mask_left, rvals) + result_dtype = lvals.dtype if result._is_label_reference(name): - result[name] = key_col + result[name] = Series( + key_col, dtype=result_dtype, index=result.index + ) elif result._is_level_reference(name): if isinstance(result.index, MultiIndex): key_col.name = name @@ -931,17 +934,16 @@ def _maybe_add_join_keys( else: result.insert(i, name or f"key_{i}", key_col) - def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: + def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" - # Both returned ndarrays are np.intp return get_join_indexers( self.left_join_keys, self.right_join_keys, sort=self.sort, how=self.how ) def _get_join_info( self, - ) -> tuple[Index, np.ndarray | None, np.ndarray | None]: - # Both returned ndarrays are np.intp (if not None) + ) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp] | None]: + left_ax = self.left.axes[self.axis] right_ax = self.right.axes[self.axis] @@ -1892,8 +1894,7 @@ def _get_merge_keys(self): return left_join_keys, right_join_keys, join_names - def _get_join_indexers(self) -> tuple[np.ndarray, np.ndarray]: - # Both returned ndarrays are np.intp + def _get_join_indexers(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: """return the join indexers""" def flip(xs) -> np.ndarray: @@ -1987,8 +1988,7 @@ def flip(xs) -> np.ndarray: def _get_multiindex_indexer( join_keys, index: MultiIndex, sort: bool -) -> tuple[np.ndarray, np.ndarray]: - # Both returned ndarrays are np.intp +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: # left & right join labels and num. 
of levels at each location mapped = ( @@ -2026,8 +2026,7 @@ def _get_multiindex_indexer( def _get_single_indexer( join_key, index: Index, sort: bool = False -) -> tuple[np.ndarray, np.ndarray]: - # Both returned ndarrays are np.intp +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: left_key, right_key, count = _factorize_keys(join_key, index._values, sort=sort) return libjoin.left_outer_join(left_key, right_key, count, sort=sort) @@ -2035,8 +2034,7 @@ def _get_single_indexer( def _left_join_on_index( left_ax: Index, right_ax: Index, join_keys, sort: bool = False -) -> tuple[Index, np.ndarray | None, np.ndarray]: - # Both returned ndarrays are np.intp (if not None) +) -> tuple[Index, npt.NDArray[np.intp] | None, npt.NDArray[np.intp]]: if len(join_keys) > 1: if not ( isinstance(right_ax, MultiIndex) and len(join_keys) == right_ax.nlevels @@ -2205,8 +2203,7 @@ def _factorize_keys( def _sort_labels( uniques: np.ndarray, left: np.ndarray, right: np.ndarray -) -> tuple[np.ndarray, np.ndarray]: - # Both returned ndarrays are np.intp +) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.intp]]: llength = len(left) labels = np.concatenate([left, right]) diff --git a/pandas/core/reshape/reshape.py b/pandas/core/reshape/reshape.py index 8d1d258a5c84c..567f0c20bbfd5 100644 --- a/pandas/core/reshape/reshape.py +++ b/pandas/core/reshape/reshape.py @@ -189,6 +189,18 @@ def _make_selectors(self): self.unique_groups = obs_ids self.compressor = comp_index.searchsorted(np.arange(ngroups)) + @cache_readonly + def mask_all(self) -> bool: + return bool(self.mask.all()) + + @cache_readonly + def arange_result(self) -> tuple[npt.NDArray[np.intp], npt.NDArray[np.bool_]]: + # We cache this for re-use in ExtensionBlock._unstack + dummy_arr = np.arange(len(self.index), dtype=np.intp) + new_values, mask = self.get_new_values(dummy_arr, fill_value=-1) + return new_values, mask.any(0) + # TODO: in all tests we have mask.any(0).all(); can we rely on that? 
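The new `maybe_operate_rowwise` decorator in `pandas/core/nanops.py` (shown a little earlier in this diff) works around slow NumPy reductions along ``axis=1`` on C-contiguous arrays by looping over rows and stacking the results. A standalone sketch of the same pattern, using an illustrative name rather than the pandas decorator itself:

```python
import functools

import numpy as np


def rowwise_if_beneficial(func):
    """Run a reduction row-by-row when a C-contiguous 2-D array is reduced
    along axis=1 (illustrative stand-in for nanops.maybe_operate_rowwise)."""

    @functools.wraps(func)
    def wrapper(values, *, axis=None, **kwargs):
        if (
            axis == 1
            and values.ndim == 2
            and values.flags["C_CONTIGUOUS"]
            and values.dtype != object
        ):
            # each row is contiguous, so the per-row reduction is fast
            return np.array([func(row, **kwargs) for row in values])
        return func(values, axis=axis, **kwargs)

    return wrapper


@rowwise_if_beneficial
def nan_sum(values, *, axis=None):
    return np.nansum(values, axis=axis)


arr = np.arange(12.0).reshape(3, 4)  # C-contiguous by default
print(nan_sum(arr, axis=1))          # [ 6. 22. 38.]
```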
+ def get_result(self, values, value_columns, fill_value): if values.ndim == 1: @@ -216,7 +228,7 @@ def get_new_values(self, values, fill_value=None): result_width = width * stride result_shape = (length, result_width) mask = self.mask - mask_all = mask.all() + mask_all = self.mask_all # we can simply reshape if we don't have a mask if mask_all and len(values): @@ -510,7 +522,11 @@ def _unstack_extension_series(series, level, fill_value): # Defer to the logic in ExtensionBlock._unstack df = series.to_frame() result = df.unstack(level=level, fill_value=fill_value) - return result.droplevel(level=0, axis=1) + + # equiv: result.droplevel(level=0, axis=1) + # but this avoids an extra copy + result.columns = result.columns.droplevel(0) + return result def stack(frame, level=-1, dropna=True): diff --git a/pandas/core/series.py b/pandas/core/series.py index a5ec4125f54a4..6f964ab09e978 100644 --- a/pandas/core/series.py +++ b/pandas/core/series.py @@ -144,6 +144,11 @@ if TYPE_CHECKING: + from pandas._typing import ( + NumpySorter, + NumpyValueArrayLike, + ) + from pandas.core.frame import DataFrame from pandas.core.groupby.generic import SeriesGroupBy from pandas.core.resample import Resampler @@ -2778,7 +2783,12 @@ def __rmatmul__(self, other): return self.dot(np.transpose(other)) @doc(base.IndexOpsMixin.searchsorted, klass="Series") - def searchsorted(self, value, side="left", sorter=None) -> np.ndarray: + def searchsorted( + self, + value: NumpyValueArrayLike, + side: Literal["left", "right"] = "left", + sorter: NumpySorter = None, + ) -> npt.NDArray[np.intp] | np.intp: return algorithms.searchsorted(self._values, value, side=side, sorter=sorter) # ------------------------------------------------------------------- diff --git a/pandas/core/strings/accessor.py b/pandas/core/strings/accessor.py index 55a55d0111397..4ea29edb7d41b 100644 --- a/pandas/core/strings/accessor.py +++ b/pandas/core/strings/accessor.py @@ -1907,6 +1907,69 @@ def rstrip(self, to_strip=None): result = self._data.array._str_rstrip(to_strip) return self._wrap_result(result) + _shared_docs[ + "str_removefix" + ] = r""" + Remove a %(side)s from an object series. If the %(side)s is not present, + the original string will be returned. + + Parameters + ---------- + %(side)s : str + %(side)s to remove. + + Returns + ------- + Series/Index: object + The Series or Index with given %(side)s removed. + + See Also + -------- + Series.str.remove%(other_side)s : Remove a %(other_side)s from an object series. 
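The `_unstack_extension_series` change just above swaps `result.droplevel(level=0, axis=1)` for a direct assignment to `result.columns`, which produces the same labels without the extra frame copy the comment mentions. A quick illustration with plain example data, nothing pandas-internal:

```python
import pandas as pd

df = pd.DataFrame(
    [[1, 2], [3, 4]],
    columns=pd.MultiIndex.from_tuples([("a", "x"), ("a", "y")]),
)

via_method = df.droplevel(level=0, axis=1)    # returns a new frame (extra copy)
df.columns = df.columns.droplevel(0)          # same labels, no frame copy
print(df.columns.tolist())                    # ['x', 'y']
print(via_method.columns.equals(df.columns))  # True
```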
+ + Examples + -------- + >>> s = pd.Series(["str_foo", "str_bar", "no_prefix"]) + >>> s + 0 str_foo + 1 str_bar + 2 no_prefix + dtype: object + >>> s.str.removeprefix("str_") + 0 foo + 1 bar + 2 no_prefix + dtype: object + + >>> s = pd.Series(["foo_str", "bar_str", "no_suffix"]) + >>> s + 0 foo_str + 1 bar_str + 2 no_suffix + dtype: object + >>> s.str.removesuffix("_str") + 0 foo + 1 bar + 2 no_suffix + dtype: object + """ + + @Appender( + _shared_docs["str_removefix"] % {"side": "prefix", "other_side": "suffix"} + ) + @forbid_nonstring_types(["bytes"]) + def removeprefix(self, prefix): + result = self._data.array._str_removeprefix(prefix) + return self._wrap_result(result) + + @Appender( + _shared_docs["str_removefix"] % {"side": "suffix", "other_side": "prefix"} + ) + @forbid_nonstring_types(["bytes"]) + def removesuffix(self, suffix): + result = self._data.array._str_removesuffix(suffix) + return self._wrap_result(result) + @forbid_nonstring_types(["bytes"]) def wrap(self, width, **kwargs): r""" diff --git a/pandas/core/strings/base.py b/pandas/core/strings/base.py index cd71844d3b527..ef0c3f8c2321d 100644 --- a/pandas/core/strings/base.py +++ b/pandas/core/strings/base.py @@ -3,11 +3,15 @@ import abc from collections.abc import Callable # noqa: PDF001 import re +from typing import TYPE_CHECKING import numpy as np from pandas._typing import Scalar +if TYPE_CHECKING: + from pandas import Series + class BaseStringArrayMethods(abc.ABC): """ @@ -223,6 +227,14 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): pass + @abc.abstractmethod + def _str_removeprefix(self, prefix: str) -> Series: + pass + + @abc.abstractmethod + def _str_removesuffix(self, suffix: str) -> Series: + pass + @abc.abstractmethod def _str_split(self, pat=None, n=-1, expand=False): pass diff --git a/pandas/core/strings/object_array.py b/pandas/core/strings/object_array.py index 02bdb7f181583..76ee55ef5f9ad 100644 --- a/pandas/core/strings/object_array.py +++ b/pandas/core/strings/object_array.py @@ -3,6 +3,7 @@ from collections.abc import Callable # noqa: PDF001 import re import textwrap +from typing import TYPE_CHECKING import unicodedata import numpy as np @@ -20,6 +21,9 @@ from pandas.core.strings.base import BaseStringArrayMethods +if TYPE_CHECKING: + from pandas import Series + class ObjectStringArrayMixin(BaseStringArrayMethods): """ @@ -36,7 +40,7 @@ def _str_map( self, f, na_value=None, dtype: Dtype | None = None, convert: bool = True ): """ - Map a callable over valid element of the array. + Map a callable over valid elements of the array. 
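The new `Series.str.removeprefix` / `removesuffix` accessors added above strip an exact leading or trailing substring once, which is easy to confuse with `str.lstrip`/`rstrip` (those remove any run of the given characters). A short comparison, assuming a pandas build that includes this change:

```python
import pandas as pd

s = pd.Series(["str_foo", "string", "no_prefix"])

# removeprefix drops the exact prefix "str_" only where it is present ...
print(s.str.removeprefix("str_").tolist())  # ['foo', 'string', 'no_prefix']

# ... while lstrip keeps removing any of the characters {'s', 't', 'r', '_'}
print(s.str.lstrip("str_").tolist())        # ['foo', 'ing', 'no_prefix']
```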
Parameters ---------- @@ -414,6 +418,30 @@ def _str_lstrip(self, to_strip=None): def _str_rstrip(self, to_strip=None): return self._str_map(lambda x: x.rstrip(to_strip)) + def _str_removeprefix(self, prefix: str) -> Series: + # outstanding question on whether to use native methods for users + # on Python 3.9+ https://git.io/JE9QK, in which case we could do + # return self._str_map(str.removeprefix) + + def removeprefix(text: str) -> str: + if text.startswith(prefix): + return text[len(prefix) :] + return text + + return self._str_map(removeprefix) + + def _str_removesuffix(self, suffix: str) -> Series: + # this could be used on Python 3.9+ + # f = lambda x: x.removesuffix(suffix) + # return self._str_map(str.removesuffix) + + def removesuffix(text: str) -> str: + if text.endswith(suffix): + return text[: -len(suffix)] + return text + + return self._str_map(removesuffix) + def _str_extract(self, pat: str, flags: int = 0, expand: bool = True): regex = re.compile(pat, flags=flags) na_value = self._str_na_value diff --git a/pandas/core/window/common.py b/pandas/core/window/common.py index e0720c5d86df1..15144116fa924 100644 --- a/pandas/core/window/common.py +++ b/pandas/core/window/common.py @@ -83,8 +83,24 @@ def dataframe_from_int_dict(data, frame_template): # mypy needs to know columns is a MultiIndex, Index doesn't # have levels attribute arg2.columns = cast(MultiIndex, arg2.columns) - result.index = MultiIndex.from_product( - arg2.columns.levels + [result_index] + # GH 21157: Equivalent to MultiIndex.from_product( + # [result_index], , + # ) + # A normal MultiIndex.from_product will produce too many + # combinations. + result_level = np.tile( + result_index, len(result) // len(result_index) + ) + arg2_levels = ( + np.repeat( + arg2.columns.get_level_values(i), + len(result) // len(arg2.columns), + ) + for i in range(arg2.columns.nlevels) + ) + result_names = list(arg2.columns.names) + [result_index.name] + result.index = MultiIndex.from_arrays( + [*arg2_levels, result_level], names=result_names ) # GH 34440 num_levels = len(result.index.levels) diff --git a/pandas/core/window/ewm.py b/pandas/core/window/ewm.py index f0bba8ae9727f..7b58af87fb1d8 100644 --- a/pandas/core/window/ewm.py +++ b/pandas/core/window/ewm.py @@ -21,6 +21,7 @@ from pandas.compat.numpy import function as nv from pandas.util._decorators import doc +from pandas.util._exceptions import find_stack_level from pandas.core.dtypes.common import is_datetime64_ns_dtype from pandas.core.dtypes.missing import isna @@ -315,6 +316,15 @@ def __init__( if not self.adjust: raise NotImplementedError("times is not supported with adjust=False.") if isinstance(self.times, str): + warnings.warn( + ( + "Specifying times as a string column label is deprecated " + "and will be removed in a future version. Pass the column " + "into times instead." 
+ ), + FutureWarning, + stacklevel=find_stack_level(), + ) self.times = self._selected_obj[self.times] if not is_datetime64_ns_dtype(self.times): raise ValueError("times must be datetime64[ns] dtype.") diff --git a/pandas/core/window/rolling.py b/pandas/core/window/rolling.py index ab23b84a3b8c6..a8e2ecf3d7f54 100644 --- a/pandas/core/window/rolling.py +++ b/pandas/core/window/rolling.py @@ -50,6 +50,7 @@ from pandas.core.algorithms import factorize from pandas.core.apply import ResamplerWindowApply +from pandas.core.arrays import ExtensionArray from pandas.core.base import ( DataError, SelectionMixin, @@ -317,7 +318,10 @@ def _prep_values(self, values: ArrayLike) -> np.ndarray: # GH #12373 : rolling functions error on float32 data # make sure the data is coerced to float64 try: - values = ensure_float64(values) + if isinstance(values, ExtensionArray): + values = values.to_numpy(np.float64, na_value=np.nan) + else: + values = ensure_float64(values) except (ValueError, TypeError) as err: raise TypeError(f"cannot handle this type -> {values.dtype}") from err @@ -762,7 +766,8 @@ def _gotitem(self, key, ndim, subset=None): # here so our index is carried through to the selected obj # when we do the splitting for the groupby if self.on is not None: - self.obj = self.obj.set_index(self._on) + # GH 43355 + subset = self.obj.set_index(self._on) return super()._gotitem(key, ndim, subset=subset) def _validate_monotonic(self): diff --git a/pandas/io/feather_format.py b/pandas/io/feather_format.py index b5d819fefb370..b1ff188a7906b 100644 --- a/pandas/io/feather_format.py +++ b/pandas/io/feather_format.py @@ -9,12 +9,12 @@ from pandas.compat._optional import import_optional_dependency from pandas.util._decorators import doc -from pandas import ( +from pandas.core import generic +from pandas.core.api import ( DataFrame, Int64Index, RangeIndex, ) -from pandas.core import generic from pandas.io.common import get_handle diff --git a/pandas/io/formats/style.py b/pandas/io/formats/style.py index 8432a0b61b89a..2a063501976da 100644 --- a/pandas/io/formats/style.py +++ b/pandas/io/formats/style.py @@ -52,6 +52,7 @@ from pandas.io.formats.style_render import ( CSSProperties, CSSStyles, + ExtFormatter, StylerRenderer, Subset, Tooltips, @@ -85,8 +86,11 @@ class Styler(StylerRenderer): ---------- data : Series or DataFrame Data to be styled - either a Series or DataFrame. - precision : int - Precision to round floats to, defaults to pd.options.display.precision. + precision : int, optional + Precision to round floats to. If not given defaults to + ``pandas.options.styler.format.precision``. + + .. versionchanged:: 1.4.0 table_styles : list-like, default None List of {selector: (attr, value)} dicts; see Notes. uuid : str, default None @@ -103,7 +107,8 @@ class Styler(StylerRenderer): number and ```` is the column number. na_rep : str, optional Representation for missing values. - If ``na_rep`` is None, no special formatting is applied. + If ``na_rep`` is None, no special formatting is applied, and falls back to + ``pandas.options.styler.format.na_rep``. .. versionadded:: 1.0.0 @@ -113,13 +118,15 @@ class Styler(StylerRenderer): .. versionadded:: 1.2.0 - decimal : str, default "." - Character used as decimal separator for floats, complex and integers + decimal : str, optional + Character used as decimal separator for floats, complex and integers. If not + given uses ``pandas.options.styler.format.decimal``. .. 
versionadded:: 1.3.0 thousands : str, optional, default None - Character used as thousands separator for floats, complex and integers + Character used as thousands separator for floats, complex and integers. If not + given uses ``pandas.options.styler.format.thousands``. .. versionadded:: 1.3.0 @@ -128,9 +135,14 @@ class Styler(StylerRenderer): in cell display string with HTML-safe sequences. Use 'latex' to replace the characters ``&``, ``%``, ``$``, ``#``, ``_``, ``{``, ``}``, ``~``, ``^``, and ``\`` in the cell display string with - LaTeX-safe sequences. + LaTeX-safe sequences. If not given uses ``pandas.options.styler.format.escape`` .. versionadded:: 1.3.0 + formatter : str, callable, dict, optional + Object to define how values are displayed. See ``Styler.format``. If not given + uses ``pandas.options.styler.format.formatter``. + + .. versionadded:: 1.4.0 Attributes ---------- @@ -184,9 +196,10 @@ def __init__( cell_ids: bool = True, na_rep: str | None = None, uuid_len: int = 5, - decimal: str = ".", + decimal: str | None = None, thousands: str | None = None, escape: str | None = None, + formatter: ExtFormatter | None = None, ): super().__init__( data=data, @@ -196,13 +209,21 @@ def __init__( table_attributes=table_attributes, caption=caption, cell_ids=cell_ids, + precision=precision, ) # validate ordered args + thousands = thousands or get_option("styler.format.thousands") + decimal = decimal or get_option("styler.format.decimal") + na_rep = na_rep or get_option("styler.format.na_rep") + escape = escape or get_option("styler.format.escape") + formatter = formatter or get_option("styler.format.formatter") + # precision is handled by superclass as default for performance + self.precision = precision # can be removed on set_precision depr cycle self.na_rep = na_rep # can be removed on set_na_rep depr cycle self.format( - formatter=None, + formatter=formatter, precision=precision, na_rep=na_rep, escape=escape, @@ -210,11 +231,19 @@ def __init__( thousands=thousands, ) - def _repr_html_(self) -> str: + def _repr_html_(self) -> str | None: """ - Hooks into Jupyter notebook rich display system. + Hooks into Jupyter notebook rich display system, which calls _repr_html_ by + default if an object is returned at the end of a cell. """ - return self.to_html() + if get_option("styler.render.repr") == "html": + return self.to_html() + return None + + def _repr_latex_(self) -> str | None: + if get_option("styler.render.repr") == "latex": + return self.to_latex() + return None def render( self, @@ -437,8 +466,8 @@ def to_latex( caption: str | tuple | None = None, sparse_index: bool | None = None, sparse_columns: bool | None = None, - multirow_align: str = "c", - multicol_align: str = "r", + multirow_align: str | None = None, + multicol_align: str | None = None, siunitx: bool = False, environment: str | None = None, encoding: str | None = None, @@ -491,22 +520,32 @@ def to_latex( Whether to sparsify the display of a hierarchical index. Setting to False will display each explicit level element in a hierarchical key for each column. Defaults to ``pandas.options.styler.sparse.columns`` value. - multirow_align : {"c", "t", "b"} + multirow_align : {"c", "t", "b", "naive"}, optional If sparsifying hierarchical MultiIndexes whether to align text centrally, - at the top or bottom. - multicol_align : {"r", "c", "l"} + at the top or bottom using the multirow package. If not given defaults to + ``pandas.options.styler.latex.multirow_align``. If "naive" is given renders + without multirow. + + .. 
versionchanged:: 1.4.0 + multicol_align : {"r", "c", "l", "naive-l", "naive-r"}, optional If sparsifying hierarchical MultiIndex columns whether to align text at - the left, centrally, or at the right. + the left, centrally, or at the right. If not given defaults to + ``pandas.options.styler.latex.multicol_align``. If a naive option is + given renders without multicol. + + .. versionchanged:: 1.4.0 siunitx : bool, default False Set to ``True`` to structure LaTeX compatible with the {siunitx} package. environment : str, optional If given, the environment that will replace 'table' in ``\\begin{table}``. If 'longtable' is specified then a more suitable template is - rendered. + rendered. If not given defaults to + ``pandas.options.styler.latex.environment``. .. versionadded:: 1.4.0 - encoding : str, default "utf-8" - Character encoding setting. + encoding : str, optional + Character encoding setting. Defaults + to ``pandas.options.styler.render.encoding`` value of "utf-8". convert_css : bool, default False Convert simple cell-styles from CSS to LaTeX format. Any CSS not found in conversion table is dropped. A style can be forced by adding option @@ -818,7 +857,9 @@ def to_latex( sparse_index = get_option("styler.sparse.index") if sparse_columns is None: sparse_columns = get_option("styler.sparse.columns") - + environment = environment or get_option("styler.latex.environment") + multicol_align = multicol_align or get_option("styler.latex.multicol_align") + multirow_align = multirow_align or get_option("styler.latex.multirow_align") latex = obj._render_latex( sparse_index=sparse_index, sparse_columns=sparse_columns, @@ -826,9 +867,13 @@ def to_latex( multicol_align=multicol_align, environment=environment, convert_css=convert_css, + siunitx=siunitx, ) - return save_to_buffer(latex, buf=buf, encoding=encoding) + encoding = encoding or get_option("styler.render.encoding") + return save_to_buffer( + latex, buf=buf, encoding=None if buf is None else encoding + ) def to_html( self, @@ -838,6 +883,8 @@ def to_html( table_attributes: str | None = None, sparse_index: bool | None = None, sparse_columns: bool | None = None, + bold_headers: bool = False, + caption: str | None = None, encoding: str | None = None, doctype_html: bool = False, exclude_styles: bool = False, @@ -875,10 +922,18 @@ def to_html( will display each explicit level element in a hierarchical key for each column. Defaults to ``pandas.options.styler.sparse.columns`` value. + .. versionadded:: 1.4.0 + bold_headers : bool, optional + Adds "font-weight: bold;" as a CSS property to table style header cells. + + .. versionadded:: 1.4.0 + caption : str, optional + Set, or overwrite, the caption on Styler before rendering. + .. versionadded:: 1.4.0 encoding : str, optional - Character encoding setting for file output, and HTML meta tags, - defaults to "utf-8" if None. + Character encoding setting for file output, and HTML meta tags. + Defaults to ``pandas.options.styler.render.encoding`` value of "utf-8". doctype_html : bool, default False Whether to output a fully structured HTML file including all HTML elements, or just the core ``