diff --git a/pandas/core/frame.py b/pandas/core/frame.py index 27713b5bde201..3f943ddc3d433 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -159,7 +159,7 @@ from pandas.io.common import get_handle from pandas.io.formats import console, format as fmt -from pandas.io.formats.info import DataFrameInfo +from pandas.io.formats.info import BaseInfo, DataFrameInfo import pandas.plotting if TYPE_CHECKING: @@ -2523,16 +2523,25 @@ def to_html( @Substitution( klass="DataFrame", type_sub=" and columns", - max_cols_sub=( - """max_cols : int, optional + max_cols_sub=dedent( + """\ + max_cols : int, optional When to switch from the verbose to the truncated output. If the DataFrame has more than `max_cols` columns, the truncated output is used. By default, the setting in - ``pandas.options.display.max_info_columns`` is used. - """ + ``pandas.options.display.max_info_columns`` is used.""" ), - examples_sub=( - """ + null_counts_sub=dedent( + """\ + null_counts : bool, optional + Whether to show the non-null counts. By default, this is shown + only if the DataFrame is smaller than + ``pandas.options.display.max_info_rows`` and + ``pandas.options.display.max_info_columns``. A value of True always + shows the counts, and False never shows the counts.""" + ), + examples_sub=dedent( + """\ >>> int_values = [1, 2, 3, 4, 5] >>> text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon'] >>> float_values = [0.0, 0.25, 0.5, 0.75, 1.0] @@ -2615,14 +2624,15 @@ def to_html( dtypes: object(3) memory usage: 165.9 MB""" ), - see_also_sub=( - """ + see_also_sub=dedent( + """\ DataFrame.describe: Generate descriptive statistics of DataFrame columns. DataFrame.memory_usage: Memory usage of DataFrame columns.""" ), + version_added_sub="", ) - @doc(DataFrameInfo.to_buffer) + @doc(BaseInfo.render) def info( self, verbose: Optional[bool] = None, @@ -2635,7 +2645,7 @@ def info( data=self, memory_usage=memory_usage, ) - info.to_buffer( + info.render( buf=buf, max_cols=max_cols, verbose=verbose, diff --git a/pandas/io/formats/info.py b/pandas/io/formats/info.py index 891b3ea7af0e2..563dbaa06e526 100644 --- a/pandas/io/formats/info.py +++ b/pandas/io/formats/info.py @@ -1,10 +1,20 @@ from abc import ABC, abstractmethod import sys -from typing import IO, TYPE_CHECKING, Iterator, List, Mapping, Optional, Sequence, Union +from typing import ( + IO, + TYPE_CHECKING, + Iterable, + Iterator, + List, + Mapping, + Optional, + Sequence, + Union, +) from pandas._config import get_option -from pandas._typing import Dtype, FrameOrSeries +from pandas._typing import Dtype, FrameOrSeriesUnion from pandas.core.indexes.api import Index @@ -13,7 +23,6 @@ if TYPE_CHECKING: from pandas.core.frame import DataFrame - from pandas.core.series import Series def _put_str(s: Union[str, Dtype], space: int) -> str: @@ -83,11 +92,12 @@ def _initialize_memory_usage( class BaseInfo(ABC): - """Base class for DataFrameInfo and SeriesInfo. + """ + Base class for DataFrameInfo and SeriesInfo. Parameters ---------- - data : FrameOrSeries + data : DataFrame or Series Either dataframe or series. memory_usage : bool or str, optional If "deep", introspect the data deeply by interrogating object dtypes @@ -95,18 +105,20 @@ class BaseInfo(ABC): values. """ - def __init__( - self, - data: FrameOrSeries, - memory_usage: Optional[Union[bool, str]] = None, - ): - self.data = data - self.memory_usage = _initialize_memory_usage(memory_usage) + data: FrameOrSeriesUnion + memory_usage: Union[bool, str] @property @abstractmethod - def ids(self) -> Index: - """Column names or index names.""" + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes : sequence + Dtype of each of the DataFrame's columns (or one series column). + """ @property @abstractmethod @@ -120,30 +132,15 @@ def non_null_counts(self) -> Sequence[int]: @property @abstractmethod - def dtypes(self) -> "Series": - """Dtypes. - - Returns - ------- - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property def memory_usage_bytes(self) -> int: - """Memory usage in bytes. + """ + Memory usage in bytes. Returns ------- memory_usage_bytes : int Object's total memory usage in bytes. """ - if self.memory_usage == "deep": - deep = True - else: - deep = False - return self.data.memory_usage(index=True, deep=deep).sum() @property def memory_usage_string(self) -> str: @@ -165,49 +162,8 @@ def size_qualifier(self) -> str: size_qualifier = "+" return size_qualifier - -class DataFrameInfo(BaseInfo): - """Class storing dataframe-specific info.""" - - @property - def ids(self) -> Index: - """Column names. - - Returns - ------- - ids : Index - DataFrame's column names. - """ - return self.data.columns - - @property - def dtypes(self) -> "Series": - """Dtypes. - - Returns - ------- - dtypes : Series - Dtype of each of the DataFrame's columns. - """ - return self.data.dtypes - - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - # groupby dtype.name to collect e.g. Categorical columns - return self.dtypes.value_counts().groupby(lambda x: x.name).sum() - - @property - def non_null_counts(self) -> Sequence[int]: - """Sequence of non-null counts for all columns.""" - return self.data.count() - - @property - def col_count(self) -> int: - """Number of columns to be summarized.""" - return len(self.ids) - - def to_buffer( + @abstractmethod + def render( self, *, buf: Optional[IO[str]], @@ -220,6 +176,7 @@ def to_buffer( This method prints information about a %(klass)s including the index dtype%(type_sub)s, non-null values and memory usage. + %(version_added_sub)s\ Parameters ---------- @@ -246,12 +203,7 @@ def to_buffer( consume the same memory amount for corresponding dtypes. With deep memory introspection, a real memory usage calculation is performed at the cost of computational resources. - null_counts : bool, optional - Whether to show the non-null counts. By default, this is shown - only if the %(klass)s is smaller than - ``pandas.options.display.max_info_rows`` and - ``pandas.options.display.max_info_columns``. A value of True always - shows the counts, and False never shows the counts. + %(null_counts_sub)s Returns ------- @@ -266,7 +218,76 @@ def to_buffer( -------- %(examples_sub)s """ - printer = InfoPrinter( + + +class DataFrameInfo(BaseInfo): + """ + Class storing dataframe-specific info. + """ + + def __init__( + self, + data: "DataFrame", + memory_usage: Optional[Union[bool, str]] = None, + ): + self.data: "DataFrame" = data + self.memory_usage = _initialize_memory_usage(memory_usage) + + @property + def dtype_counts(self) -> Mapping[str, int]: + return _get_dataframe_dtype_counts(self.data) + + @property + def dtypes(self) -> Iterable[Dtype]: + """ + Dtypes. + + Returns + ------- + dtypes + Dtype of each of the DataFrame's columns. + """ + return self.data.dtypes + + @property + def ids(self) -> Index: + """ + Column names. + + Returns + ------- + ids : Index + DataFrame's column names. + """ + return self.data.columns + + @property + def col_count(self) -> int: + """Number of columns to be summarized.""" + return len(self.ids) + + @property + def non_null_counts(self) -> Sequence[int]: + """Sequence of non-null counts for all columns or column (if series).""" + return self.data.count() + + @property + def memory_usage_bytes(self) -> int: + if self.memory_usage == "deep": + deep = True + else: + deep = False + return self.data.memory_usage(index=True, deep=deep).sum() + + def render( + self, + *, + buf: Optional[IO[str]], + max_cols: Optional[int], + verbose: Optional[bool], + show_counts: Optional[bool], + ) -> None: + printer = DataFrameInfoPrinter( info=self, max_cols=max_cols, verbose=verbose, @@ -275,8 +296,27 @@ def to_buffer( printer.to_buffer(buf) -class InfoPrinter: - """Class for printing dataframe or series info. +class InfoPrinterAbstract: + """ + Class for printing dataframe or series info. + """ + + def to_buffer(self, buf: Optional[IO[str]] = None) -> None: + """Save dataframe info into buffer.""" + table_builder = self._create_table_builder() + lines = table_builder.get_lines() + if buf is None: # pragma: no cover + buf = sys.stdout + fmt.buffer_put_lines(buf, lines) + + @abstractmethod + def _create_table_builder(self) -> "TableBuilderAbstract": + """Create instance of table builder.""" + + +class DataFrameInfoPrinter(InfoPrinterAbstract): + """ + Class for printing dataframe info. Parameters ---------- @@ -334,14 +374,6 @@ def _initialize_show_counts(self, show_counts: Optional[bool]) -> bool: else: return show_counts - def to_buffer(self, buf: Optional[IO[str]] = None) -> None: - """Save dataframe info into buffer.""" - table_builder = self._create_table_builder() - lines = table_builder.get_lines() - if buf is None: # pragma: no cover - buf = sys.stdout - fmt.buffer_put_lines(buf, lines) - def _create_table_builder(self) -> "DataFrameTableBuilder": """ Create instance of table builder based on verbosity and display settings. @@ -364,26 +396,73 @@ def _create_table_builder(self) -> "DataFrameTableBuilder": class TableBuilderAbstract(ABC): - """Abstract builder for info table. - - Parameters - ---------- - info : BaseInfo - Instance of DataFrameInfo or SeriesInfo. + """ + Abstract builder for info table. """ _lines: List[str] - - def __init__(self, *, info): - self.info = info + info: BaseInfo @abstractmethod def get_lines(self) -> List[str]: """Product in a form of list of lines (strings).""" + @property + def data(self) -> FrameOrSeriesUnion: + return self.info.data + + @property + def dtypes(self) -> Iterable[Dtype]: + """Dtypes of each of the DataFrame's columns.""" + return self.info.dtypes + + @property + def dtype_counts(self) -> Mapping[str, int]: + """Mapping dtype - number of counts.""" + return self.info.dtype_counts + + @property + def display_memory_usage(self) -> bool: + """Whether to display memory usage.""" + return bool(self.info.memory_usage) + + @property + def memory_usage_string(self) -> str: + """Memory usage string with proper size qualifier.""" + return self.info.memory_usage_string + + @property + def non_null_counts(self) -> Sequence[int]: + return self.info.non_null_counts + + def add_object_type_line(self) -> None: + """Add line with string representation of dataframe to the table.""" + self._lines.append(str(type(self.data))) + + def add_index_range_line(self) -> None: + """Add line with range of indices to the table.""" + self._lines.append(self.data.index._summary()) + + def add_dtypes_line(self) -> None: + """Add summary line with dtypes present in dataframe.""" + collected_dtypes = [ + f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) + ] + self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") + class DataFrameTableBuilder(TableBuilderAbstract): - """Abstract builder for dataframe info table.""" + """ + Abstract builder for dataframe info table. + + Parameters + ---------- + info : DataFrameInfo. + Instance of DataFrameInfo. + """ + + def __init__(self, *, info: DataFrameInfo): + self.info: DataFrameInfo = info def get_lines(self) -> List[str]: self._lines = [] @@ -399,144 +478,62 @@ def _fill_empty_info(self) -> None: self.add_index_range_line() self._lines.append(f"Empty {type(self.data).__name__}") + @abstractmethod def _fill_non_empty_info(self) -> None: """Add lines to the info table, pertaining to non-empty dataframe.""" - self.add_object_type_line() - self.add_index_range_line() - self.add_columns_summary_line() - self.add_header_line() - self.add_separator_line() - self.add_body_lines() - self.add_dtypes_line() - if self.display_memory_usage: - self.add_memory_usage_line() @property def data(self) -> "DataFrame": """DataFrame.""" return self.info.data - @property - def dtype_counts(self) -> Mapping[str, int]: - """Mapping dtype - number of counts.""" - return self.info.dtype_counts - - @property - def non_null_counts(self) -> Sequence[int]: - return self.info.non_null_counts - - @property - def display_memory_usage(self) -> bool: - """Whether to display memory usage.""" - return self.info.memory_usage - - @property - def memory_usage_string(self) -> str: - """Memory usage string with proper size qualifier.""" - return self.info.memory_usage_string - @property def ids(self) -> Index: """Dataframe columns.""" return self.info.ids - @property - def dtypes(self) -> "Series": - """Dtypes of each of the DataFrame's columns.""" - return self.info.dtypes - @property def col_count(self) -> int: """Number of dataframe columns to be summarized.""" return self.info.col_count - def add_object_type_line(self) -> None: - """Add line with string representation of dataframe to the table.""" - self._lines.append(str(type(self.data))) - - def add_index_range_line(self) -> None: - """Add line with range of indices to the table.""" - self._lines.append(self.data.index._summary()) - - @abstractmethod - def add_columns_summary_line(self) -> None: - """Add line with columns summary to the table.""" - - @abstractmethod - def add_header_line(self) -> None: - """Add header line to the table.""" - - @abstractmethod - def add_separator_line(self) -> None: - """Add separator line between header and body of the table.""" - - @abstractmethod - def add_body_lines(self) -> None: - """Add content of the table body.""" - - def add_dtypes_line(self) -> None: - """Add summary line with dtypes present in dataframe.""" - collected_dtypes = [ - f"{key}({val:d})" for key, val in sorted(self.dtype_counts.items()) - ] - self._lines.append(f"dtypes: {', '.join(collected_dtypes)}") - def add_memory_usage_line(self) -> None: """Add line containing memory usage.""" self._lines.append(f"memory usage: {self.memory_usage_string}") class DataFrameTableBuilderNonVerbose(DataFrameTableBuilder): - """Info table builder for non-verbose output.""" + """ + Dataframe info table builder for non-verbose output. + """ + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() def add_columns_summary_line(self) -> None: self._lines.append(self.ids._summary(name="Columns")) - def add_header_line(self) -> None: - """No header in non-verbose output.""" - - def add_separator_line(self) -> None: - """No separator in non-verbose output.""" - def add_body_lines(self) -> None: - """No body in non-verbose output.""" - - -class DataFrameTableBuilderVerbose(DataFrameTableBuilder): - """Info table builder for verbose output.""" - - SPACING = " " * 2 +class TableBuilderVerboseMixin(TableBuilderAbstract): + """ + Mixin for verbose info output. + """ - def __init__( - self, - *, - info: DataFrameInfo, - with_counts: bool, - ): - super().__init__(info=info) - self.with_counts = with_counts - self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) - self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + SPACING: str = " " * 2 + strrows: Sequence[Sequence[str]] + gross_column_widths: Sequence[int] + with_counts: bool @property + @abstractmethod def headers(self) -> Sequence[str]: """Headers names of the columns in verbose table.""" - if self.with_counts: - return [" # ", "Column", "Non-Null Count", "Dtype"] - return [" # ", "Column", "Dtype"] - - def _gen_rows(self) -> Iterator[Sequence[str]]: - """Generator function yielding rows content. - - Each element represents a row comprising a sequence of strings. - """ - if self.with_counts: - return self._gen_rows_with_counts() - else: - return self._gen_rows_without_counts() - - def add_columns_summary_line(self) -> None: - self._lines.append(f"Data columns (total {self.col_count} columns):") @property def header_column_widths(self) -> Sequence[int]: @@ -556,6 +553,25 @@ def _get_body_column_widths(self) -> Sequence[int]: strcols: Sequence[Sequence[str]] = list(zip(*self.strrows)) return [max(len(x) for x in col) for col in strcols] + def _gen_rows(self) -> Iterator[Sequence[str]]: + """ + Generator function yielding rows content. + + Each element represents a row comprising a sequence of strings. + """ + if self.with_counts: + return self._gen_rows_with_counts() + else: + return self._gen_rows_without_counts() + + @abstractmethod + def _gen_rows_with_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data with counts.""" + + @abstractmethod + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: + """Iterator with string representation of body data without counts.""" + def add_header_line(self) -> None: header_line = self.SPACING.join( [ @@ -586,6 +602,55 @@ def add_body_lines(self) -> None: ) self._lines.append(body_line) + def _gen_non_null_counts(self) -> Iterator[str]: + """Iterator with string representation of non-null counts.""" + for count in self.non_null_counts: + yield f"{count} non-null" + + def _gen_dtypes(self) -> Iterator[str]: + """Iterator with string representation of column dtypes.""" + for dtype in self.dtypes: + yield pprint_thing(dtype) + + +class DataFrameTableBuilderVerbose(DataFrameTableBuilder, TableBuilderVerboseMixin): + """ + Dataframe info table builder for verbose output. + """ + + def __init__( + self, + *, + info: DataFrameInfo, + with_counts: bool, + ): + self.info = info + self.with_counts = with_counts + self.strrows: Sequence[Sequence[str]] = list(self._gen_rows()) + self.gross_column_widths: Sequence[int] = self._get_gross_column_widths() + + def _fill_non_empty_info(self) -> None: + """Add lines to the info table, pertaining to non-empty dataframe.""" + self.add_object_type_line() + self.add_index_range_line() + self.add_columns_summary_line() + self.add_header_line() + self.add_separator_line() + self.add_body_lines() + self.add_dtypes_line() + if self.display_memory_usage: + self.add_memory_usage_line() + + @property + def headers(self) -> Sequence[str]: + """Headers names of the columns in verbose table.""" + if self.with_counts: + return [" # ", "Column", "Non-Null Count", "Dtype"] + return [" # ", "Column", "Dtype"] + + def add_columns_summary_line(self) -> None: + self._lines.append(f"Data columns (total {self.col_count} columns):") + def _gen_rows_without_counts(self) -> Iterator[Sequence[str]]: """Iterator with string representation of body data without counts.""" yield from zip( @@ -613,12 +678,10 @@ def _gen_columns(self) -> Iterator[str]: for col in self.ids: yield pprint_thing(col) - def _gen_dtypes(self) -> Iterator[str]: - """Iterator with string representation of column dtypes.""" - for dtype in self.dtypes: - yield pprint_thing(dtype) - def _gen_non_null_counts(self) -> Iterator[str]: - """Iterator with string representation of non-null counts.""" - for count in self.non_null_counts: - yield f"{count} non-null" +def _get_dataframe_dtype_counts(df: "DataFrame") -> Mapping[str, int]: + """ + Create mapping between datatypes and their number of occurences. + """ + # groupby dtype.name to collect e.g. Categorical columns + return df.dtypes.value_counts().groupby(lambda x: x.name).sum()