Skip to content

Commit

Permalink
feat(rust, python): thousand separators in shape of repr DataFrame (#…
Browse files Browse the repository at this point in the history
  • Loading branch information
advoet authored Mar 26, 2023
1 parent 7c3daa0 commit aaf6155
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 10 deletions.
30 changes: 25 additions & 5 deletions polars/polars-core/src/fmt.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
use std::borrow::Cow;
use std::fmt;
use std::fmt::{Debug, Display, Formatter};
use std::sync::atomic::{AtomicU8, Ordering};
use std::{fmt, str};

#[cfg(any(
feature = "dtype-date",
Expand Down Expand Up @@ -52,7 +52,7 @@ macro_rules! format_array {
write!(
$f,
"shape: ({},)\n{}: '{}' [{}]\n[\n",
$a.len(),
fmt_uint(&$a.len()),
$array_type,
$name,
$dtype
Expand Down Expand Up @@ -133,7 +133,7 @@ fn format_object_array(
write!(
f,
"shape: ({},)\n{}: '{}' [o][{}]\n[\n",
object.len(),
fmt_uint(&object.len()),
array_type,
name,
inner_type
Expand Down Expand Up @@ -364,6 +364,24 @@ fn env_is_true(varname: &str) -> bool {
std::env::var(varname).as_deref().unwrap_or("0") == "1"
}

/// Render an unsigned integer with `_` between each group of three digits,
/// counted from the least-significant end (e.g. `1000000` -> `1_000_000`).
///
/// Values below one thousand are returned without any separator.
fn fmt_uint(num: &usize) -> String {
    let digits = num.to_string();
    let total = digits.len();
    // A `usize` always prints at least one digit, so `total - 1` cannot underflow.
    let mut grouped = String::with_capacity(total + (total - 1) / 3);
    for (pos, digit) in digits.chars().enumerate() {
        // Emit a separator whenever the digits still to be written
        // (including this one) form a whole number of groups of three.
        if pos != 0 && (total - pos) % 3 == 0 {
            grouped.push('_'); // separator
        }
        grouped.push(digit);
    }
    grouped
}

/// Format a two-element shape tuple as `(first, second)`, passing each
/// element through `fmt_uint`, e.g. `(1_000_000, 4_000)`.
fn fmt_df_shape((shape0, shape1): &(usize, usize)) -> String {
    let first = fmt_uint(shape0);
    let second = fmt_uint(shape1);
    format!("({first}, {second})")
}

impl Display for DataFrame {
fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
#[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
Expand Down Expand Up @@ -572,12 +590,14 @@ impl Display for DataFrame {
}

// establish 'shape' information (above/below/hidden)
let shape_str = fmt_df_shape(&self.shape());

if env_is_true(FMT_TABLE_HIDE_DATAFRAME_SHAPE_INFORMATION) {
write!(f, "{table}")?;
} else if env_is_true(FMT_TABLE_DATAFRAME_SHAPE_BELOW) {
write!(f, "{table}\nshape: {:?}", self.shape())?;
write!(f, "{table}\nshape: {}", shape_str)?;
} else {
write!(f, "shape: {:?}\n{}", self.shape(), table)?;
write!(f, "shape: {}\n{}", shape_str, table)?;
}
}

Expand Down
2 changes: 1 addition & 1 deletion polars/polars-sql/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ quit \q Exit

>> register taxis /home/ritchie46/example/csv-benchmark/yellow_tripdata_2010-01.parquet
Added dataframe "taxis" from file /home/ritchie46/example/csv-benchmark/yellow_tripdata_2010-01.parquet
shape: (14863778, 18)
shape: (14_863_778, 18)
┌───────────┬─────────────────────┬─────────────────────┬─────────────────┬─────┬─────────┬────────────┬──────────────┬──────────────┐
│ vendor_id ┆ pickup_datetime ┆ dropoff_datetime ┆ passenger_count ┆ ... ┆ mta_tax ┆ tip_amount ┆ tolls_amount ┆ total_amount │
│ --- ┆ --- ┆ --- ┆ --- ┆ ┆ --- ┆ --- ┆ --- ┆ --- │
Expand Down
8 changes: 4 additions & 4 deletions py-polars/polars/expr/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -1022,7 +1022,7 @@ def microsecond(self) -> Expr:
... pl.col("date").dt.microsecond().alias("microseconds"),
... ]
... )
shape: (1001, 2)
shape: (1_001, 2)
┌─────────────────────────┬──────────────┐
│ date ┆ microseconds │
│ --- ┆ --- │
Expand Down Expand Up @@ -1492,7 +1492,7 @@ def milliseconds(self) -> Expr:
... pl.col("date").diff().dt.milliseconds().alias("milliseconds_diff"),
... ]
... )
shape: (1001, 2)
shape: (1_001, 2)
┌─────────────────────────┬───────────────────┐
│ date ┆ milliseconds_diff │
│ --- ┆ --- │
Expand Down Expand Up @@ -1536,7 +1536,7 @@ def microseconds(self) -> Expr:
... pl.col("date").diff().dt.microseconds().alias("microseconds_diff"),
... ]
... )
shape: (1001, 2)
shape: (1_001, 2)
┌─────────────────────────┬───────────────────┐
│ date ┆ microseconds_diff │
│ --- ┆ --- │
Expand Down Expand Up @@ -1580,7 +1580,7 @@ def nanoseconds(self) -> Expr:
... pl.col("date").diff().dt.nanoseconds().alias("nanoseconds_diff"),
... ]
... )
shape: (1001, 2)
shape: (1_001, 2)
┌─────────────────────────┬──────────────────┐
│ date ┆ nanoseconds_diff │
│ --- ┆ --- │
Expand Down
75 changes: 75 additions & 0 deletions py-polars/tests/unit/test_cfg.py
Original file line number Diff line number Diff line change
Expand Up @@ -386,6 +386,81 @@ def test_shape_below_table_and_inlined_dtype() -> None:
)


def test_shape_format_for_big_numbers() -> None:
# Checks that the "shape: ..." line of the string repr uses "_" as a
# thousands separator, for a DataFrame (rows and columns) and for a Series.

# 1000 rows, two integer columns: a = 1..1000, b = 1001..2000.
df = pl.DataFrame({"a": range(1, 1001), "b": range(1001, 1001 + 1000)})

# Case 1: dtypes inlined in the header, shape line placed below the table.
pl.Config.set_tbl_column_data_type_inline(True).set_tbl_dataframe_shape_below(True)
pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
assert (
str(df) == ""
"╭─────────┬─────────╮\n"
"│ a (i64) ┆ b (i64) │\n"
"╞═════════╪═════════╡\n"
"│ 1 ┆ 1001 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 2 ┆ 1002 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 3 ┆ 1003 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 4 ┆ 1004 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ … ┆ … │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 997 ┆ 1997 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 998 ┆ 1998 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 999 ┆ 1999 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 1000 ┆ 2000 │\n"
"╰─────────┴─────────╯\n"
"shape: (1_000, 2)"
)

# Case 2: same frame, shape line back above the table.
pl.Config.set_tbl_column_data_type_inline(True).set_tbl_dataframe_shape_below(False)
assert (
str(df) == "shape: (1_000, 2)\n"
"╭─────────┬─────────╮\n"
"│ a (i64) ┆ b (i64) │\n"
"╞═════════╪═════════╡\n"
"│ 1 ┆ 1001 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 2 ┆ 1002 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 3 ┆ 1003 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 4 ┆ 1004 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ … ┆ … │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 997 ┆ 1997 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 998 ┆ 1998 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 999 ┆ 1999 │\n"
"├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
"│ 1000 ┆ 2000 │\n"
"╰─────────┴─────────╯"
)

# Case 3: a 1000-element Series with the row limit set to 0; the expected
# repr below contains no element values, only the header and brackets.
pl.Config.set_tbl_rows(0)
ser = pl.Series("ser", range(1000))
assert str(ser) == "shape: (1_000,)\n" "Series: 'ser' [i64]\n" "[\n" "\t\n" "]"

# Case 4: one row and 1000 columns (named "0".."999"), with row and column
# limits set to 1, so the separator shows up in the column count instead.
pl.Config.set_tbl_rows(1)
pl.Config.set_tbl_cols(1)
df = pl.DataFrame({str(col_num): 1 for col_num in range(1000)})

assert (
str(df) == "shape: (1, 1_000)\n"
"╭─────────┬───╮\n"
"│ 0 (i64) ┆ … │\n"
"╞═════════╪═══╡\n"
"│ 1 ┆ … │\n"
"╰─────────┴───╯"
)


def test_string_cache() -> None:
df1 = pl.DataFrame({"a": ["foo", "bar", "ham"], "b": [1, 2, 3]})
df2 = pl.DataFrame({"a": ["foo", "spam", "eggs"], "c": [3, 2, 2]})
Expand Down

0 comments on commit aaf6155

Please sign in to comment.