feat(rust, python): thousand separators in shape of repr DataFrame (#7775)
pola-rs · Mar 26, 2023 · aaf6155 · aaf6155
1 parent 7c3daa0
commit aaf6155
Show file tree

Hide file tree

Showing 4 changed files with 105 additions and 10 deletions.
diff --git a/polars/polars-core/src/fmt.rs b/polars/polars-core/src/fmt.rs
@@ -1,8 +1,8 @@
 #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
 use std::borrow::Cow;
-use std::fmt;
 use std::fmt::{Debug, Display, Formatter};
 use std::sync::atomic::{AtomicU8, Ordering};
+use std::{fmt, str};
 
 #[cfg(any(
     feature = "dtype-date",
@@ -52,7 +52,7 @@ macro_rules! format_array {
         write!(
             $f,
             "shape: ({},)\n{}: '{}' [{}]\n[\n",
-            $a.len(),
+            fmt_uint(&$a.len()),
             $array_type,
             $name,
             $dtype
@@ -133,7 +133,7 @@ fn format_object_array(
             write!(
                 f,
                 "shape: ({},)\n{}: '{}' [o][{}]\n[\n",
-                object.len(),
+                fmt_uint(&object.len()),
                 array_type,
                 name,
                 inner_type
@@ -364,6 +364,24 @@ fn env_is_true(varname: &str) -> bool {
     std::env::var(varname).as_deref().unwrap_or("0") == "1"
 }
 
+fn fmt_uint(num: &usize) -> String {
+    // Render `num` in decimal with `_` between groups of three digits,
+    // counted from the right, e.g. 1234567 -> "1_234_567".
+    let digits = num.to_string();
+    let len = digits.len();
+    // One separator per full group of three is an upper bound on the extra space.
+    let mut grouped = String::with_capacity(len + len / 3);
+    for (idx, digit) in digits.chars().enumerate() {
+        // A separator precedes every digit that opens a complete trailing
+        // group of three, except at the very start of the number.
+        if idx != 0 && (len - idx) % 3 == 0 {
+            grouped.push('_');
+        }
+        grouped.push(digit);
+    }
+    grouped
+}
+
+fn fmt_df_shape((shape0, shape1): &(usize, usize)) -> String {
+    // Format both shape components with thousands separators and wrap them
+    // as a tuple, e.g. (1_000_000, 4_000)
+    let first = fmt_uint(shape0);
+    let second = fmt_uint(shape1);
+    format!("({first}, {second})")
+}
+
 impl Display for DataFrame {
     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
         #[cfg(any(feature = "fmt", feature = "fmt_no_tty"))]
@@ -572,12 +590,14 @@ impl Display for DataFrame {
             }
 
             // establish 'shape' information (above/below/hidden)
+            let shape_str = fmt_df_shape(&self.shape());
+
             if env_is_true(FMT_TABLE_HIDE_DATAFRAME_SHAPE_INFORMATION) {
                 write!(f, "{table}")?;
             } else if env_is_true(FMT_TABLE_DATAFRAME_SHAPE_BELOW) {
-                write!(f, "{table}\nshape: {:?}", self.shape())?;
+                write!(f, "{table}\nshape: {}", shape_str)?;
             } else {
-                write!(f, "shape: {:?}\n{}", self.shape(), table)?;
+                write!(f, "shape: {}\n{}", shape_str, table)?;
             }
         }
 

diff --git a/polars/polars-sql/README.md b/polars/polars-sql/README.md
@@ -20,7 +20,7 @@ quit                \q          Exit
 
 >> register taxis /home/ritchie46/example/csv-benchmark/yellow_tripdata_2010-01.parquet
 Added dataframe "taxis" from file /home/ritchie46/example/csv-benchmark/yellow_tripdata_2010-01.parquet
-shape: (14863778, 18)
+shape: (14_863_778, 18)
 ┌───────────┬─────────────────────┬─────────────────────┬─────────────────┬─────┬─────────┬────────────┬──────────────┬──────────────┐
 │ vendor_id ┆ pickup_datetime     ┆ dropoff_datetime    ┆ passenger_count ┆ ... ┆ mta_tax ┆ tip_amount ┆ tolls_amount ┆ total_amount │
 │ ---       ┆ ---                 ┆ ---                 ┆ ---             ┆     ┆ ---     ┆ ---        ┆ ---          ┆ ---          │

diff --git a/py-polars/polars/expr/datetime.py b/py-polars/polars/expr/datetime.py
@@ -1022,7 +1022,7 @@ def microsecond(self) -> Expr:
         ...         pl.col("date").dt.microsecond().alias("microseconds"),
         ...     ]
         ... )
-        shape: (1001, 2)
+        shape: (1_001, 2)
         ┌─────────────────────────┬──────────────┐
         │ date                    ┆ microseconds │
         │ ---                     ┆ ---          │
@@ -1492,7 +1492,7 @@ def milliseconds(self) -> Expr:
         ...         pl.col("date").diff().dt.milliseconds().alias("milliseconds_diff"),
         ...     ]
         ... )
-        shape: (1001, 2)
+        shape: (1_001, 2)
         ┌─────────────────────────┬───────────────────┐
         │ date                    ┆ milliseconds_diff │
         │ ---                     ┆ ---               │
@@ -1536,7 +1536,7 @@ def microseconds(self) -> Expr:
         ...         pl.col("date").diff().dt.microseconds().alias("microseconds_diff"),
         ...     ]
         ... )
-        shape: (1001, 2)
+        shape: (1_001, 2)
         ┌─────────────────────────┬───────────────────┐
         │ date                    ┆ microseconds_diff │
         │ ---                     ┆ ---               │
@@ -1580,7 +1580,7 @@ def nanoseconds(self) -> Expr:
         ...         pl.col("date").diff().dt.nanoseconds().alias("nanoseconds_diff"),
         ...     ]
         ... )
-        shape: (1001, 2)
+        shape: (1_001, 2)
         ┌─────────────────────────┬──────────────────┐
         │ date                    ┆ nanoseconds_diff │
         │ ---                     ┆ ---              │

diff --git a/py-polars/tests/unit/test_cfg.py b/py-polars/tests/unit/test_cfg.py
@@ -386,6 +386,81 @@ def test_shape_below_table_and_inlined_dtype() -> None:
     )
 
 
+def test_shape_format_for_big_numbers() -> None:
+    """Check that shape values of 1000 are rendered as ``1_000`` in reprs.
+
+    Covers the DataFrame shape line (below and above the table), the Series
+    shape line, and a DataFrame with 1000 columns.
+    """
+    # NOTE(review): this test changes pl.Config and does not reset it here;
+    # presumably a fixture elsewhere restores the defaults - confirm.
+    df = pl.DataFrame({"a": range(1, 1001), "b": range(1001, 1001 + 1000)})
+
+    # Inline dtypes in the header and put the shape line below the table.
+    pl.Config.set_tbl_column_data_type_inline(True).set_tbl_dataframe_shape_below(True)
+    pl.Config.set_tbl_formatting("UTF8_FULL", rounded_corners=True)
+    assert (
+        str(df) == ""
+        "╭─────────┬─────────╮\n"
+        "│ a (i64) ┆ b (i64) │\n"
+        "╞═════════╪═════════╡\n"
+        "│ 1       ┆ 1001    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 2       ┆ 1002    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 3       ┆ 1003    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 4       ┆ 1004    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ …       ┆ …       │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 997     ┆ 1997    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 998     ┆ 1998    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 999     ┆ 1999    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 1000    ┆ 2000    │\n"
+        "╰─────────┴─────────╯\n"
+        "shape: (1_000, 2)"
+    )
+
+    # Same frame, with the shape line moved back above the table.
+    pl.Config.set_tbl_column_data_type_inline(True).set_tbl_dataframe_shape_below(False)
+    assert (
+        str(df) == "shape: (1_000, 2)\n"
+        "╭─────────┬─────────╮\n"
+        "│ a (i64) ┆ b (i64) │\n"
+        "╞═════════╪═════════╡\n"
+        "│ 1       ┆ 1001    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 2       ┆ 1002    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 3       ┆ 1003    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 4       ┆ 1004    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ …       ┆ …       │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 997     ┆ 1997    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 998     ┆ 1998    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 999     ┆ 1999    │\n"
+        "├╌╌╌╌╌╌╌╌╌┼╌╌╌╌╌╌╌╌╌┤\n"
+        "│ 1000    ┆ 2000    │\n"
+        "╰─────────┴─────────╯"
+    )
+
+    # Series repr: with zero rows configured only the ellipsis line is
+    # expected, and the shape line still uses the separator.
+    pl.Config.set_tbl_rows(0)
+    ser = pl.Series("ser", range(1000))
+    assert str(ser) == "shape: (1_000,)\n" "Series: 'ser' [i64]\n" "[\n" "\t…\n" "]"
+
+    # The second shape component is separated too: one row, 1000 columns.
+    pl.Config.set_tbl_rows(1)
+    pl.Config.set_tbl_cols(1)
+    df = pl.DataFrame({str(col_num): 1 for col_num in range(1000)})
+
+    assert (
+        str(df) == "shape: (1, 1_000)\n"
+        "╭─────────┬───╮\n"
+        "│ 0 (i64) ┆ … │\n"
+        "╞═════════╪═══╡\n"
+        "│ 1       ┆ … │\n"
+        "╰─────────┴───╯"
+    )
+
 def test_string_cache() -> None:
     df1 = pl.DataFrame({"a": ["foo", "bar", "ham"], "b": [1, 2, 3]})
     df2 = pl.DataFrame({"a": ["foo", "spam", "eggs"], "c": [3, 2, 2]})