Skip to content

Commit

Permalink
perf(rust, python): use inlined strings for field and schema (#7272)
Browse files Browse the repository at this point in the history
  • Loading branch information
ritchie46 authored Mar 1, 2023
1 parent 7f04d02 commit 59536fc
Show file tree
Hide file tree
Showing 46 changed files with 209 additions and 141 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ hashbrown = { version = "0.13.1", features = ["rayon", "ahash"] }
bitflags = "1.3"
once_cell = "1"
memchr = "2"
smartstring = { version = "1" }

[workspace.dependencies.arrow]
package = "arrow2"
Expand Down
4 changes: 2 additions & 2 deletions polars/polars-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ parquet = ["arrow/io_parquet"]
bigidx = ["polars-arrow/bigidx"]
python = []

serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde"]
serde-lazy = ["serde", "polars-arrow/serde", "indexmap/serde", "smartstring/serde"]

docs-selection = [
"ndarray",
Expand Down Expand Up @@ -174,7 +174,7 @@ regex = { version = "1.6", optional = true }
# activate if you want serde support for Series and DataFrames
serde = { version = "1", features = ["derive"], optional = true }
serde_json = { version = "1", optional = true }
smartstring = { version = "1" }
smartstring.workspace = true
thiserror.workspace = true
url = { version = "2.3.1", optional = true }
xxhash-rust.workspace = true
Expand Down
6 changes: 4 additions & 2 deletions polars/polars-core/src/chunked_array/logical/struct_/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ mod from;

use std::collections::BTreeMap;

use smartstring::alias::String as SmartString;

use super::*;
use crate::datatypes::*;
use crate::utils::index_to_chunked_index2;
Expand Down Expand Up @@ -163,7 +165,7 @@ impl StructChunked {
&self.field
}

pub fn name(&self) -> &String {
pub fn name(&self) -> &SmartString {
self.field.name()
}

Expand All @@ -176,7 +178,7 @@ impl StructChunked {
}

pub fn rename(&mut self, name: &str) {
self.field.set_name(name.to_string())
self.field.set_name(name.into())
}

pub(crate) fn try_apply_fields<F>(&self, func: F) -> PolarsResult<Self>
Expand Down
1 change: 1 addition & 0 deletions polars/polars-core/src/chunked_array/ops/sort/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ use std::cmp::Ordering;
use std::hint::unreachable_unchecked;
use std::iter::FromIterator;

#[cfg(feature = "sort_multiple")]
pub(crate) use arg_sort_multiple::argsort_multiple_row_fmt;
use arrow::bitmap::MutableBitmap;
use arrow::buffer::Buffer;
Expand Down
21 changes: 10 additions & 11 deletions polars/polars-core/src/datatypes/field.rs
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
use smartstring::alias::String as SmartString;

use super::*;

/// Characterizes the name and the [`DataType`] of a column.
#[derive(Clone, Debug, PartialEq, Eq)]
#[cfg_attr(
any(feature = "serde", feature = "serde-lazy"),
derive(Serialize, Deserialize)
)]
#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))]
pub struct Field {
pub name: String,
pub name: SmartString,
pub dtype: DataType,
}

Expand All @@ -25,12 +24,12 @@ impl Field {
#[inline]
pub fn new(name: &str, dtype: DataType) -> Self {
Field {
name: name.to_string(),
name: name.into(),
dtype,
}
}

pub fn from_owned(name: String, dtype: DataType) -> Self {
pub fn from_owned(name: SmartString, dtype: DataType) -> Self {
Field { name, dtype }
}

Expand All @@ -45,7 +44,7 @@ impl Field {
/// assert_eq!(f.name(), "Year");
/// ```
#[inline]
pub fn name(&self) -> &String {
pub fn name(&self) -> &SmartString {
&self.name
}

Expand Down Expand Up @@ -86,11 +85,11 @@ impl Field {
/// ```rust
/// # use polars_core::prelude::*;
/// let mut f = Field::new("Atomic number", DataType::UInt32);
/// f.set_name("Proton".to_owned());
/// f.set_name("Proton".into());
///
/// assert_eq!(f, Field::new("Proton", DataType::UInt32));
/// ```
pub fn set_name(&mut self, name: String) {
pub fn set_name(&mut self, name: SmartString) {
self.name = name;
}

Expand All @@ -106,7 +105,7 @@ impl Field {
/// assert_eq!(f.to_arrow(), af);
/// ```
pub fn to_arrow(&self) -> ArrowField {
ArrowField::new(&self.name, self.dtype.to_arrow(), true)
ArrowField::new(self.name.as_str(), self.dtype.to_arrow(), true)
}
}

Expand Down
4 changes: 2 additions & 2 deletions polars/polars-core/src/datatypes/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,9 @@ use num_traits::{Bounded, FromPrimitive, Num, NumCast, Zero};
use polars_arrow::data_types::IsFloat;
#[cfg(feature = "serde")]
use serde::de::{EnumAccess, Error, Unexpected, VariantAccess, Visitor};
#[cfg(feature = "serde")]
#[cfg(any(feature = "serde", feature = "serde-lazy"))]
use serde::{Deserialize, Serialize};
#[cfg(feature = "serde")]
#[cfg(any(feature = "serde", feature = "serde-lazy"))]
use serde::{Deserializer, Serializer};
pub use time_unit::*;

Expand Down
15 changes: 5 additions & 10 deletions polars/polars-core/src/frame/asof_join/groups.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@ use ahash::RandomState;
use arrow::types::NativeType;
use num_traits::Zero;
use rayon::prelude::*;
use smartstring::alias::String as SmartString;

use super::*;
use crate::frame::groupby::hashing::HASHMAP_INIT_SIZE;
Expand Down Expand Up @@ -634,8 +635,8 @@ impl DataFrame {
other: &DataFrame,
left_on: &str,
right_on: &str,
left_by: Vec<String>,
right_by: Vec<String>,
left_by: Vec<SmartString>,
right_by: Vec<SmartString>,
strategy: AsofStrategy,
tolerance: Option<AnyValue<'static>>,
suffix: Option<&str>,
Expand Down Expand Up @@ -727,14 +728,8 @@ impl DataFrame {
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
let left_by = left_by
.into_iter()
.map(|s| s.as_ref().to_string())
.collect();
let right_by = right_by
.into_iter()
.map(|s| s.as_ref().to_string())
.collect();
let left_by = left_by.into_iter().map(|s| s.as_ref().into()).collect();
let right_by = right_by.into_iter().map(|s| s.as_ref().into()).collect();
self._join_asof_by(
other, left_on, right_on, left_by, right_by, strategy, tolerance, None, None,
)
Expand Down
8 changes: 4 additions & 4 deletions polars/polars-core/src/frame/asof_join/mod.rs
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
mod asof;
mod groups;

use std::borrow::Cow;

use asof::*;
use num_traits::Bounded;
#[cfg(feature = "serde")]
use serde::{Deserialize, Serialize};
use smartstring::alias::String as SmartString;

use crate::prelude::*;
use crate::utils::slice_slice;
Expand All @@ -22,9 +22,9 @@ pub struct AsOfOptions {
/// - "2h15m"
/// - "1d6h"
/// etc
pub tolerance_str: Option<String>,
pub left_by: Option<Vec<String>>,
pub right_by: Option<Vec<String>>,
pub tolerance_str: Option<SmartString>,
pub left_by: Option<Vec<SmartString>>,
pub right_by: Option<Vec<SmartString>>,
}

fn check_asof_columns(a: &Series, b: &Series) -> PolarsResult<()> {
Expand Down
19 changes: 10 additions & 9 deletions polars/polars-core/src/frame/explode.rs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
use arrow::offset::OffsetsBuffer;
use polars_arrow::kernels::concatenate::concatenate_owned_unchecked;
#[cfg(feature = "serde")]
#[cfg(feature = "serde-lazy")]
use serde::{Deserialize, Serialize};
use smartstring::alias::String as SmartString;

use crate::chunked_array::ops::explode::offsets_to_indexes;
use crate::prelude::*;
Expand All @@ -20,12 +21,12 @@ fn get_exploded(series: &Series) -> PolarsResult<(Series, OffsetsBuffer<i64>)> {

/// Arguments for the [`DataFrame::melt`] function
#[derive(Clone, Default, Debug)]
#[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))]
pub struct MeltArgs {
pub id_vars: Vec<String>,
pub value_vars: Vec<String>,
pub variable_name: Option<String>,
pub value_name: Option<String>,
pub id_vars: Vec<SmartString>,
pub value_vars: Vec<SmartString>,
pub variable_name: Option<SmartString>,
pub value_name: Option<SmartString>,
}

impl DataFrame {
Expand Down Expand Up @@ -209,8 +210,8 @@ impl DataFrame {
/// ```
pub fn melt<I, J>(&self, id_vars: I, value_vars: J) -> PolarsResult<Self>
where
I: IntoVec<String>,
J: IntoVec<String>,
I: IntoVec<SmartString>,
J: IntoVec<SmartString>,
{
let id_vars = id_vars.into_vec();
let value_vars = value_vars.into_vec();
Expand Down Expand Up @@ -242,7 +243,7 @@ impl DataFrame {
if id_vars_set.contains(s.name()) {
None
} else {
Some(s.name().to_string())
Some(s.name().into())
}
})
.collect();
Expand Down
27 changes: 14 additions & 13 deletions polars/polars-core/src/schema.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,14 @@ use std::fmt::{Debug, Formatter};
use indexmap::IndexMap;
#[cfg(feature = "serde-lazy")]
use serde::{Deserialize, Serialize};
use smartstring::alias::String as SmartString;

use crate::prelude::*;

#[derive(Eq, Clone, Default)]
#[cfg_attr(feature = "serde-lazy", derive(Serialize, Deserialize))]
pub struct Schema {
inner: PlIndexMap<String, DataType>,
inner: PlIndexMap<SmartString, DataType>,
}

// IndexMap does not care about order.
Expand Down Expand Up @@ -90,7 +91,7 @@ impl Schema {
self.inner.is_empty()
}

pub fn rename(&mut self, old: &str, new: String) -> Option<()> {
pub fn rename(&mut self, old: &str, new: SmartString) -> Option<()> {
// we first append the new name
// and then remove the old name
// this works because the removed slot is swapped with the last value in the indexmap
Expand All @@ -100,7 +101,7 @@ impl Schema {
Some(())
}

pub fn insert_index(&self, index: usize, name: String, dtype: DataType) -> Option<Self> {
pub fn insert_index(&self, index: usize, name: SmartString, dtype: DataType) -> Option<Self> {
// any index from 0 up to and including self.len() is allowed
if index > self.len() {
return None;
Expand All @@ -125,7 +126,7 @@ impl Schema {
.ok_or_else(|| PolarsError::SchemaFieldNotFound(name.to_string().into()))
}

pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &String, &DataType)> {
pub fn try_get_full(&self, name: &str) -> PolarsResult<(usize, &SmartString, &DataType)> {
self.inner
.get_full(name)
.ok_or_else(|| PolarsError::SchemaFieldNotFound(name.to_string().into()))
Expand All @@ -135,7 +136,7 @@ impl Schema {
self.inner.remove(name)
}

pub fn get_full(&self, name: &str) -> Option<(usize, &String, &DataType)> {
pub fn get_full(&self, name: &str) -> Option<(usize, &SmartString, &DataType)> {
self.inner.get_full(name)
}

Expand All @@ -152,15 +153,15 @@ impl Schema {
.map(|dtype| Field::new(name, dtype.clone()))
}

pub fn get_index(&self, index: usize) -> Option<(&String, &DataType)> {
pub fn get_index(&self, index: usize) -> Option<(&SmartString, &DataType)> {
self.inner.get_index(index)
}

/// Returns `true` if `self.get(name)` finds an entry for `name`, i.e. the
/// lookup does not come back as `None`; returns `false` otherwise.
pub fn contains(&self, name: &str) -> bool {
self.get(name).is_some()
}

pub fn get_index_mut(&mut self, index: usize) -> Option<(&mut String, &mut DataType)> {
pub fn get_index_mut(&mut self, index: usize) -> Option<(&mut SmartString, &mut DataType)> {
self.inner.get_index_mut(index)
}

Expand All @@ -184,7 +185,7 @@ impl Schema {
/// inserted, last in order, and `None` is returned.
///
/// Computes in **O(1)** time (amortized average).
pub fn with_column(&mut self, name: String, dtype: DataType) -> Option<DataType> {
pub fn with_column(&mut self, name: SmartString, dtype: DataType) -> Option<DataType> {
self.inner.insert(name, dtype)
}

Expand All @@ -196,7 +197,7 @@ impl Schema {
let fields: Vec<_> = self
.inner
.iter()
.map(|(name, dtype)| ArrowField::new(name, dtype.to_arrow(), true))
.map(|(name, dtype)| ArrowField::new(name.as_str(), dtype.to_arrow(), true))
.collect();
ArrowSchema::from(fields)
}
Expand All @@ -211,19 +212,19 @@ impl Schema {
self.inner.iter().map(|(_name, dtype)| dtype)
}

pub fn iter_names(&self) -> impl Iterator<Item = &String> + '_ + ExactSizeIterator {
pub fn iter_names(&self) -> impl Iterator<Item = &SmartString> + '_ + ExactSizeIterator {
self.inner.iter().map(|(name, _dtype)| name)
}
pub fn iter(&self) -> impl Iterator<Item = (&String, &DataType)> + '_ {
pub fn iter(&self) -> impl Iterator<Item = (&SmartString, &DataType)> + '_ {
self.inner.iter()
}
}

/// Reference-counted (`Arc`) handle to a [`Schema`].
pub type SchemaRef = Arc<Schema>;

impl IntoIterator for Schema {
type Item = (String, DataType);
type IntoIter = <PlIndexMap<String, DataType> as IntoIterator>::IntoIter;
type Item = (SmartString, DataType);
type IntoIter = <PlIndexMap<SmartString, DataType> as IntoIterator>::IntoIter;

fn into_iter(self) -> Self::IntoIter {
self.inner.into_iter()
Expand Down
11 changes: 11 additions & 0 deletions polars/polars-core/src/utils/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ use num_traits::{One, Zero};
pub use polars_arrow::utils::{TrustMyLength, *};
use rayon::prelude::*;
pub use series::*;
use smartstring::alias::String as SmartString;
pub use supertype::*;
pub use {arrow, rayon};

Expand Down Expand Up @@ -824,6 +825,16 @@ where
}
}

impl<I, S> IntoVec<SmartString> for I
where
I: IntoIterator<Item = S>,
S: AsRef<str>,
{
fn into_vec(self) -> Vec<SmartString> {
self.into_iter().map(|s| s.as_ref().into()).collect()
}
}

/// This logic is the same as the impl on `ChunkedArray`.
/// The difference is that there is less indirection because the caller should preallocate
/// `chunk_lens` once. On the `ChunkedArray` we indirect through an `ArrayRef` which is an indirection
Expand Down
2 changes: 1 addition & 1 deletion polars/polars-io/src/ndjson_core/buffer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ pub(crate) fn init_buffers(
.iter()
.map(|(name, dtype)| {
let av_buf = (dtype, capacity).into();
let key = KnownKey::from(name);
let key = KnownKey::from(name.as_str());
Ok((BufferKey(key), Buffer(name, av_buf)))
})
.collect()
Expand Down
Loading

0 comments on commit 59536fc

Please sign in to comment.