From cdd64d0ffad2529baad71325a83aeb0cee89f4d9 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Thu, 12 Sep 2024 11:44:25 +0200
Subject: [PATCH] chore: Remove IR info from DSL (#18712)

---
 crates/polars-lazy/src/scan/ndjson.rs        |  1 -
 crates/polars-plan/src/plans/builder_dsl.rs  | 14 +---------
 .../src/plans/conversion/dsl_to_ir.rs        | 26 +++++--------------
 .../polars-plan/src/plans/conversion/mod.rs  | 22 +++++-----------
 crates/polars-plan/src/plans/mod.rs          | 19 +++-----------
 py-polars/polars/lazyframe/frame.py          |  2 --
 6 files changed, 17 insertions(+), 67 deletions(-)

diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs
index e53171d28f51..cbd7bbcee8f1 100644
--- a/crates/polars-lazy/src/scan/ndjson.rs
+++ b/crates/polars-lazy/src/scan/ndjson.rs
@@ -157,7 +157,6 @@ impl LazyFileListReader for LazyJsonLineReader {
         Ok(LazyFrame::from(DslPlan::Scan {
             sources: Arc::new(Mutex::new(self.sources.to_dsl(false))),
             file_info: Arc::new(RwLock::new(None)),
-            predicate: None,
             file_options,
             scan_type,
         }))
diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs
index bbbe12ef95de..4040087833f0 100644
--- a/crates/polars-plan/src/plans/builder_dsl.rs
+++ b/crates/polars-plan/src/plans/builder_dsl.rs
@@ -13,7 +13,6 @@ use polars_io::HiveOptions;
 #[cfg(any(feature = "parquet", feature = "csv", feature = "ipc"))]
 use polars_io::RowIndex;
 
-use crate::constants::UNLIMITED_CACHE;
 #[cfg(feature = "python")]
 use crate::prelude::python_udf::PythonFunction;
 use crate::prelude::*;
@@ -63,7 +62,6 @@ impl DslBuilder {
                 is_expanded: true,
             })),
             file_info: Arc::new(RwLock::new(Some(file_info))),
-            predicate: None,
             file_options,
             scan_type: FileScan::Anonymous {
                 function,
@@ -106,7 +104,6 @@ impl DslBuilder {
         Ok(DslPlan::Scan {
             sources: Arc::new(Mutex::new(sources)),
             file_info: Arc::new(RwLock::new(None)),
-            predicate: None,
             file_options: options,
             scan_type: FileScan::Parquet {
                 options: ParquetOptions {
@@ -148,7 +145,6 @@ impl DslBuilder {
                 glob: true,
                 include_file_paths,
             },
-            predicate: None,
             scan_type: FileScan::Ipc {
                 options,
                 cloud_options,
@@ -190,7 +186,6 @@ impl DslBuilder {
             sources: Arc::new(Mutex::new(sources)),
             file_info: Arc::new(RwLock::new(None)),
             file_options: options,
-            predicate: None,
             scan_type: FileScan::Csv {
                 options: read_options,
                 cloud_options,
@@ -202,12 +197,7 @@ impl DslBuilder {
     pub fn cache(self) -> Self {
         let input = Arc::new(self.0);
        let id = input.as_ref() as *const DslPlan as usize;
-        DslPlan::Cache {
-            input,
-            id,
-            cache_hits: UNLIMITED_CACHE,
-        }
-        .into()
+        DslPlan::Cache { input, id }.into()
     }
 
     pub fn drop(self, to_drop: Vec<Selector>, strict: bool) -> Self {
@@ -321,8 +311,6 @@ impl DslBuilder {
         DslPlan::DataFrameScan {
             df: Arc::new(df),
             schema,
-            output_schema: None,
-            filter: None,
         }
         .into()
     }
diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
index 23f2050dec4c..7cbb37bcb8e7 100644
--- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
+++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
@@ -107,7 +107,6 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
         DslPlan::Scan {
             sources,
             file_info,
-            predicate,
             mut file_options,
             mut scan_type,
         } => {
@@ -276,9 +275,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
                 file_info: resolved_file_info,
                 hive_parts,
                 output_schema: None,
-                predicate: predicate
-                    .map(|expr| to_expr_ir(expr, ctxt.expr_arena))
-                    .transpose()?,
+                predicate: None,
                 scan_type,
                 file_options,
             }
@@ -374,18 +371,11 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
                 to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(slice)))?;
             IR::Slice { input, offset, len }
         },
-        DslPlan::DataFrameScan {
+        DslPlan::DataFrameScan { df, schema } => IR::DataFrameScan {
             df,
             schema,
-            output_schema,
-            filter: selection,
-        } => IR::DataFrameScan {
-            df,
-            schema,
-            output_schema,
-            filter: selection
-                .map(|expr| to_expr_ir(expr, ctxt.expr_arena))
-                .transpose()?,
+            output_schema: None,
+            filter: None,
         },
         DslPlan::Select {
             expr,
@@ -488,17 +478,13 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
 
             return run_conversion(lp, ctxt, "sort");
         },
-        DslPlan::Cache {
-            input,
-            id,
-            cache_hits,
-        } => {
+        DslPlan::Cache { input, id } => {
             let input =
                 to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(cache)))?;
             IR::Cache {
                 input,
                 id,
-                cache_hits,
+                cache_hits: crate::constants::UNLIMITED_CACHE,
             }
         },
         DslPlan::GroupBy {
diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs
index c79b6444effe..f05482de7670 100644
--- a/crates/polars-plan/src/plans/conversion/mod.rs
+++ b/crates/polars-plan/src/plans/conversion/mod.rs
@@ -53,7 +53,7 @@ impl IR {
                 sources,
                 file_info,
                 hive_parts: _,
-                predicate,
+                predicate: _,
                 scan_type,
                 output_schema: _,
                 file_options: options,
@@ -63,7 +63,6 @@ impl IR {
                     is_expanded: true,
                 })),
                 file_info: Arc::new(RwLock::new(Some(file_info))),
-                predicate: predicate.map(|e| e.to_expr(expr_arena)),
                 scan_type,
                 file_options: options,
             },
@@ -109,14 +108,9 @@ impl IR {
             IR::DataFrameScan {
                 df,
                 schema,
-                output_schema,
-                filter: selection,
-            } => DslPlan::DataFrameScan {
-                df,
-                schema,
-                output_schema,
-                filter: selection.map(|e| e.to_expr(expr_arena)),
-            },
+                output_schema: _,
+                filter: _,
+            } => DslPlan::DataFrameScan { df, schema },
             IR::Select {
                 expr,
                 input,
@@ -170,14 +164,10 @@ impl IR {
             IR::Cache {
                 input,
                 id,
-                cache_hits,
+                cache_hits: _,
             } => {
                 let input = Arc::new(convert_to_lp(input, lp_arena));
-                DslPlan::Cache {
-                    input,
-                    id,
-                    cache_hits,
-                }
+                DslPlan::Cache { input, id }
             },
             IR::GroupBy {
                 input,
diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs
index 5f06f240b25f..100c16625fa5 100644
--- a/crates/polars-plan/src/plans/mod.rs
+++ b/crates/polars-plan/src/plans/mod.rs
@@ -64,7 +64,6 @@ pub struct DslScanSources {
     pub is_expanded: bool,
 }
 
-// https://stackoverflow.com/questions/1031076/what-are-projection-and-selection
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub enum DslPlan {
     #[cfg(feature = "python")]
@@ -75,11 +74,7 @@ pub enum DslPlan {
         predicate: Expr,
     },
     /// Cache the input at this point in the LP
-    Cache {
-        input: Arc<DslPlan>,
-        id: usize,
-        cache_hits: u32,
-    },
+    Cache { input: Arc<DslPlan>, id: usize },
     Scan {
         sources: Arc<Mutex<DslScanSources>>,
         // Option as this is mostly materialized on the IR phase.
@@ -88,7 +83,6 @@ pub enum DslPlan {
         // are used as base of different queries in a loop. That way
         // the expensive schema resolving is cached.
         file_info: Arc<RwLock<Option<FileInfo>>>,
-        predicate: Option<Expr>,
         file_options: FileScanOptions,
         scan_type: FileScan,
     },
@@ -97,9 +91,6 @@ pub enum DslPlan {
     DataFrameScan {
         df: Arc<DataFrame>,
         schema: SchemaRef,
-        // schema of the projected file
-        output_schema: Option<SchemaRef>,
-        filter: Option<Expr>,
     },
     /// Polars' `select` operation, this can mean projection, but also full data access.
     Select {
@@ -196,9 +187,9 @@ impl Clone for DslPlan {
             #[cfg(feature = "python")]
             Self::PythonScan { options } => Self::PythonScan { options: options.clone() },
             Self::Filter { input, predicate } => Self::Filter { input: input.clone(), predicate: predicate.clone() },
-            Self::Cache { input, id, cache_hits } => Self::Cache { input: input.clone(), id: id.clone(), cache_hits: cache_hits.clone() },
-            Self::Scan { sources, file_info, predicate, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() },
-            Self::DataFrameScan { df, schema, output_schema, filter: selection } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), output_schema: output_schema.clone(), filter: selection.clone() },
+            Self::Cache { input, id } => Self::Cache { input: input.clone(), id: id.clone() },
+            Self::Scan { sources, file_info, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() },
+            Self::DataFrameScan { df, schema, } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), },
             Self::Select { expr, input, options } => Self::Select { expr: expr.clone(), input: input.clone(), options: options.clone() },
             Self::GroupBy { input, keys, aggs, apply, maintain_order, options } => Self::GroupBy { input: input.clone(), keys: keys.clone(), aggs: aggs.clone(), apply: apply.clone(), maintain_order: maintain_order.clone(), options: options.clone() },
             Self::Join { input_left, input_right, left_on, right_on, predicates, options } => Self::Join { input_left: input_left.clone(), input_right: input_right.clone(), left_on: left_on.clone(), right_on: right_on.clone(), options: options.clone(), predicates: predicates.clone() },
@@ -223,8 +214,6 @@ impl Default for DslPlan {
         DslPlan::DataFrameScan {
             df: Arc::new(df),
             schema: Arc::new(schema),
-            output_schema: None,
-            filter: None,
         }
     }
 }
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index d50c2867dd24..0eb14cba9e88 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -699,8 +699,6 @@ def serialize(
 
         >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum()
         >>> bytes = lf.serialize()
-        >>> bytes  # doctest: +ELLIPSIS
-        b'\xa1kMapFunction\xa2einput\xa1mDataFrameScan\xa4bdf\xa1gcolumns\x81\xa4d...'
 
         The bytes can later be deserialized back into a LazyFrame.
 
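For context, here is a minimal Python sketch against the public polars API (illustrative only, not part of the patch; the exact plan text varies between polars versions). It shows the behavior this change leans on: a `filter` remains its own node in the unoptimized, DSL-level plan and is only pushed into the scan once the plan is optimized and lowered to IR, and a LazyFrame still round-trips through `serialize`/`deserialize` as the frame.py docstring above describes.

import io

import polars as pl

# A lazy query; the filter is recorded as its own node in the logical plan.
lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).filter(pl.col("a") > 1)

# Unoptimized plan: FILTER sits above the DataFrameScan node.
print(lf.explain(optimized=False))

# Optimized plan: the optimizer typically pushes the predicate into the scan.
print(lf.explain())

# Serialization round-trip, matching the docstring touched in frame.py above.
data = lf.serialize()
restored = pl.LazyFrame.deserialize(io.BytesIO(data))
assert restored.collect().equals(lf.collect())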