From cdd64d0ffad2529baad71325a83aeb0cee89f4d9 Mon Sep 17 00:00:00 2001
From: Ritchie Vink
Date: Thu, 12 Sep 2024 11:44:25 +0200
Subject: [PATCH] chore: Remove IR info from DSL (#18712)

---
 crates/polars-lazy/src/scan/ndjson.rs        |  1 -
 crates/polars-plan/src/plans/builder_dsl.rs  | 14 +---------
 .../src/plans/conversion/dsl_to_ir.rs        | 26 +++++--------------
 .../polars-plan/src/plans/conversion/mod.rs  | 22 +++++-----------
 crates/polars-plan/src/plans/mod.rs          | 19 +++-----------
 py-polars/polars/lazyframe/frame.py          |  2 --
 6 files changed, 17 insertions(+), 67 deletions(-)

diff --git a/crates/polars-lazy/src/scan/ndjson.rs b/crates/polars-lazy/src/scan/ndjson.rs
index e53171d28f51..cbd7bbcee8f1 100644
--- a/crates/polars-lazy/src/scan/ndjson.rs
+++ b/crates/polars-lazy/src/scan/ndjson.rs
@@ -157,7 +157,6 @@ impl LazyFileListReader for LazyJsonLineReader {
         Ok(LazyFrame::from(DslPlan::Scan {
             sources: Arc::new(Mutex::new(self.sources.to_dsl(false))),
             file_info: Arc::new(RwLock::new(None)),
-            predicate: None,
             file_options,
             scan_type,
         }))
diff --git a/crates/polars-plan/src/plans/builder_dsl.rs b/crates/polars-plan/src/plans/builder_dsl.rs
index bbbe12ef95de..4040087833f0 100644
--- a/crates/polars-plan/src/plans/builder_dsl.rs
+++ b/crates/polars-plan/src/plans/builder_dsl.rs
@@ -13,7 +13,6 @@ use polars_io::HiveOptions;
 #[cfg(any(feature = "parquet", feature = "csv", feature = "ipc"))]
 use polars_io::RowIndex;
 
-use crate::constants::UNLIMITED_CACHE;
 #[cfg(feature = "python")]
 use crate::prelude::python_udf::PythonFunction;
 use crate::prelude::*;
@@ -63,7 +62,6 @@ impl DslBuilder {
                 is_expanded: true,
             })),
             file_info: Arc::new(RwLock::new(Some(file_info))),
-            predicate: None,
             file_options,
             scan_type: FileScan::Anonymous {
                 function,
@@ -106,7 +104,6 @@ impl DslBuilder {
         Ok(DslPlan::Scan {
             sources: Arc::new(Mutex::new(sources)),
             file_info: Arc::new(RwLock::new(None)),
-            predicate: None,
             file_options: options,
             scan_type: FileScan::Parquet {
                 options: ParquetOptions {
@@ -148,7 +145,6 @@ impl DslBuilder {
                 glob: true,
                 include_file_paths,
             },
-            predicate: None,
             scan_type: FileScan::Ipc {
                 options,
                 cloud_options,
@@ -190,7 +186,6 @@ impl DslBuilder {
             sources: Arc::new(Mutex::new(sources)),
             file_info: Arc::new(RwLock::new(None)),
             file_options: options,
-            predicate: None,
             scan_type: FileScan::Csv {
                 options: read_options,
                 cloud_options,
@@ -202,12 +197,7 @@ impl DslBuilder {
     pub fn cache(self) -> Self {
         let input = Arc::new(self.0);
        let id = input.as_ref() as *const DslPlan as usize;
-        DslPlan::Cache {
-            input,
-            id,
-            cache_hits: UNLIMITED_CACHE,
-        }
-        .into()
+        DslPlan::Cache { input, id }.into()
     }
 
     pub fn drop(self, to_drop: Vec<Selector>, strict: bool) -> Self {
@@ -321,8 +311,6 @@ impl DslBuilder {
         DslPlan::DataFrameScan {
             df: Arc::new(df),
             schema,
-            output_schema: None,
-            filter: None,
         }
         .into()
     }
diff --git a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
index 23f2050dec4c..7cbb37bcb8e7 100644
--- a/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
+++ b/crates/polars-plan/src/plans/conversion/dsl_to_ir.rs
@@ -107,7 +107,6 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
         DslPlan::Scan {
             sources,
             file_info,
-            predicate,
             mut file_options,
             mut scan_type,
         } => {
@@ -276,9 +275,7 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
                 file_info: resolved_file_info,
                 hive_parts,
                 output_schema: None,
-                predicate: predicate
-                    .map(|expr| to_expr_ir(expr, ctxt.expr_arena))
-                    .transpose()?,
+                predicate: None,
                 scan_type,
                 file_options,
             }
@@ -374,18 +371,11 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
                 to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(slice)))?;
             IR::Slice { input, offset, len }
         },
-        DslPlan::DataFrameScan {
+        DslPlan::DataFrameScan { df, schema } => IR::DataFrameScan {
             df,
             schema,
-            output_schema,
-            filter: selection,
-        } => IR::DataFrameScan {
-            df,
-            schema,
-            output_schema,
-            filter: selection
-                .map(|expr| to_expr_ir(expr, ctxt.expr_arena))
-                .transpose()?,
+            output_schema: None,
+            filter: None,
         },
         DslPlan::Select {
             expr,
@@ -488,17 +478,13 @@ pub fn to_alp_impl(lp: DslPlan, ctxt: &mut DslConversionContext) -> PolarsResult
 
             return run_conversion(lp, ctxt, "sort");
         },
-        DslPlan::Cache {
-            input,
-            id,
-            cache_hits,
-        } => {
+        DslPlan::Cache { input, id } => {
             let input =
                 to_alp_impl(owned(input), ctxt).map_err(|e| e.context(failed_input!(cache)))?;
             IR::Cache {
                 input,
                 id,
-                cache_hits,
+                cache_hits: crate::constants::UNLIMITED_CACHE,
             }
         },
         DslPlan::GroupBy {
diff --git a/crates/polars-plan/src/plans/conversion/mod.rs b/crates/polars-plan/src/plans/conversion/mod.rs
index c79b6444effe..f05482de7670 100644
--- a/crates/polars-plan/src/plans/conversion/mod.rs
+++ b/crates/polars-plan/src/plans/conversion/mod.rs
@@ -53,7 +53,7 @@ impl IR {
                 sources,
                 file_info,
                 hive_parts: _,
-                predicate,
+                predicate: _,
                 scan_type,
                 output_schema: _,
                 file_options: options,
@@ -63,7 +63,6 @@ impl IR {
                     is_expanded: true,
                 })),
                 file_info: Arc::new(RwLock::new(Some(file_info))),
-                predicate: predicate.map(|e| e.to_expr(expr_arena)),
                 scan_type,
                 file_options: options,
             },
@@ -109,14 +108,9 @@ impl IR {
             IR::DataFrameScan {
                 df,
                 schema,
-                output_schema,
-                filter: selection,
-            } => DslPlan::DataFrameScan {
-                df,
-                schema,
-                output_schema,
-                filter: selection.map(|e| e.to_expr(expr_arena)),
-            },
+                output_schema: _,
+                filter: _,
+            } => DslPlan::DataFrameScan { df, schema },
             IR::Select {
                 expr,
                 input,
@@ -170,14 +164,10 @@ impl IR {
             IR::Cache {
                 input,
                 id,
-                cache_hits,
+                cache_hits: _,
             } => {
                 let input = Arc::new(convert_to_lp(input, lp_arena));
-                DslPlan::Cache {
-                    input,
-                    id,
-                    cache_hits,
-                }
+                DslPlan::Cache { input, id }
             },
             IR::GroupBy {
                 input,
diff --git a/crates/polars-plan/src/plans/mod.rs b/crates/polars-plan/src/plans/mod.rs
index 5f06f240b25f..100c16625fa5 100644
--- a/crates/polars-plan/src/plans/mod.rs
+++ b/crates/polars-plan/src/plans/mod.rs
@@ -64,7 +64,6 @@ pub struct DslScanSources {
     pub is_expanded: bool,
 }
 
-// https://stackoverflow.com/questions/1031076/what-are-projection-and-selection
 #[cfg_attr(feature = "serde", derive(Serialize, Deserialize))]
 pub enum DslPlan {
     #[cfg(feature = "python")]
@@ -75,11 +74,7 @@ pub enum DslPlan {
         predicate: Expr,
     },
     /// Cache the input at this point in the LP
-    Cache {
-        input: Arc<DslPlan>,
-        id: usize,
-        cache_hits: u32,
-    },
+    Cache { input: Arc<DslPlan>, id: usize },
     Scan {
         sources: Arc<Mutex<DslScanSources>>,
         // Option as this is mostly materialized on the IR phase.
@@ -88,7 +83,6 @@ pub enum DslPlan {
         // are used as base of different queries in a loop. That way
         // the expensive schema resolving is cached.
         file_info: Arc<RwLock<Option<FileInfo>>>,
-        predicate: Option<Expr>,
         file_options: FileScanOptions,
         scan_type: FileScan,
     },
@@ -97,9 +91,6 @@ pub enum DslPlan {
     DataFrameScan {
         df: Arc<DataFrame>,
         schema: SchemaRef,
-        // schema of the projected file
-        output_schema: Option<SchemaRef>,
-        filter: Option<Expr>,
     },
     /// Polars' `select` operation, this can mean projection, but also full data access.
     Select {
@@ -196,9 +187,9 @@ impl Clone for DslPlan {
             #[cfg(feature = "python")]
             Self::PythonScan { options } => Self::PythonScan { options: options.clone() },
             Self::Filter { input, predicate } => Self::Filter { input: input.clone(), predicate: predicate.clone() },
-            Self::Cache { input, id, cache_hits } => Self::Cache { input: input.clone(), id: id.clone(), cache_hits: cache_hits.clone() },
-            Self::Scan { sources, file_info, predicate, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), predicate: predicate.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() },
-            Self::DataFrameScan { df, schema, output_schema, filter: selection } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), output_schema: output_schema.clone(), filter: selection.clone() },
+            Self::Cache { input, id } => Self::Cache { input: input.clone(), id: id.clone() },
+            Self::Scan { sources, file_info, file_options, scan_type } => Self::Scan { sources: sources.clone(), file_info: file_info.clone(), file_options: file_options.clone(), scan_type: scan_type.clone() },
+            Self::DataFrameScan { df, schema, } => Self::DataFrameScan { df: df.clone(), schema: schema.clone(), },
             Self::Select { expr, input, options } => Self::Select { expr: expr.clone(), input: input.clone(), options: options.clone() },
             Self::GroupBy { input, keys, aggs, apply, maintain_order, options } => Self::GroupBy { input: input.clone(), keys: keys.clone(), aggs: aggs.clone(), apply: apply.clone(), maintain_order: maintain_order.clone(), options: options.clone() },
             Self::Join { input_left, input_right, left_on, right_on, predicates, options } => Self::Join { input_left: input_left.clone(), input_right: input_right.clone(), left_on: left_on.clone(), right_on: right_on.clone(), options: options.clone(), predicates: predicates.clone() },
@@ -223,8 +214,6 @@ impl Default for DslPlan {
         DslPlan::DataFrameScan {
             df: Arc::new(df),
             schema: Arc::new(schema),
-            output_schema: None,
-            filter: None,
         }
     }
 }
diff --git a/py-polars/polars/lazyframe/frame.py b/py-polars/polars/lazyframe/frame.py
index d50c2867dd24..0eb14cba9e88 100644
--- a/py-polars/polars/lazyframe/frame.py
+++ b/py-polars/polars/lazyframe/frame.py
@@ -699,8 +699,6 @@ def serialize(
 
         >>> lf = pl.LazyFrame({"a": [1, 2, 3]}).sum()
         >>> bytes = lf.serialize()
-        >>> bytes  # doctest: +ELLIPSIS
-        b'\xa1kMapFunction\xa2einput\xa1mDataFrameScan\xa4bdf\xa1gcolumns\x81\xa4d...'
 
         The bytes can later be deserialized back into a LazyFrame.
 
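For context, here is a minimal Python sketch against the public polars API (illustrative only, not part of the patch; the exact plan text varies between polars versions). It shows the behavior this change leans on: a `filter` remains its own node in the unoptimized, DSL-level plan and is only pushed into the scan once the plan is optimized and lowered to IR, and a LazyFrame still round-trips through `serialize`/`deserialize` as the frame.py docstring above describes.

import io

import polars as pl

# A lazy query; the filter is recorded as its own node in the logical plan.
lf = pl.LazyFrame({"a": [1, 2, 3], "b": ["x", "y", "z"]}).filter(pl.col("a") > 1)

# Unoptimized plan: FILTER sits above the DataFrameScan node.
print(lf.explain(optimized=False))

# Optimized plan: the optimizer typically pushes the predicate into the scan.
print(lf.explain())

# Serialization round-trip, matching the docstring touched in frame.py above.
data = lf.serialize()
restored = pl.LazyFrame.deserialize(io.BytesIO(data))
assert restored.collect().equals(lf.collect())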