feat: Configurable retention on PyDict sources (#744)
This renames the `PyList` source to `PyDict`.

It also allows in-memory retention to be disabled for any source, but only
exposes the option on the `PyDict` source, which is often used with
materializations.
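
As a usage sketch, a minimal example following the updated `examples/event-api/server.py` in this change; the row contents below are illustrative only:

```python
import time

import kaskada as kd

kd.init_session()

# A source that only feeds materializations: disable in-memory retention so
# rows are discarded once they have been sent to any running materializations.
events = kd.sources.PyDict(
    rows=[{"ts": time.time(), "user": "user_1", "request_id": "12345678-1234-5678-1234-567812345678"}],
    time_column="ts",
    key_column="user",
    time_unit="s",
    retained=False,
)

# The default (retained=True) keeps rows in memory for interactive queries.
```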
bjchambers authored Sep 8, 2023
1 parent 2d261b5 commit d0b0f85
Showing 26 changed files with 147 additions and 58 deletions.
69 changes: 48 additions & 21 deletions crates/sparrow-merge/src/in_memory_batches.rs
@@ -20,21 +20,53 @@ impl error_stack::Context for Error {}
/// Struct for managing in-memory batches.
#[derive(Debug)]
pub struct InMemoryBatches {
pub schema: SchemaRef,
current: RwLock<(usize, RecordBatch)>,
retained: bool,
current: RwLock<Current>,
updates: tokio::sync::broadcast::Sender<(usize, RecordBatch)>,
/// A subscriber that is never used -- it exists only to keep the sender
/// alive.
_subscriber: tokio::sync::broadcast::Receiver<(usize, RecordBatch)>,
}

impl InMemoryBatches {
#[derive(Debug)]
struct Current {
schema: SchemaRef,
version: usize,
batch: RecordBatch,
}

impl Current {
pub fn new(schema: SchemaRef) -> Self {
let (updates, _subscriber) = tokio::sync::broadcast::channel(10);
let merged = RecordBatch::new_empty(schema.clone());
let batch = RecordBatch::new_empty(schema.clone());
Self {
schema,
current: RwLock::new((0, merged)),
version: 0,
batch,
}
}

pub fn add_batch(&mut self, batch: &RecordBatch) -> error_stack::Result<(), Error> {
if self.batch.num_rows() == 0 {
self.batch = batch.clone();
} else {
// This assumes that cloning the old batch is cheap.
// If it isn't, we could replace it with an empty batch (`std::mem::replace`),
// put it in an option, or allow `homogeneous_merge` to take `&RecordBatch`.
self.batch = homogeneous_merge(&self.schema, vec![self.batch.clone(), batch.clone()])
.into_report()
.change_context(Error::Add)?;
}
Ok(())
}
}

impl InMemoryBatches {
pub fn new(retained: bool, schema: SchemaRef) -> Self {
let (updates, _subscriber) = tokio::sync::broadcast::channel(10);
let current = RwLock::new(Current::new(schema.clone()));
Self {
retained,
current,
updates,
_subscriber,
}
@@ -50,19 +50,11 @@ impl InMemoryBatches {

let new_version = {
let mut write = self.current.write().map_err(|_| Error::Add)?;
let (version, old) = &*write;
let version = *version;

let merged = if old.num_rows() == 0 {
batch.clone()
} else {
homogeneous_merge(&self.schema, vec![old.clone(), batch.clone()])
.into_report()
.change_context(Error::Add)?
};

*write = (version + 1, merged);
version + 1
if self.retained {
write.add_batch(&batch)?;
}
write.version += 1;
write.version
};

self.updates
@@ -79,7 +103,10 @@ impl InMemoryBatches {
pub fn subscribe(
&self,
) -> impl Stream<Item = error_stack::Result<RecordBatch, Error>> + 'static {
let (mut version, merged) = self.current.read().unwrap().clone();
let (mut version, merged) = {
let read = self.current.read().unwrap();
(read.version, read.batch.clone())
};
let mut recv = self.updates.subscribe();

async_stream::try_stream! {
@@ -111,6 +138,6 @@ impl InMemoryBatches {

/// Retrieve the current in-memory batch.
pub fn current(&self) -> RecordBatch {
self.current.read().unwrap().1.clone()
self.current.read().unwrap().batch.clone()
}
}
16 changes: 10 additions & 6 deletions crates/sparrow-runtime/src/execute/operation/scan.rs
@@ -194,12 +194,16 @@ impl ScanOperation {
.boxed()
} else {
let batch = in_memory.current();
futures::stream::once(async move {
Batch::try_new_from_batch(batch)
.into_report()
.change_context(Error::internal_msg("invalid input"))
})
.boxed()
if batch.num_rows() != 0 {
futures::stream::once(async move {
Batch::try_new_from_batch(batch)
.into_report()
.change_context(Error::internal_msg("invalid input"))
})
.boxed()
} else {
futures::stream::empty().boxed()
}
};
return Ok(Box::new(Self {
projected_schema,
21 changes: 19 additions & 2 deletions crates/sparrow-session/src/session.rs
@@ -87,6 +87,7 @@ impl Session {
name: &str,
schema: SchemaRef,
time_column_name: &str,
retained: bool,
subsort_column_name: Option<&str>,
key_column_name: &str,
grouping_name: Option<&str>,
@@ -144,7 +145,14 @@
})
.clone();

Table::new(table_info, key_hash_inverse, key_column, expr, time_unit)
Table::new(
table_info,
key_hash_inverse,
key_column,
expr,
retained,
time_unit,
)
}

pub fn add_cast(
@@ -575,7 +583,16 @@ mod tests {
Field::new("b", DataType::Int64, true),
]));
let table = session
.add_table("table", schema, "time", None, "key", Some("user"), None)
.add_table(
"table",
schema,
"time",
true,
None,
"key",
Some("user"),
None,
)
.unwrap();

let field_name = session
3 changes: 2 additions & 1 deletion crates/sparrow-session/src/table.rs
@@ -26,6 +26,7 @@ impl Table {
key_hash_inverse: Arc<ThreadSafeKeyHashInverse>,
key_column: usize,
expr: Expr,
retained: bool,
time_unit: Option<&str>,
) -> error_stack::Result<Self, Error> {
let prepared_fields: Fields = KEY_FIELDS
@@ -37,7 +38,7 @@
let prepare_hash = 0;

assert!(table_info.in_memory.is_none());
let in_memory_batches = Arc::new(InMemoryBatches::new(prepared_schema.clone()));
let in_memory_batches = Arc::new(InMemoryBatches::new(retained, prepared_schema.clone()));
table_info.in_memory = Some(in_memory_batches.clone());

let preparer = Preparer::new(
17 changes: 9 additions & 8 deletions examples/event-api/server.py
@@ -6,16 +6,17 @@

async def main():
kd.init_session()

start = time.time()
requestmap = dict()

# Initialize event source with historical data
events = kd.sources.PyList(
# Initialize event source with schema from historical data.
events = kd.sources.PyDict(
rows = [{"ts": start, "user": "user_1", "request_id": "12345678-1234-5678-1234-567812345678"}],
time_column = "ts",
key_column = "user",
time_unit = "s"
time_unit = "s",
retained=False,
)

# Compute features over events
@@ -32,11 +33,11 @@ async def main():
async def handle_http(req: web.Request) -> web.Response:
data = await req.json()

# Add the current time to the event
data["ts"] = time.time()

# Create a future so the aggregated result can be returned in the API response
request_id = str(uuid.uuid4())
requestmap[request_id] = asyncio.Future()
data["request_id"] = request_id

@@ -59,7 +60,7 @@ async def handle_http(req: web.Request) -> web.Response:
await runner.setup()
site = web.TCPSite(runner, 'localhost', 8080)
await site.start()


# Handle each conversation as it occurs
print(f"Waiting for events...")
@@ -80,7 +81,7 @@ async def handle_http(req: web.Request) -> web.Response:
fut.set_result(row["response"])

except Exception as e:
print(f"Failed to handle live event from Kaskada: {e}")
print(f"Failed to handle live event from Kaskada: {e}")

# Wait for web server to terminate gracefully
await runner.cleanup()
1 change: 0 additions & 1 deletion python/docs/source/conf.py
@@ -103,7 +103,6 @@
# TODO: Version switcher.
# This would require hosting multiple versions of the docs.
# https://pydata-sphinx-theme.readthedocs.io/en/stable/user_guide/version-dropdown.html

}

templates_path = ["_templates"]
2 changes: 1 addition & 1 deletion python/docs/source/index.md
@@ -59,7 +59,7 @@ import kaskada as kd
kd.init_session()

# Bootstrap from historical data
messages = kd.sources.PyList(
messages = kd.sources.PyDict(
rows = pyarrow.parquet.read_table("./messages.parquet")
.to_pylist(),
time_column = "ts",
2 changes: 1 addition & 1 deletion python/docs/source/reference/sources.md
@@ -12,5 +12,5 @@
JsonlString
Pandas
Parquet
PyList
PyDict
```
1 change: 1 addition & 0 deletions python/pysrc/kaskada/_ffi.pyi
@@ -53,6 +53,7 @@ class Table(Expr):
time_column: str,
key_column: str,
schema: pa.Schema,
retained: bool,
subsort_column: Optional[str],
grouping_name: Optional[str],
time_unit: Optional[str],
4 changes: 3 additions & 1 deletion python/pysrc/kaskada/_timestream.py
@@ -89,7 +89,9 @@ def _literal(value: LiteralValue, session: _ffi.Session) -> Timestream:
seconds = int(us / 1_000_000)
# Get the leftover nanoseconds
nanoseconds = int((us % 1_000_000) * 1_000)
return Timestream(_ffi.Expr.literal_timedelta(session, seconds, nanoseconds))
return Timestream(
_ffi.Expr.literal_timedelta(session, seconds, nanoseconds)
)
else:
return Timestream(_ffi.Expr.literal(session, value))

4 changes: 2 additions & 2 deletions python/pysrc/kaskada/sources/__init__.py
@@ -1,6 +1,6 @@
"""Sources of data for Kaskada queries."""
from .arrow import CsvString, JsonlString, Pandas, Parquet, PyList
from .arrow import CsvString, JsonlString, Pandas, Parquet, PyDict
from .source import Source


__all__ = ["Source", "CsvString", "Pandas", "JsonlString", "PyList", "Parquet"]
__all__ = ["Source", "CsvString", "Pandas", "JsonlString", "PyDict", "Parquet"]
9 changes: 8 additions & 1 deletion python/pysrc/kaskada/sources/arrow.py
@@ -61,7 +61,7 @@ def add_data(self, data: pd.DataFrame) -> None:
self._ffi_table.add_pyarrow(batch)


class PyList(Source):
class PyDict(Source):
"""Source reading data from lists of dicts."""

def __init__(
@@ -70,6 +70,7 @@ def __init__(
*,
time_column: str,
key_column: str,
retained: bool = True,
subsort_column: Optional[str] = None,
schema: Optional[pa.Schema] = None,
grouping_name: Optional[str] = None,
@@ -81,6 +82,11 @@ def __init__(
rows: One or more rows represented as dicts.
time_column: The name of the column containing the time.
key_column: The name of the column containing the key.
retained: Whether added rows should be retained for queries.
If True, rows (both provided to the constructor and added later) will be retained
for interactive queries. If False, rows will be discarded after being sent to any
running materializations. Consider setting this to False when the source will only
be used for materialization to avoid unnecessary memory usage.
subsort_column: The name of the column containing the subsort.
If not provided, the subsort will be assigned by the system.
schema: The schema to use. If not provided, it will be inferred from the input.
@@ -93,6 +99,7 @@ def __init__(
if schema is None:
schema = pa.Table.from_pylist(rows).schema
super().__init__(
retained=retained,
schema=schema,
time_column=time_column,
key_column=key_column,
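A minimal sketch contrasting the two settings described in the `retained` docstring above; the column names are made up, and the commented `add_rows` call is hypothetical (the method for adding rows later is not shown in this diff):

```python
import kaskada as kd

kd.init_session()

rows = [{"time": "2021-01-01T00:00:00", "key": "A", "m": 1}]

# Default: rows (from the constructor and any added later) are kept in memory,
# so interactive queries can see them.
interactive = kd.sources.PyDict(rows, time_column="time", key_column="key")

# Materialization-only: rows are forwarded to running materializations and then
# discarded, avoiding unnecessary memory usage.
feed = kd.sources.PyDict(rows, time_column="time", key_column="key", retained=False)

# Hypothetical follow-up (not shown in this diff): rows added later would be
# queryable on `interactive` but not retained on `feed`.
# interactive.add_rows([{"time": "2021-01-01T01:00:00", "key": "A", "m": 2}])
```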
2 changes: 2 additions & 0 deletions python/pysrc/kaskada/sources/source.py
@@ -25,6 +25,7 @@ def __init__(
schema: pa.Schema,
time_column: str,
key_column: str,
retained: bool = True,
subsort_column: Optional[str] = None,
grouping_name: Optional[str] = None,
time_unit: Optional[TimeUnit] = None,
@@ -62,6 +63,7 @@ def fix_field(field: pa.Field) -> pa.Field:
time_column,
key_column,
schema,
retained,
subsort_column,
grouping_name,
time_unit,
4 changes: 2 additions & 2 deletions python/pytests/aggregation/sum_test.py
@@ -19,8 +19,8 @@ def source() -> kd.sources.CsvString:


@pytest.fixture(scope="module")
def source_spread_across_days() -> kd.sources.PyList:
return kd.sources.PyList(
def source_spread_across_days() -> kd.sources.PyDict:
return kd.sources.PyDict(
rows=[
{"time": "2021-01-01T00:00:00", "key": "A", "m": 1, "n": 2},
{"time": "2021-01-01T01:10:01", "key": "A", "m": 3, "n": 4},
8 changes: 6 additions & 2 deletions python/pytests/execution_test.py
@@ -107,12 +107,16 @@ def test_history(golden, source_int64) -> None:
golden.jsonl(query.to_pandas(kd.results.History()))
golden.jsonl(
query.to_pandas(
kd.results.History(since=datetime.fromisoformat("1996-12-19T16:39:59+00:00"))
kd.results.History(
since=datetime.fromisoformat("1996-12-19T16:39:59+00:00")
)
)
)
golden.jsonl(
query.to_pandas(
kd.results.History(until=datetime.fromisoformat("1996-12-20T12:00:00+00:00"))
kd.results.History(
until=datetime.fromisoformat("1996-12-20T12:00:00+00:00")
)
)
)
golden.jsonl(
2 changes: 1 addition & 1 deletion python/pytests/flatten_test.py
@@ -2,7 +2,7 @@


def test_flatten(golden) -> None:
source = kd.sources.PyList(
source = kd.sources.PyDict(
[
{"time": "1996-12-19T16:39:57", "user": "A", "m": [[5]]},
{"time": "1996-12-19T17:39:57", "user": "A", "m": []},