feat(page_service): timeout-based batching of requests (#9321)

## Problem

We don't take advantage of the queue depth generated by the compute on the pageserver. We can process getpage requests more efficiently by batching them.

## Summary of changes

Batch up incoming getpage requests that arrive within a configurable time window (`server_side_batch_timeout`). Then process the entire batch via one `get_vectored` timeline operation. By default (`server_side_batch_timeout` unset), no merging takes place.

## Testing

* **Functional**: #9792
* **Performance**: will be done in staging/pre-prod

## Refs

* #9377
* #9376

Co-authored-by: Christian Schwarz <christian@neon.tech>
neondatabase · Nov 18, 2024 · d7662fd · d7662fd · github-actions · Nov 18, 2024
1 parent e5c89f3
commit d7662fd
Show file tree

Hide file tree

Showing 9 changed files with 705 additions and 195 deletions.
diff --git a/Cargo.lock b/Cargo.lock
diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs
@@ -109,6 +109,8 @@ pub struct ConfigToml {
     pub virtual_file_io_mode: Option<crate::models::virtual_file::IoMode>,
     #[serde(skip_serializing_if = "Option::is_none")]
     pub no_sync: Option<bool>,
+    #[serde(with = "humantime_serde")]
+    pub server_side_batch_timeout: Option<Duration>,
 }
 
 #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
@@ -317,6 +319,8 @@ pub mod defaults {
     pub const DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB: usize = 0;
 
     pub const DEFAULT_IO_BUFFER_ALIGNMENT: usize = 512;
+
+    pub const DEFAULT_SERVER_SIDE_BATCH_TIMEOUT: Option<&str> = None;
 }
 
 impl Default for ConfigToml {
@@ -397,6 +401,8 @@ impl Default for ConfigToml {
             ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB),
             l0_flush: None,
             virtual_file_io_mode: None,
+            server_side_batch_timeout: DEFAULT_SERVER_SIDE_BATCH_TIMEOUT
+                .map(|duration| humantime::parse_duration(duration).unwrap()),
             tenant_config: TenantConfigToml::default(),
             no_sync: None,
         }

diff --git a/libs/postgres_backend/src/lib.rs b/libs/postgres_backend/src/lib.rs
@@ -716,6 +716,9 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
         Ok(())
     }
 
+    // Proto looks like this:
+    // FeMessage::Query("pagestream_v2{FeMessage::CopyData(PagestreamFeMessage::GetPage(..))}")
+
     async fn process_message(
         &mut self,
         handler: &mut impl Handler<IO>,

diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
@@ -84,6 +84,7 @@ enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true
 wal_decoder.workspace = true
+smallvec.workspace = true
 
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true

diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
@@ -182,6 +182,10 @@ pub struct PageServerConf {
 
     /// Optionally disable disk syncs (unsafe!)
     pub no_sync: bool,
+
+    /// Maximum amount of time for which a get page request
+    /// might be held up for request merging.
+    pub server_side_batch_timeout: Option<Duration>,
 }
 
 /// Token for authentication to safekeepers
@@ -336,6 +340,7 @@ impl PageServerConf {
             concurrent_tenant_warmup,
             concurrent_tenant_size_logical_size_queries,
             virtual_file_io_engine,
+            server_side_batch_timeout,
             tenant_config,
             no_sync,
         } = config_toml;
@@ -377,6 +382,7 @@ impl PageServerConf {
             image_compression,
             timeline_offloading,
             ephemeral_bytes_per_memory_kb,
+            server_side_batch_timeout,
 
             // ------------------------------------------------------------
             // fields that require additional validation or custom handling

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
@@ -1187,6 +1187,7 @@ struct GlobalAndPerTimelineHistogramTimer<'a, 'c> {
     ctx: &'c RequestContext,
     start: std::time::Instant,
     op: SmgrQueryType,
+    count: usize,
 }
 
 impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
@@ -1214,10 +1215,13 @@ impl Drop for GlobalAndPerTimelineHistogramTimer<'_, '_> {
                 elapsed
             }
         };
-        self.global_latency_histo
-            .observe(ex_throttled.as_secs_f64());
-        if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
-            per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
+
+        for _ in 0..self.count {
+            self.global_latency_histo
+                .observe(ex_throttled.as_secs_f64());
+            if let Some(per_timeline_getpage_histo) = self.per_timeline_latency_histo {
+                per_timeline_getpage_histo.observe(ex_throttled.as_secs_f64());
+            }
         }
     }
 }
@@ -1385,6 +1389,14 @@ impl SmgrQueryTimePerTimeline {
         &'a self,
         op: SmgrQueryType,
         ctx: &'c RequestContext,
+    ) -> Option<impl Drop + 'a> {
+        self.start_timer_many(op, 1, ctx)
+    }
+    pub(crate) fn start_timer_many<'c: 'a, 'a>(
+        &'a self,
+        op: SmgrQueryType,
+        count: usize,
+        ctx: &'c RequestContext,
     ) -> Option<impl Drop + 'a> {
         let start = Instant::now();
 
@@ -1422,6 +1434,7 @@ impl SmgrQueryTimePerTimeline {
             ctx,
             start,
             op,
+            count,
         })
     }
 }