Cache unification into CachingMap #235

Merged · 13 commits · Dec 7, 2021
210 changes: 210 additions & 0 deletions apollo-router-core/src/cache.rs
@@ -0,0 +1,210 @@
use crate::{CacheResolver, CacheResolverError};
use derivative::Derivative;
use futures::lock::Mutex;
use lru::LruCache;
use std::cmp::Eq;
use std::collections::HashMap;
use std::fmt;
use std::hash::Hash;
use tokio::sync::broadcast::{self, Sender};

/// A caching map optimised for slow value resolution.
///
/// The CachingMap holds values in an LruCache. Values are loaded into the cache on a cache miss and
/// the cache relies on the resolver to provide values. There is no way to manually remove, update
/// or otherwise invalidate a cache value at this time. Values will be evicted from the cache once
/// the cache_limit is reached.
#[derive(Derivative)]
Contributor:

Do we really need Derivative here? Does the cache need to implement Debug? The only field that will be displayed is the cache limit.

Contributor:

No, we don't need it, but it's a nice-to-have and it costs five lines. I'm fine with it, and implementing Debug as much as possible makes the API nicer to use.

So I'm not against removing it; slight preference for keeping it.

Contributor (Author):

It's required because:

apollo-router-core/src/query_planner/caching_query_planner.rs

#[derive(Debug)]
pub struct CachingQueryPlanner<T: QueryPlanner> {
    cm: CachingMap<QueryKey, Arc<QueryPlan>>,
...

Contributor:

That's not required. We can use derivative there if we want

Contributor:

Why would having the cache implement Debug make the API nicer to use?

Contributor (Author):

I think we've missed something here, though. We have to have Debug on CachingQueryPlanner right now because it is collected in a tracing span (#[instrument]). So it might be the right thing not to have Debug on the cache, but to avoid it we would have to exclude the field in every Debug consumer, so right now this is the right answer (I think). But if we remove the Debug requirements upstream, we can remove this one at the same time.

Maybe putting the above comment in the source would be helpful?

Contributor:

Oh!! That's because it is on the trait QueryPlanner. I don't mind removing this trait bound. Then on ApolloRouter you can use derivative to exclude the field from the debug output.
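
(For illustration, a minimal standalone sketch of that suggestion; Router and OpaquePlanner are hypothetical names, not the router's actual types.)

use derivative::Derivative;

// A stand-in for a type that does not implement Debug.
struct OpaquePlanner;

#[derive(Derivative)]
#[derivative(Debug)]
struct Router {
    // Excluded from the Debug output, so OpaquePlanner never needs Debug.
    #[derivative(Debug = "ignore")]
    planner: OpaquePlanner,
    name: String,
}

fn main() {
    let router = Router {
        planner: OpaquePlanner,
        name: "example".to_string(),
    };
    // Prints the struct without the ignored field.
    println!("{:?}", router);
}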

Contributor:

> We have to have Debug on CachingQueryPlanner right now, because it is collected in a tracing span (#[instrument]).

That sounds like something we do not want to have (cf. the other perf issues we had with #[instrument]).

Contributor (Author):

I think the use of "skip_all", as recommended in the ADR on telemetry, will sort out the telemetry perf problem when we implement it.

I'm trying to minimise the "blast radius" of this PR, so I'd like to avoid making changes beyond what the PR requires. That means not modifying QueryPlanner/ApolloRouter/... and containing the changes to the minimum set. Keeping this change for now, fixing it later, and documenting in a comment that it can be removed later seems the cleanest approach.
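
(For context, a tiny sketch of the skip_all idea with hypothetical names; it assumes a tracing version whose #[instrument] supports skip_all. No arguments are recorded on the span, so none of them needs to implement Debug.)

use tracing::instrument;

// A stand-in type that deliberately does not implement Debug.
struct Planner;

// With skip_all, neither `planner` nor `key` is recorded on the span,
// so neither needs a Debug implementation.
#[instrument(skip_all)]
async fn plan(planner: &Planner, key: String) -> usize {
    let _ = planner;
    key.len()
}

#[tokio::main]
async fn main() {
    println!("{}", plan(&Planner, "query".to_string()).await);
}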

Contributor (Author):

This discussion has been extracted and deferred into issue: #244

#[derivative(Debug)]
pub struct CachingMap<K, V> {
    #[derivative(Debug = "ignore")]
    cached: Mutex<LruCache<K, Result<V, CacheResolverError>>>,
    #[allow(clippy::type_complexity)]
    #[derivative(Debug = "ignore")]
    wait_map: Mutex<HashMap<K, Sender<(K, Result<V, CacheResolverError>)>>>,
    cache_limit: usize,
    #[derivative(Debug = "ignore")]
    resolver: Box<dyn CacheResolver<K, V> + Send + Sync>,
Contributor:

This dynamic dispatching is unnecessary. We should probably use a generic parameter instead.

Contributor (Author):

I put a comment in the PR about how the resolver would be used. It made the code a lot cleaner and I really don't think the performance matters in this use case.

Contributor:

Ah, sorry, I didn't read it.

Contributor:

> After spending a bit of time exploring different ideas, I've settled
> on the idea proposed by Geoffroy to provide a value resolver for use
> by the cache map.
>
> This approach is cleanest from the point of view of client interactions,
> although it does require clients to implement the CacheResolver trait.

You don't really explain why this approach was better. In my opinion it is much better if the user just provides something function-like.

> Ideally we could store a callback in the CachingMap struct, but
> lifetimes and async restrictions mean I can't make that work at the
> moment.

Is this comment related?

What about this?

use std::marker::PhantomData;

pub struct CachingStuff<K, V, F> {
    callback: F,
    phantom_data: PhantomData<(K, V)>,
}

impl<K, V, F> CachingStuff<K, V, F>
where
    F: Fn(K) -> V,
{
    pub fn new(callback: F) -> Self {
        Self {
            callback,
            phantom_data: PhantomData,
        }
    }

    pub fn get(&self, key: K) -> V {
        (self.callback)(key)
    }
}

#[derive(Debug)]
pub struct MyError;

fn main() {
    let cache = CachingStuff::new(|_key: String| Ok::<u32, MyError>(42));
    let value_res: Result<u32, MyError> = cache.get("boo".to_string());
    println!("Value: {:?}", value_res);
}

Contributor (Author):

Sorry if the detail isn't apparent. The issue was caused by trying to maintain an async function callback. The various things we are calling are async and I wanted to provide an async interface to prevent blocking of threads when retrieving values (I guess I could have used spawn_blocking with a sync interface...).

I couldn't figure out a nice way to store such a "thing" (async fn callback effectively) in a struct.
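
(For reference, one common way to store an async callback in a struct is to box the closure and have it return a boxed future; this is an editorial sketch with made-up names, not what the PR ended up doing.)

use std::future::Future;
use std::pin::Pin;

// The stored callback: a boxed closure returning a boxed future.
type AsyncResolverFn<K, V> =
    Box<dyn Fn(K) -> Pin<Box<dyn Future<Output = V> + Send>> + Send + Sync>;

struct Cache<K, V> {
    resolver: AsyncResolverFn<K, V>,
}

impl<K, V> Cache<K, V> {
    fn new(resolver: AsyncResolverFn<K, V>) -> Self {
        Self { resolver }
    }

    async fn get(&self, key: K) -> V {
        (self.resolver)(key).await
    }
}

#[tokio::main]
async fn main() {
    let cache: Cache<u32, u32> = Cache::new(Box::new(|key: u32| {
        Box::pin(async move { key * 2 }) as Pin<Box<dyn Future<Output = u32> + Send>>
    }));
    println!("{}", cache.get(21).await); // 42
}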

Contributor:

The cache could be generic over the CacheResolver without going all the way to using a function, though. That would simplify the code a bit. The performance difference is not meaningful.

pub struct CachingMap<K, V, R> {
    #[derivative(Debug = "ignore")]
    cached: Mutex<LruCache<K, CacheResult<V>>>,
    #[allow(clippy::type_complexity)]
    #[derivative(Debug = "ignore")]
    wait_map: Mutex<HashMap<K, Sender<(K, CacheResult<V>)>>>,
    cache_limit: usize,
    #[derivative(Debug = "ignore")]
    resolver: R,
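
(To make the shape of that suggestion concrete, here is a self-contained toy of a cache that is generic over its resolver; GenericCache, Resolver and Doubler are invented for this sketch, and it omits the LRU and wait map entirely, keeping only the dispatch question.)

use async_trait::async_trait;
use std::marker::PhantomData;

#[async_trait]
trait Resolver<K, V> {
    async fn retrieve(&self, key: K) -> V;
}

// Generic over the resolver: monomorphised per resolver type, no Box, no vtable.
struct GenericCache<K, V, R> {
    resolver: R,
    _marker: PhantomData<(K, V)>,
}

impl<K, V, R> GenericCache<K, V, R>
where
    R: Resolver<K, V> + Send + Sync,
    K: Send + 'static,
    V: Send + 'static,
{
    fn new(resolver: R) -> Self {
        Self {
            resolver,
            _marker: PhantomData,
        }
    }

    async fn get(&self, key: K) -> V {
        self.resolver.retrieve(key).await
    }
}

struct Doubler;

#[async_trait]
impl Resolver<u32, u32> for Doubler {
    async fn retrieve(&self, key: u32) -> u32 {
        key * 2
    }
}

#[tokio::main]
async fn main() {
    let cache: GenericCache<u32, u32, Doubler> = GenericCache::new(Doubler);
    println!("{}", cache.get(21).await); // 42
}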

Contributor:

use std::future::Future;
use std::marker::PhantomData;

pub struct CachingStuff<K, V, F, FUT> {
    callback: F,
    phantom_data: PhantomData<(K, V, FUT)>,
}

impl<K, V, F, FUT> CachingStuff<K, V, F, FUT>
where
    F: Fn(K) -> FUT,
    FUT: Future<Output = V>,
{
    pub fn new(callback: F) -> Self {
        Self {
            callback,
            phantom_data: PhantomData,
        }
    }

    pub async fn get(&self, key: K) -> V {
        (self.callback)(key).await
    }
}

#[derive(Debug)]
pub struct MyError;

#[tokio::main]
async fn main() {
    let cache = CachingStuff::new(|_key: String| async { Ok::<u32, MyError>(42) });
    let value_res: Result<u32, MyError> = cache.get("boo".to_string()).await;
    println!("Value: {:?}", value_res);
    let cache = CachingStuff::new(|_key: String| async {
        tokio::task::spawn_blocking(move || Ok::<u32, MyError>(42))
            .await
            .unwrap()
    });
    let value_res: Result<u32, MyError> = cache.get("boo".to_string()).await;
    println!("Value: {:?}", value_res);
}

Contributor:

I'd rather we have the trait and a generic member, implement the trait over Fn, and let the calling side decide how they will use it. This changes nothing functionally, is more flexible, and you still get the function form if you want it.

Contributor:

It makes the code more complicated for no good reason.

This cache only ever calls one function per cached thing anyway. If we get new use cases and it feels relevant to add a trait, then we can add a trait. Until then, please keep the code stupid simple.

Contributor (Author):

This discussion has been extracted and deferred into issue: #244

}

impl<K, V> CachingMap<K, V>
where
    K: Clone + fmt::Debug + Eq + Hash + Send + Sync + 'static,
    V: fmt::Debug + Send + Sync + 'static,
    Result<V, CacheResolverError>: Clone,
{
    /// Create a new CachingMap.
    ///
    /// resolver is used to resolve cache misses.
    /// cache_limit specifies the size (number of items) of the cache
    pub fn new(resolver: Box<(dyn CacheResolver<K, V> + Send + Sync)>, cache_limit: usize) -> Self {
        Self {
            cached: Mutex::new(LruCache::new(cache_limit)),
            wait_map: Mutex::new(HashMap::new()),
            cache_limit,
            resolver,
        }
    }

    /// Get a value from the cache.
    pub async fn get(&self, key: K) -> Result<V, CacheResolverError> {
        let mut locked_cache = self.cached.lock().await;
        if let Some(value) = locked_cache.get(&key).cloned() {
            return value;
        }

        // Holding a lock across the delegated get is a bad idea because
        // the delegate get() could take a long time during which all
        // other get() requests are blocked.
        // Alternatively, if we don't hold the lock, there is a risk
        // that we will do the work multiple times. This is also
        // sub-optimal.

        // To work around this, we keep a list of keys we are currently
        // processing in the delegate. If we try to get a key on this
        // list, we block and wait for it to complete and then retry.
        //
        // This is more complex than either of the two simple
        // alternatives but succeeds in providing a mechanism where each
        // client only waits for uncached values that they are going to
        // use AND avoids generating the value multiple times.

        let mut locked_wait_map = self.wait_map.lock().await;

        // We must only drop the locked cache after we have locked the
        // wait map. Otherwise, we might get a race that causes us to
        // miss a broadcast.
        drop(locked_cache);

        match locked_wait_map.get_mut(&key) {
            Some(waiter) => {
                // Register interest in key
                let mut receiver = waiter.subscribe();
                drop(locked_wait_map);
                // Our use case is very specific, so we are sure
                // that we won't get any errors here.
                let (recv_key, recv_value) = receiver.recv().await.expect(
                    "the sender won't ever be dropped before all the receivers finish; qed",
                );
                debug_assert_eq!(recv_key, key);
                recv_value
            }
            None => {
                let (tx, _rx) = broadcast::channel(1);
                locked_wait_map.insert(key.clone(), tx.clone());
                drop(locked_wait_map);
                // This is the potentially high duration operation where we ask our resolver to
                // resolve the key (retrieve a value) for us
                // No cache locks are held here
                let value = self.resolver.retrieve(key.clone()).await;
                // Update our cache
Contributor:

Could this be done in a different scope? Like this:

                {
                    let mut locked_cache = self.cached.lock().await;
                    locked_cache.put(key.clone(), value.clone());
                    // Update our wait list
                    let mut locked_wait_map = self.wait_map.lock().await;
                    locked_wait_map.remove(&key);
                }

Because with the current code, the cache and wait map locks are still held while we're awaiting the blocking task.

Contributor (Author):

I'm concerned that we would introduce a race if we didn't keep both the locks until the end of the notification. There may be opportunities for optimisation here, but I don't want to take the risk since it would only be a micro-optimisation and require careful analysis to make sure there were no mistakes.

Contributor:

I don't think there will be a race here:

  • the sender was removed from the map after the value was put in the cache, so any new queries for the same key will get it from the cache directly
  • all the receivers have already dropped the locks when they're waiting

Contributor (Author):

I don't think so either, but I'm not sure. So rather than take the risk of being wrong: I don't think the small improvement in scalability justifies the risk.

In particular, I'm concerned that a cache expiration event could cause a race. Very unlikely, but it requires some careful thinking that I haven't done.

Contributor:

Yes, the concerns are real. Actually, could we work on a proof that it works, even without the modification I want? Not necessarily something formal, just exploring the various cases and checking that they will be safe.

Contributor:

The change I propose would affect the last sequence for the first query: ClCiWlWrTxWuCu would become ClCiWlWrWuCuTx.

In all the cases I tried, once query 1 reaches Wr, query 2 is already in a safe state.

Contributor (@Geal), Dec 3, 2021:

Another note: the steps 2ClCm and 1WmWiWuG are commutative.

And actually the three-query case resolves to fewer cases, since it's very constrained:
1ClCmWlCu 2ClCm 1WmWiWuG 2WlCu 3ClCm 1Cl: can't happen 3 has Cl
1ClCmWlCu 2ClCm 1WmWiWuG 2WlCu 3ClCm 3Wl: can't happen 2 has Wl
1ClCmWlCu 2ClCm 1WmWiWuG 2WlCu 3ClCm 2WhSWuRx 3WlCu 1ClCi 1Wl: can't happen 3 has Wl
1ClCmWlCu 2ClCm 1WmWiWuG 2WlCu 3ClCm 2WhSWuRx 3WlCu 1ClCi 3WhSWuRx 1WlWrTxWuCu OK

Contributor:

I'm having a hard time following the discussion; is there a chance we can land the "conservative approach" and chase the optimisation as a follow-up?

The code as is already looks like a great improvement, IMO.

Contributor:

It is not just an optimisation, but a way to avoid high contention on the cache lock: the broadcast channel works by cloning the value before sending it to each receiver, so if the value is large and there are a lot of receivers, the lock would be held while executing a series of clones, which is potentially expensive. No other queries would be able to use the cache during that time.
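
(For concreteness, a small runnable toy of the proposed ordering, with invented names and tokio primitives rather than the router's actual types: both locks are confined to a scope and released before the broadcast send, so the per-receiver clones happen without holding either lock.)

use std::collections::HashMap;
use tokio::sync::{broadcast, Mutex};

#[tokio::main]
async fn main() {
    let cache: Mutex<HashMap<u32, String>> = Mutex::new(HashMap::new());
    let wait_map: Mutex<HashMap<u32, broadcast::Sender<String>>> = Mutex::new(HashMap::new());

    // A waiter registered for key 1.
    let (tx, mut rx) = broadcast::channel(1);
    wait_map.lock().await.insert(1, tx.clone());

    // Stand-in for the value produced by the slow resolver call.
    let value = "resolved".to_string();

    {
        // Both locks live only inside this scope.
        let mut locked_cache = cache.lock().await;
        locked_cache.insert(1, value.clone());
        let mut locked_wait_map = wait_map.lock().await;
        locked_wait_map.remove(&1);
    } // locks dropped here

    // The broadcast, which clones the value once per receiver, now runs
    // without holding either lock.
    tx.send(value).expect("at least one receiver is alive");
    println!("waiter got: {}", rx.recv().await.unwrap());
}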

Contributor (Author):

This discussion has been extracted and deferred into issue: #244

                let mut locked_cache = self.cached.lock().await;
                locked_cache.put(key.clone(), value.clone());
                // Update our wait list
                let mut locked_wait_map = self.wait_map.lock().await;
                locked_wait_map.remove(&key);
                // Let our waiters know
                let broadcast_value = value.clone();
                // Our use case is very specific, so we are sure that
                // we won't get any errors here.
                tokio::task::spawn_blocking(move || {
                    tx.send((key, broadcast_value))
                        .expect("there is always at least one receiver alive, the _rx guard; qed")
                })
                .await
                .expect("can only fail if the task is aborted or if the internal code panics, neither is possible here; qed");
                value
            }
        }
    }

    /// Get the top 20% of most recently (LRU) used keys
    pub async fn get_hot_keys(&self) -> Vec<K> {
        let locked_cache = self.cached.lock().await;
        locked_cache
            .iter()
            .take(self.cache_limit / 5)
            .map(|(key, _value)| key.clone())
            .collect()
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::CacheResolverError;
    use async_trait::async_trait;
    use futures::stream::{FuturesUnordered, StreamExt};
    use mockall::mock;
    use test_log::test;

    struct HasACache {
        cm: CachingMap<usize, usize>,
    }

    struct HasACacheResolver {}

    impl HasACache {
        // fn new(resolver: limit: usize) -> Self {
        fn new(
            resolver: Box<(dyn CacheResolver<usize, usize> + Send + Sync)>,
            cache_limit: usize,
        ) -> Self {
            // let resolver = Box::new(HasACacheResolver {});
            let cm = CachingMap::new(resolver, cache_limit);
            Self { cm }
        }

        async fn get(&self, key: usize) -> Result<usize, CacheResolverError> {
            self.cm.get(key).await
        }
    }

    #[async_trait]
    impl CacheResolver<usize, usize> for HasACacheResolver {
        async fn retrieve(&self, key: usize) -> Result<usize, CacheResolverError> {
            Ok(key)
        }
    }

    mock! {
        HasACacheResolver {}

        #[async_trait]
        impl CacheResolver<usize, usize> for HasACacheResolver {
            async fn retrieve(&self, key: usize) -> Result<usize, CacheResolverError>;
        }
    }

    #[test(tokio::test)]
    async fn it_should_enforce_cache_limits() {
        let cache = HasACache::new(Box::new(HasACacheResolver {}), 13);

        for i in 0..14 {
            cache.get(i).await.expect("gets the value");
        }
        let guard = cache.cm.cached.lock().await;
        assert_eq!(guard.len(), 13);
    }

    #[test(tokio::test)]
    async fn it_should_only_delegate_once_per_key() {
        let mut mock = MockHasACacheResolver::new();

        mock.expect_retrieve().times(1).return_const(Ok(1));

        let cache = HasACache::new(Box::new(mock), 10);

        // Let's trigger 100 concurrent gets of the same value and ensure only
        // one delegated retrieve is made
        let mut computations: FuturesUnordered<_> = (0..100).map(|_| cache.get(1)).collect();

        while let Some(result) = computations.next().await {
            result.expect("result retrieved");
        }

        // To be really sure, check there is only one value in the cache
        let guard = cache.cm.cached.lock().await;
        assert_eq!(guard.len(), 1);
    }
}
22 changes: 22 additions & 0 deletions apollo-router-core/src/error.rs
@@ -150,6 +150,19 @@ impl From<QueryPlannerError> for FetchError {
    }
}

/// Error types for CacheResolver
#[derive(Error, Debug, Display, Clone)]
pub enum CacheResolverError {
    /// Value retrieval failed: {0}
    RetrievalError(Arc<QueryPlannerError>),
}

impl From<QueryPlannerError> for CacheResolverError {
    fn from(err: QueryPlannerError) -> Self {
        CacheResolverError::RetrievalError(Arc::new(err))
    }
}

/// An error while processing JSON data.
#[derive(Debug, Error, Display)]
pub enum JsonExtError {
@@ -168,6 +181,9 @@ pub enum QueryPlannerError {
    /// Query planning panicked: {0}
    JoinError(Arc<JoinError>),

    /// Cache resolution failed: {0}
    CacheResolverError(Arc<CacheResolverError>),

    /// Unhandled planner result.
    UnhandledPlannerResult,
}
@@ -184,6 +200,12 @@ impl From<JoinError> for QueryPlannerError {
    }
}

impl From<CacheResolverError> for QueryPlannerError {
    fn from(err: CacheResolverError) -> Self {
        QueryPlannerError::CacheResolverError(Arc::new(err))
    }
}
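
(A standalone sketch of how the two From impls added in this file compose at a call site; the enum definitions are trimmed to the variants shown in this diff, and the Error/Display derives are omitted.)

use std::sync::Arc;

#[derive(Debug, Clone)]
enum QueryPlannerError {
    CacheResolverError(Arc<CacheResolverError>),
    UnhandledPlannerResult,
}

#[derive(Debug, Clone)]
enum CacheResolverError {
    RetrievalError(Arc<QueryPlannerError>),
}

impl From<QueryPlannerError> for CacheResolverError {
    fn from(err: QueryPlannerError) -> Self {
        CacheResolverError::RetrievalError(Arc::new(err))
    }
}

impl From<CacheResolverError> for QueryPlannerError {
    fn from(err: CacheResolverError) -> Self {
        QueryPlannerError::CacheResolverError(Arc::new(err))
    }
}

fn main() {
    // A planner failure surfaces from the cache as a CacheResolverError,
    // then is lifted back into a QueryPlannerError at the call site.
    let planner_err = QueryPlannerError::UnhandledPlannerResult;
    let cache_err: CacheResolverError = planner_err.into();
    let surfaced: QueryPlannerError = cache_err.into();
    println!("{:?}", surfaced);
}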

impl From<QueryPlannerError> for ResponseStream {
    fn from(err: QueryPlannerError) -> Self {
        stream::once(future::ready(FetchError::from(err).to_response(true))).boxed()
2 changes: 2 additions & 0 deletions apollo-router-core/src/lib.rs
@@ -22,6 +22,7 @@ macro_rules! failfast_error {
    }};
}

mod cache;
mod error;
mod json_ext;
mod naive_introspection;
@@ -33,6 +34,7 @@ mod response;
mod schema;
mod traits;

pub use cache::*;
pub use error::*;
pub use json_ext::*;
pub use naive_introspection::*;