Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds caching layer to tensor allocations #670

Merged
merged 41 commits into from
Apr 12, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
421c107
Adding caching to cpu
coreylowman Apr 6, 2023
f24863a
Merge branch 'main' into cpu-caching
coreylowman Apr 7, 2023
3f000cb
Merge branch 'main' into cpu-caching
coreylowman Apr 8, 2023
10fd4ef
Tmp commit of cuda caching
coreylowman Apr 9, 2023
2ae8496
check passing
coreylowman Apr 9, 2023
7c6fc52
Adding alloc_empty
coreylowman Apr 9, 2023
78bd1f4
Updating conv2d
coreylowman Apr 9, 2023
a705132
Using alloc_empty in cuda kernels
coreylowman Apr 10, 2023
d077ece
Reusing on clone
coreylowman Apr 10, 2023
3d33c9c
Using alloc_empty for tensor_from_host_buf
coreylowman Apr 10, 2023
731096c
Using dev.null instead of replace_with_empty
coreylowman Apr 10, 2023
2fe9084
Fixing issue with clone
coreylowman Apr 10, 2023
bfe00f7
Fixing cpu cache allocations
coreylowman Apr 10, 2023
2d2fc69
Adding custom Clone impl for CachableVec
coreylowman Apr 10, 2023
99054a5
Using alloc_elem in stack/concat
coreylowman Apr 10, 2023
e112b24
Adding empty_cache to DeviceStorage
coreylowman Apr 10, 2023
947a835
Adds TensorCache and uses in Cpu
coreylowman Apr 10, 2023
cb72ceb
Merge branch 'cpu-caching' of https://github.com/coreylowman/dfdx int…
coreylowman Apr 10, 2023
0769887
Using TensorCache object in cuda
coreylowman Apr 10, 2023
fa83774
Adding comments to tensor cache
coreylowman Apr 10, 2023
19daef0
Styling
coreylowman Apr 10, 2023
d472116
Cleanup
coreylowman Apr 10, 2023
ea52f38
Making CPU caching safer
coreylowman Apr 11, 2023
995d2b9
Adding ability to disable cache
coreylowman Apr 12, 2023
f99ad4f
Adding allocation details to tensor docstring
coreylowman Apr 12, 2023
d0fd8fa
Formatting and adding unit tests
coreylowman Apr 12, 2023
ed19aff
Adding unit tests for Cpu & Cuda
coreylowman Apr 12, 2023
fa9ceb5
Adding second pass forward for resnet18 integration test
coreylowman Apr 12, 2023
fcf33bf
Fixing integration tests
coreylowman Apr 12, 2023
a67104f
Fixing cpu tests
coreylowman Apr 12, 2023
ec4badf
Fixing cuda unit tests
coreylowman Apr 12, 2023
057f7a4
Fixing allocation error without fast alloc
coreylowman Apr 12, 2023
bbeebf3
Fixing cudnn kernel
coreylowman Apr 12, 2023
df1d960
Fixing memory usage from cuda -> cpu transfer
coreylowman Apr 12, 2023
6e53e4d
Merge branch 'main' into cpu-caching
coreylowman Apr 12, 2023
c113a4a
Fixing tensor_to_array
coreylowman Apr 12, 2023
7635f85
Updating cudarc version
coreylowman Apr 12, 2023
b5aadb0
Merge branch 'main' into cpu-caching
coreylowman Apr 12, 2023
5200b1b
Clippy suggestions
coreylowman Apr 12, 2023
9780e03
Fixing no-std support
coreylowman Apr 12, 2023
4b86657
Satify clippy
coreylowman Apr 12, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,14 @@ features = ["nightly", "numpy", "safetensors", "cuda", "ci-check"]

[dependencies]
no-std-compat = { version = "0.4.1", default-features = false, features = [ "alloc", "compat_hash" ], optional = true }
spin = { version = "0.9.6", default-features = false, features = ["spin_mutex"], optional = true }
spin = { version = "0.9.6", default-features = false, features = ["spin_mutex", "rwlock"], optional = true }
rand = { version = "0.8.5", default-features = false, features = ["std_rng"] }
rand_distr = { version = "0.4.3", default-features = false, features = ["std_math"] }
matrixmultiply = { version = "0.3.2", default-features = false, optional = true }
zip = { version = "0.6.2", default-features = false, optional = true }
cblas-sys = { version = "0.1.4", default-features = false, optional = true }
libc = { version = "0.2", default-features = false, optional = true }
cudarc = { version = "0.9.6", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
cudarc = { version = "0.9.7", default-features = false, optional = true, features = ["driver", "cublas", "nvrtc"] }
num-traits = { version = "0.2.15", default-features = false }
safetensors = { version = "0.3", default-features = false, optional = true }
memmap2 = { version = "0.5", default-features = false, optional = true }
Expand Down
234 changes: 234 additions & 0 deletions src/tensor/cache.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,234 @@
use std::{alloc::Layout, collections::BTreeMap, vec::Vec};

#[cfg(not(feature = "no-std"))]
use std::sync::RwLock;

#[cfg(feature = "no-std")]
use spin::RwLock;

/// A key for the tensor cache. Contains both number of bytes and information
/// about the layout of the allocation.
///
/// Since [Layout] doesn't impl Ord, we can't use it directly as a key
/// for a hashmap, meaning we need this extra datastructure. Otherwise
/// we could just use `(usize, Layout)` as the key.
#[derive(Debug, Copy, Clone, Hash, Eq, PartialEq, Ord, PartialOrd)]
pub(crate) struct AllocationKey {
/// Total number of bytes in the allocation (`len * size_of::<E>()`).
pub num_bytes: usize,
/// The size of the allocation in bytes - from [Layout].
pub size: usize,
/// The alignment of the allocation in bytes - from [Layout].
pub alignment: usize,
}

/// A cache of allocations that can be reused.
///
/// The key is the number of bytes in the allocation, AND the layout
/// that the allocation was created with. This is necessary for safely
/// reusing allocations, especially on the rust side of things, where the
/// allocator assumes memory is allocated & deallocated with the same layout.
/// The value is a list of allocations of that size.
///
/// The presence of a key in the map indicates that there is *at least one*
/// valid allocation. When the last value is removed from the list, the key
/// is removed.
#[derive(Debug)]
pub(crate) struct TensorCache<Ptr> {
/// Map from allocation key to a stack of reusable allocations of that shape.
pub(crate) allocations: RwLock<BTreeMap<AllocationKey, Vec<Ptr>>>,
/// Whether the cache is active. Starts `true`; set to `false` by `disable()`.
pub(crate) enabled: RwLock<bool>,
}

impl<Ptr> Default for TensorCache<Ptr> {
fn default() -> Self {
Self {
allocations: Default::default(),
enabled: RwLock::new(true),
}
}
}

impl<Ptr> TensorCache<Ptr> {
/// Returns the number of allocations in the cache.
#[allow(unused)]
pub(crate) fn len(&self) -> usize {
#[cfg(not(feature = "no-std"))]
{
self.allocations.read().unwrap().len()
}

#[cfg(feature = "no-std")]
{
self.allocations.read().len()
}
}

/// Returns `true` if the cache is enabled.
pub(crate) fn is_enabled(&self) -> bool {
#[cfg(not(feature = "no-std"))]
{
*self.enabled.read().unwrap()
}
#[cfg(feature = "no-std")]
{
*self.enabled.read()
}
}

/// Disables the cache.
pub(crate) fn disable(&self) {
#[cfg(not(feature = "no-std"))]
{
*self.enabled.write().unwrap() = false;
}

#[cfg(feature = "no-std")]
{
*self.enabled.write() = false;
}
}

/// Returns a cached allocation if one exists.
/// Otherwise, returns `None`.
pub(crate) fn try_pop<E>(&self, len: usize) -> Option<Ptr> {
if !self.is_enabled() {
return None;
}

let layout = Layout::new::<E>();
let num_bytes = len * std::mem::size_of::<E>();
let key = AllocationKey {
num_bytes,
size: layout.size(),
alignment: layout.align(),
};
// Check if there is a cached allocation.
let reuse = {
#[cfg(not(feature = "no-std"))]
let cache = self.allocations.read().unwrap();
#[cfg(feature = "no-std")]
let cache = self.allocations.read();
cache.contains_key(&key)
};
// If there is, remove it from the cache.
// Otherwise, return `None`.
if reuse {
#[cfg(not(feature = "no-std"))]
let mut cache = self.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.allocations.write();
// unwrap is safe because we just checked for contains key above.
let items = cache.get_mut(&key).unwrap();
// unwrap is safe because reuse is only true if there's at least one item,
// which is also maintained by the block directly below.
let allocation = items.pop().unwrap();
// If there are no more cached allocations of this size,
// remove the entry from the cache.
// This is important for correctness, because the presence
// of an entry in the cache indicates that there are valid
// allocations to use. (see `let reuse = { ... }` above).
if items.is_empty() {
cache.remove(&key);
}
Some(allocation)
} else {
None
}
}

/// Inserts an allocation into the cache.
pub(crate) fn insert<E>(&self, len: usize, allocation: Ptr) {
if !self.is_enabled() {
// This is a panic because it's a bug in the library.
panic!("Tried to insert into a disabled cache.");
}

let layout = Layout::new::<E>();
let num_bytes = len * std::mem::size_of::<E>();
let key = AllocationKey {
num_bytes,
size: layout.size(),
alignment: layout.align(),
};
#[cfg(not(feature = "no-std"))]
let mut cache = self.allocations.write().unwrap();
#[cfg(feature = "no-std")]
let mut cache = self.allocations.write();
if let std::collections::btree_map::Entry::Vacant(e) = cache.entry(key) {
#[cfg(not(feature = "no-std"))]
{
e.insert(std::vec![allocation]);
}
#[cfg(feature = "no-std")]
{
let mut allocations = Vec::new();
allocations.push(allocation);
e.insert(allocations);
}
} else {
cache.get_mut(&key).unwrap().push(allocation);
}
}
}

#[cfg(test)]
mod test {
    use super::*;

    #[test]
    #[should_panic(expected = "Tried to insert into a disabled cache.")]
    fn test_insert_on_disabled_cache() {
        let cache: TensorCache<usize> = Default::default();
        cache.disable();
        cache.insert::<f32>(1, 0);
    }

    #[test]
    fn test_try_pop_on_disabled_cache() {
        let cache: TensorCache<usize> = Default::default();
        assert!(cache.is_enabled());
        cache.disable();
        assert!(!cache.is_enabled());
        // A disabled cache never yields allocations.
        for _ in 0..2 {
            assert_eq!(cache.try_pop::<f32>(1), None);
        }
    }

    #[test]
    fn test_try_pop_on_empty_cache() {
        let cache: TensorCache<usize> = Default::default();
        // Popping from an empty cache is a no-op, repeatedly.
        for _ in 0..2 {
            assert_eq!(cache.try_pop::<f32>(1), None);
        }
    }

    #[test]
    fn test_try_pop_on_cache_with_multiple_sizes_and_alignment() {
        let cache: TensorCache<usize> = Default::default();
        // Populate three allocations for each (dtype, len) combination.
        for ptr in 0..3 {
            cache.insert::<f32>(1, ptr);
        }
        for ptr in 3..6 {
            cache.insert::<f32>(2, ptr);
        }
        for ptr in 6..9 {
            cache.insert::<f64>(1, ptr);
        }
        for ptr in 9..12 {
            cache.insert::<f64>(2, ptr);
        }
        // Each key pops in LIFO order, then runs dry without touching the
        // other keys (f32 len=2 and f64 len=1 are both 8 bytes total, but
        // differ in element size/alignment and so must not collide).
        for ptr in (0..3).rev() {
            assert_eq!(cache.try_pop::<f32>(1), Some(ptr));
        }
        assert_eq!(cache.try_pop::<f32>(1), None);
        for ptr in (3..6).rev() {
            assert_eq!(cache.try_pop::<f32>(2), Some(ptr));
        }
        assert_eq!(cache.try_pop::<f32>(2), None);
        for ptr in (6..9).rev() {
            assert_eq!(cache.try_pop::<f64>(1), Some(ptr));
        }
        assert_eq!(cache.try_pop::<f64>(1), None);
        for ptr in (9..12).rev() {
            assert_eq!(cache.try_pop::<f64>(2), Some(ptr));
        }
        assert_eq!(cache.try_pop::<f64>(2), None);
    }
}
52 changes: 38 additions & 14 deletions src/tensor/cpu/allocate.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,17 @@ use crate::{
tensor::{masks::triangle_mask, storage_traits::*, unique_id, Tensor},
};

use super::{Cpu, CpuError, LendingIterator};
use super::{CachableVec, Cpu, CpuError, LendingIterator};

use rand::{distributions::Distribution, Rng};
use std::{sync::Arc, vec::Vec};

impl Cpu {
#[inline]
pub(crate) fn try_alloc_zeros<E: Unit>(&self, numel: usize) -> Result<Vec<E>, CpuError> {
pub(crate) fn try_alloc_zeros<E: Unit>(
&self,
numel: usize,
) -> Result<CachableVec<E>, CpuError> {
self.try_alloc_elem::<E>(numel, Default::default())
}

Expand All @@ -21,19 +24,36 @@ impl Cpu {
&self,
numel: usize,
elem: E,
) -> Result<Vec<E>, CpuError> {
#[cfg(feature = "fast-alloc")]
{
Ok(std::vec![elem; numel])
}
) -> Result<CachableVec<E>, CpuError> {
let data = self.cache.try_pop::<E>(numel).map_or_else(
#[cfg(feature = "fast-alloc")]
|| Ok(std::vec![elem; numel]),
#[cfg(not(feature = "fast-alloc"))]
|| {
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, elem);
Ok(data)
},
|allocation| {
// SAFETY:
// - ✅ "ptr must have been allocated using the global allocator, such as via the alloc::alloc function."
// - ✅ handled by tensor cache "T needs to have the same alignment as what ptr was allocated with."
// - ✅ handled by tensor cache "The size of T times the capacity needs to be the same size as the pointer was allocated with."
// - ✅ "length needs to be less than or equal to capacity."
// - ✅ all the dtypes for this are builtin numbers "The first length values must be properly initialized values of type T."
// - ✅ "capacity needs to be the capacity that the pointer was allocated with."
// - ✅ "The allocated size in bytes must be no larger than isize::MAX. See the safety documentation of pointer::offset."
let mut data = unsafe { Vec::from_raw_parts(allocation.0 as *mut E, numel, numel) };
data.fill(elem);
Ok(data)
},
)?;

#[cfg(not(feature = "fast-alloc"))]
{
let mut data: Vec<E> = Vec::new();
data.try_reserve(numel).map_err(|_| CpuError::OutOfMemory)?;
data.resize(numel, elem);
Ok(data)
}
Ok(CachableVec {
data,
cache: self.cache.clone(),
})
}
}

Expand Down Expand Up @@ -187,6 +207,10 @@ impl<E: Unit> TensorFromVec<E> for Cpu {
if src.len() != num_elements {
Err(CpuError::WrongNumElements)
} else {
let src = CachableVec {
data: src,
cache: self.cache.clone(),
};
Ok(Tensor {
id: unique_id(),
data: Arc::new(src),
Expand Down
Loading