precompute softmax D in non-cuda ring flash attn
lucidrains committed Apr 9, 2024
1 parent 0b5d5af commit 011ec89
Showing 3 changed files with 12 additions and 6 deletions.
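The commit hoists the softmax `D` term of the flash attention backward out of the per-chunk loop. As a worked view of why that is safe (notation follows the flash attention papers, not names from this repo):

    D_i   = rowsum(dO_i ∘ O_i)
    dS_ij = P_ij · (dP_ij − D_i)

`D` depends only on the forward output `o` and the incoming gradient `do`, never on the key / value chunks that rotate around the ring, so it stays constant across the ring reduce and can be computed once up front instead of once per row chunk per ring pass.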
1 change: 1 addition & 0 deletions README.md
@@ -99,6 +99,7 @@ $ python assert.py --use-cuda --causal --striped-ring-attn
- [x] for cuda striped attention, for backwards hack, pad the extra token once and index out when passing into Tri's cuda kernel
- [x] find a machine with 8 GPUs and test with a quarter million tokens first

- [ ] see for the cuda version whether softmax_D can be computed once and cached over the ring reduce; go for a modified triton backwards if not
- [ ] think about how to craft a special `Dataset` that shards across sequence length (take into account labels for cross entropy loss) for ring transformer training
- [ ] add ring attention to Tri's flash attention implementation. find some cuda ring reduce impl
- [ ] figure out how to pytest distributed pytorch
15 changes: 10 additions & 5 deletions ring_attention_pytorch/ring_flash_attention.py
@@ -256,6 +256,13 @@ def backward(ctx, do):
receive_kv_and_dkv = None
receive_mask = None

# precompute the softmax D

D = (do * o).sum(dim = -1, keepdims = True)
D = rearrange(D, 'b n h 1 -> b h n 1')

# ring reduce key / values

for (ring_rank, _), ((kv_and_dkv, mask), (receive_kv_and_dkv, receive_mask)) in ring_pass_fn(kv_and_dkv, mask, receive_buffers = (receive_kv_and_dkv, receive_mask), max_iters = max_ring_passes, ring_size = ring_size):
k_ring_rank = ring_rank % ring_size

@@ -274,13 +281,13 @@ def backward(ctx, do):

row_splits = zip(
q.split(bucket_size, dim = 1),
o.split(bucket_size, dim = 1),
do.split(bucket_size, dim = 1),
D.split(bucket_size, dim = -2),
lse.split(bucket_size, dim = -2),
dq.split(bucket_size, dim = 1)
)

for ind, (qc, oc, doc, lsec, dqc) in enumerate(row_splits):
for ind, (qc, doc, Dc, lsec, dqc) in enumerate(row_splits):
row_bucket_index = row_ring_rank * per_machine_buckets + ind

attn_weights = einsum('b i h d, b j h d -> b h i j', qc, kc) * scale
@@ -311,9 +318,7 @@ def backward(ctx, do):
dv_chunk = einsum('b h i j, b i h d -> b j h d', p, doc)
dp = einsum('b i h d, b j h d -> b h i j', doc, vc)

D = (doc * oc).sum(dim = -1, keepdims = True)
D = rearrange(D, 'b n h 1 -> b h n 1')
ds = p * scale * (dp - D)
ds = p * scale * (dp - Dc)

dq_chunk = einsum('b h i j, b j h d -> b i h d', ds, kc)
dk_chunk = einsum('b h i j, b i h d -> b j h d', ds, qc)
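For reference, a minimal runnable sketch of the hoisting done above, assuming the (batch, seq, heads, dim) layout used in the diff; the toy shapes, `bucket_size`, and the loop skeleton are illustrative stand-ins, not the package's actual API:

import torch
from einops import rearrange

b, n, h, d = 2, 512, 8, 64
bucket_size = 128

o  = torch.randn(b, n, h, d)   # attention output saved from the forward pass
do = torch.randn(b, n, h, d)   # upstream gradient w.r.t. o

# softmax D depends only on o and do, never on the key / value chunks that
# rotate around the ring, so it is computed once before the ring reduce ...
D = (do * o).sum(dim = -1, keepdim = True)    # (b, n, h, 1)
D = rearrange(D, 'b n h 1 -> b h n 1')        # align with the (b, h, i, j) attention layout

# ... and each row bucket inside the ring loop then reuses its slice of D,
# instead of recomputing (doc * oc).sum(...) on every ring pass
for Dc in D.split(bucket_size, dim = -2):
    assert Dc.shape == (b, h, bucket_size, 1)  # consumed as ds = p * scale * (dp - Dc)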
2 changes: 1 addition & 1 deletion setup.py
@@ -3,7 +3,7 @@
setup(
name = 'ring-attention-pytorch',
packages = find_packages(exclude=[]),
version = '0.3.7',
version = '0.3.8',
license='MIT',
description = 'Ring Attention - Pytorch',
author = 'Phil Wang',
