Skip to content

Commit

Permalink
Optimize the cache fetch for forward split, pt. 2B (pytorch#2282)
Browse files Browse the repository at this point in the history
Summary:

This follows up the work in D51865590 and D52679387 by plumbing the `uvm_cache_stats` argument up to the Python API level.  `local_uvm_cache_stats` is now zeroed out before the prefetch step instead of after it, so that its data can be passed into the forward step.

This is a re-attempt at landing D51995949, with additions copied from D52670550.

Differential Revision: D53033916
  • Loading branch information
q10 authored and facebook-github-bot committed Feb 5, 2024
1 parent 7889f64 commit 7bbb442
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 1 deletion.
1 change: 1 addition & 0 deletions fbgemm_gpu/codegen/lookup_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ class CommonArgs(NamedTuple):
indice_weights: Optional[torch.Tensor]
feature_requires_grad: Optional[torch.Tensor]
lxu_cache_locations: torch.Tensor
uvm_cache_stats: Optional[torch.Tensor]
output_dtype: int
vbe_metadata: VBEMetadata
is_experimental: bool
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,7 @@ def invoke(
indice_weights=common_args.indice_weights,
feature_requires_grad=common_args.feature_requires_grad,
lxu_cache_locations=common_args.lxu_cache_locations,
uvm_cache_stats=common_args.uvm_cache_stats,
# VBE metadata
B_offsets=vbe_metadata.B_offsets,
vbe_output_offsets_feature_rank=vbe_metadata.output_offsets_feature_rank,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1011,6 +1011,11 @@ def forward( # noqa: C901
indice_weights=per_sample_weights,
feature_requires_grad=feature_requires_grad,
lxu_cache_locations=self.lxu_cache_locations,
# Pass the local_uvm_cache_stats bc only that information is
# relevant for the current iteration
uvm_cache_stats=self.local_uvm_cache_stats
if self.gather_uvm_cache_stats
else None,
output_dtype=self.output_dtype,
vbe_metadata=vbe_metadata,
is_experimental=self.is_experimental,
Expand Down Expand Up @@ -1206,6 +1211,12 @@ def _prefetch(self, indices: Tensor, offsets: Tensor) -> None:
if not self.lxu_cache_weights.numel():
return

# Clear the local_uvm_cache_stats before the prefetch instead of after
# the prefetch step, since it will be used in the CommonArgs in the
# forward step
if self.gather_uvm_cache_stats:
self.local_uvm_cache_stats.zero_()

linear_cache_indices = torch.ops.fbgemm.linearize_cache_indices(
self.cache_hash_size_cumsum,
indices,
Expand Down Expand Up @@ -1287,7 +1298,6 @@ def _prefetch(self, indices: Tensor, offsets: Tensor) -> None:
self.uvm_cache_stats = torch.add(
self.uvm_cache_stats, self.local_uvm_cache_stats
)
self.local_uvm_cache_stats.zero_()

def _prefetch_tensors_record_stream(
self, forward_stream: torch.cuda.Stream
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,6 +461,7 @@ def forward(
indice_weights=per_sample_weights,
feature_requires_grad=feature_requires_grad,
lxu_cache_locations=lxu_cache_locations,
uvm_cache_stats=None,
vbe_metadata=invokers.lookup_args.VBEMetadata(
B_offsets=None,
output_offsets_feature_rank=None,
Expand Down

0 comments on commit 7bbb442

Please sign in to comment.