Skip to content

Commit

Permalink
[GraphBolt][CUDA] Refactor gb.unique_and_compact, add async_op.
Browse files Browse the repository at this point in the history
  • Loading branch information
mfbalin committed Sep 9, 2024
1 parent e8022e9 commit b010268
Showing 1 changed file with 32 additions and 20 deletions.
52 changes: 32 additions & 20 deletions python/dgl/graphbolt/internal/sample_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,15 @@ def unique_and_compact(
],
rank: int = 0,
world_size: int = 1,
async_op: bool = False,
):
"""
Compact a list of nodes tensor. The `rank` and `world_size` parameters are
relevant when using Cooperative Minibatching, which was initially proposed
in `Deep Graph Library PR#4337 <https://github.com/dmlc/dgl/pull/4337>`__ and
was later first fully described in
`Cooperative Minibatching in Graph Neural Networks
<https://arxiv.org/abs/2310.12403>`__
<https://arxiv.org/abs/2310.12403>`__.
Cooperation between the GPUs eliminates duplicate work performed across the
GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when
performing GNN minibatching.
Expand All @@ -48,6 +49,9 @@ def unique_and_compact(
The rank of the current process.
world_size : int
The number of processes.
async_op : bool
Boolean indicating whether the call is asynchronous. If so, the result
can be obtained by calling wait on the returned future.
Returns
-------
Expand All @@ -63,27 +67,35 @@ def unique_and_compact(
"""
is_heterogeneous = isinstance(nodes, dict)

def unique_and_compact_per_type(nodes):
nums = [node.size(0) for node in nodes]
nodes = torch.cat(nodes)
empty_tensor = nodes.new_empty(0)
unique, compacted, _, offsets = torch.ops.graphbolt.unique_and_compact(
nodes, empty_tensor, empty_tensor, rank, world_size
)
compacted = compacted.split(nums)
return unique, list(compacted), offsets

if not is_heterogeneous:

Check warning on line 70 in python/dgl/graphbolt/internal/sample_utils.py

View workflow job for this annotation

GitHub Actions / lintrunner

UFMT format

Run `lintrunner -a` to apply this patch.
homo_ntype = 'a'
nodes = {homo_ntype: nodes}

nums = {}
concat_nodes, empties = [], []
for ntype, nodes_of_type in nodes.items():
nums[ntype] = [node.size(0) for node in nodes_of_type]
concat_nodes.append(torch.cat(nodes_of_type))
empties.append(concat_nodes[-1].new_empty(0))
unique_fn = (
torch.ops.graphbolt.unique_and_compact_batched_async
if async_op
else torch.ops.graphbolt.unique_and_compact_batched
)
results = unique_fn(concat_nodes, empties, empties, rank, world_size)
unique, compacted, offsets = {}, {}, {}
for ntype, result in zip(nodes.keys(), results):
(
unique[ntype],
concat_compacted,
_,
offsets[ntype],
) = result
compacted[ntype] = list(concat_compacted.split(nums[ntype]))
if is_heterogeneous:
unique, compacted, offsets = {}, {}, {}
for ntype, nodes_of_type in nodes.items():
(
unique[ntype],
compacted[ntype],
offsets[ntype],
) = unique_and_compact_per_type(nodes_of_type)
return unique, compacted, offsets
else:
return unique_and_compact_per_type(nodes)
return unique[homo_ntype], compacted[homo_ntype], offsets[homo_ntype]


def compact_temporal_nodes(nodes, nodes_timestamp):
Expand Down Expand Up @@ -161,7 +173,7 @@ def unique_and_compact_csc_formats(
`Deep Graph Library PR#4337 <https://github.com/dmlc/dgl/pull/4337>`__
and was later first fully described in
`Cooperative Minibatching in Graph Neural Networks
<https://arxiv.org/abs/2310.12403>`__
<https://arxiv.org/abs/2310.12403>`__.
Cooperation between the GPUs eliminates duplicate work performed across the
GPUs due to the overlapping sampled k-hop neighborhoods of seed nodes when
performing GNN minibatching.
Expand Down

0 comments on commit b010268

Please sign in to comment.