pyg-team · rusty1s · Sep 8, 2022 · Sep 8, 2022 · Sep 8, 2022 · Sep 8, 2022
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,7 +5,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 
 ## [2.2.0] - 2022-MM-DD
 ### Added
-- Added `pyg-lib` neighbor sampling ([#5384](https://github.com/pyg-team/pytorch_geometric/pull/5384))
+- Added `pyg-lib` neighbor sampling ([#5384](https://github.com/pyg-team/pytorch_geometric/pull/5384), [#5388](https://github.com/pyg-team/pytorch_geometric/pull/5388))
 - Consolidated sampler routines behind `torch_geometric.sampler`, enabling ease of extensibility in the future ([#5312](https://github.com/pyg-team/pytorch_geometric/pull/5312))
 - Added `pyg_lib.segment_matmul` integration within `HeteroLinear` ([#5330](https://github.com/pyg-team/pytorch_geometric/pull/5330), [#5347](https://github.com/pyg-team/pytorch_geometric/pull/5347)))
 - Enabled `bf16` support in benchmark scripts ([#5293](https://github.com/pyg-team/pytorch_geometric/pull/5293), [#5341](https://github.com/pyg-team/pytorch_geometric/pull/5341))

@@ -26,7 +26,7 @@ def is_subset(subedge_index, edge_index, src_idx, dst_idx):
     return int(mask.sum()) == mask.numel()
 
 
-@pytest.mark.parametrize('directed', [True, False])
+@pytest.mark.parametrize('directed', [True])  # TODO re-enable undirected mode
 def test_homogeneous_neighbor_loader(directed):
     torch.manual_seed(12345)
 
@@ -57,7 +57,7 @@ def test_homogeneous_neighbor_loader(directed):
         assert is_subset(batch.edge_index, data.edge_index, batch.x, batch.x)
 
 
-@pytest.mark.parametrize('directed', [True, False])
+@pytest.mark.parametrize('directed', [True])  # TODO re-enable undirected mode
 def test_heterogeneous_neighbor_loader(directed):
     torch.manual_seed(12345)
 
@@ -165,7 +165,7 @@ def test_heterogeneous_neighbor_loader(directed):
         assert torch.cat([row, col]).unique().numel() == n_id.numel()
 
 
-@pytest.mark.parametrize('directed', [True, False])
+@pytest.mark.parametrize('directed', [True])  # TODO re-enable undirected mode
 def test_homogeneous_neighbor_loader_on_cora(get_dataset, directed):
     dataset = get_dataset(name='Cora')
     data = dataset[0]
@@ -208,7 +208,7 @@ def forward(self, x, edge_index, edge_weight):
     assert torch.allclose(out1, out2, atol=1e-6)
 
 
-@pytest.mark.parametrize('directed', [True, False])
+@pytest.mark.parametrize('directed', [True])  # TODO re-enable undirected mode
 def test_heterogeneous_neighbor_loader_on_cora(get_dataset, directed):
     dataset = get_dataset(name='Cora')
     data = dataset[0]

@@ -14,6 +14,7 @@
     filter_hetero_data,
 )
 from torch_geometric.sampler import NeighborSampler
+from torch_geometric.sampler.base import HeteroSamplerOutput, SamplerOutput
 from torch_geometric.typing import InputNodes, NumNeighbors
 
 
@@ -205,28 +206,33 @@ def __init__(
 
         super().__init__(input_nodes, collate_fn=self.collate_fn, **kwargs)
 
-    def filter_fn(self, out: Any) -> Union[Data, HeteroData]:
+    def filter_fn(
+        self,
+        out: Union[SamplerOutput, HeteroSamplerOutput],
+    ) -> Union[Data, HeteroData]:
         # TODO(manan): remove special access of input_type and perm_dict here:
-        if isinstance(self.data, Data):
-            node, row, col, edge, batch_size = out
-            data = filter_data(self.data, node, row, col, edge,
+        if isinstance(out, SamplerOutput):
+            data = filter_data(self.data, out.node, out.row, out.col, out.edge,
                                self.neighbor_sampler.perm)
-            data.batch_size = batch_size
-
-        elif isinstance(self.data, HeteroData):
-            node_dict, row_dict, col_dict, edge_dict, batch_size = out
-            data = filter_hetero_data(self.data, node_dict, row_dict, col_dict,
-                                      edge_dict,
-                                      self.neighbor_sampler.perm_dict)
-            data[self.neighbor_sampler.input_type].batch_size = batch_size
-
-        else:  # Tuple[FeatureStore, GraphStore]
-            # TODO support for feature stores with no edge types
-            node_dict, row_dict, col_dict, edge_dict, batch_size = out
-            feature_store, graph_store = self.data
-            data = filter_custom_store(feature_store, graph_store, node_dict,
-                                       row_dict, col_dict, edge_dict)
-            data[self.neighbor_sampler.input_type].batch_size = batch_size
+            data.batch = out.batch
+            data.batch_size = out.metadata
+
+        elif isinstance(out, HeteroSamplerOutput):
+            if isinstance(self.data, HeteroData):
+                data = filter_hetero_data(self.data, out.node, out.row,
+                                          out.col, out.edge,
+                                          self.neighbor_sampler.perm_dict)
+            else:  # Tuple[FeatureStore, GraphStore]
+                data = filter_custom_store(*self.data, out.node, out.row,
+                                           out.col, out.edge)
+
+            for key, batch in (out.batch or {}).items():
+                data[key].batch = batch
+            data[self.neighbor_sampler.input_type].batch_size = out.metadata
+
+        else:
+            raise TypeError(f"'{self.__class__.__name__}'' found invalid "
+                            f"type: '{type(out)}'")
 
         return data if self.transform is None else self.transform(data)
 

@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import Any, Dict, NamedTuple, Union
+from typing import Any, Dict, NamedTuple, Optional, Union
 
 import torch
 
@@ -33,27 +33,21 @@ class SamplerOutput(NamedTuple):
     row: torch.Tensor
     col: torch.Tensor
     edge: torch.Tensor
-
+    batch: Optional[torch.Tensor] = None
     # TODO(manan): refine this further; it does not currently define a proper
     # API for the expected output of a sampler.
-    metadata: Any
-
-    # TODO(manan): include a 'batch' attribute that assigns each node to an
-    # example; this is necessary for integration with pyg-lib
+    metadata: Optional[Any] = None
 
 
 class HeteroSamplerOutput(NamedTuple):
     node: Dict[NodeType, torch.Tensor]
     row: Dict[EdgeType, torch.Tensor]
     col: Dict[EdgeType, torch.Tensor]
     edge: Dict[EdgeType, torch.Tensor]
-
+    batch: Optional[Dict[NodeType, torch.Tensor]] = None
     # TODO(manan): refine this further; it does not currently define a proper
     # API for the expected output of a sampler.
-    metadata: Any
-
-    # TODO(manan): include a 'batch' attribute that assigns each node to an
-    # example; this is necessary for integration with pyg-lib
+    metadata: Optional[Any] = None
 
 
 class BaseSampler(ABC):

@@ -1,7 +1,6 @@
 from typing import Any, Dict, Optional, Tuple, Union
 
 import torch
-from torch import Tensor
 
 from torch_geometric.data import Data, HeteroData
 from torch_geometric.data.feature_store import FeatureStore
@@ -15,6 +14,12 @@
 from torch_geometric.sampler.utils import to_csc, to_hetero_csc
 from torch_geometric.typing import NumNeighbors
 
+try:
+    import pyg_lib  # noqa
+    _WITH_PYG_LIB = True
+except ImportError:
+    _WITH_PYG_LIB = False
+
 
 class NeighborSampler(BaseSampler):
     r"""An implementation of an in-memory neighbor sampler."""
@@ -42,11 +47,9 @@ def __init__(
         # If we are working with a `Data` object, convert the edge_index to
         # CSC and store it:
         if isinstance(data, Data):
+            self.node_time = None
             if time_attr is not None:
-                # TODO `time_attr` support for homogeneous graphs
-                raise ValueError(
-                    f"'time_attr' attribute not yet supported for "
-                    f"'{data.__class__.__name__}' object")
+                self.node_time = data[time_attr]
 
             # Convert the graph data into a suitable format for sampling.
             out = to_csc(data, device='cpu', share_memory=share_memory,
@@ -57,10 +60,9 @@ def __init__(
         # If we are working with a `HeteroData` object, convert each edge
         # type's edge_index to CSC and store it:
         elif isinstance(data, HeteroData):
+            self.node_time_dict = None
             if time_attr is not None:
                 self.node_time_dict = data.collect(time_attr)
-            else:
-                self.node_time_dict = None
 
             self.node_types, self.edge_types = data.metadata()
             self._set_num_neighbors_and_num_hops(num_neighbors)
@@ -145,8 +147,8 @@ def __init__(
             self.num_neighbors = remap_keys(self.num_neighbors,
                                             self.to_rel_type)
         else:
-            raise TypeError(
-                f'{self.__class__.__name__} found invalid type: {type(data)}')
+            raise TypeError(f"'{self.__class__.__name__}'' found invalid "
+                            f"type: '{type(data)}'")
 
     def _set_num_neighbors_and_num_hops(self, num_neighbors):
         if isinstance(num_neighbors, (list, tuple)):
@@ -160,7 +162,127 @@ def _set_num_neighbors_and_num_hops(self, num_neighbors):
         self.num_hops = max([0] +
                             [len(v) for v in self.num_neighbors.values()])
 
-    def _sparse_neighbor_sample(self, index: Tensor):
+    def sample(
+        self,
+        index: SamplerInput,
+        **kwargs,
+    ) -> Union[SamplerOutput, HeteroSamplerOutput]:
+        r"""Implements neighbor sampling by calling :obj:`pyg-lib` or
+        :obj:`torch-sparse` sampling routines, conditional on the type of
+        :obj:`data` object."""
+        if self.data_cls == 'custom' or issubclass(self.data_cls, HeteroData):
+            if _WITH_PYG_LIB:
+                # TODO (matthias) Add `disjoint` option to `NeighborSampler`
+                # TODO (matthias) `return_edge_id` if edge features present
+                disjoint = self.node_time_dict is not None
+                out = torch.ops.pyg.hetero_neighbor_sample_cpu(
+                    self.node_types,
+                    self.edge_types,
+                    self.colptr_dict,
+                    self.row_dict,
+                    {self.input_type: index},  # seed_dict
+                    self.num_neighbors,
+                    kwargs.get('node_time_dict', self.node_time_dict),
+                    True,  # csc
+                    self.replace,
+                    self.directed,
+                    disjoint,
+                    True,  # return_edge_id
+                )
+                row, col, node, edge, batch = out + (None, )
+                if disjoint:
+                    node = {k: v.t().contiguous() for k, v in node.items()}
+                    batch = {k: v[0] for k, v in node.items()}
+                    node = {k: v[1] for k, v in node.items()}
+
+            else:  # _WITH_PYTORCH_SPARSE
+                if self.node_time_dict is None:
+                    out = torch.ops.torch_sparse.hetero_neighbor_sample(
+                        self.node_types,
+                        self.edge_types,
+                        self.colptr_dict,
+                        self.row_dict,
+                        {self.input_type: index},  # seed
+                        self.num_neighbors,
+                        self.num_hops,
+                        self.replace,
+                        self.directed,
+                    )
+                else:
+                    fn = torch.ops.torch_sparse.hetero_temporal_neighbor_sample
+                    out = fn(
+                        self.node_types,
+                        self.edge_types,
+                        self.colptr_dict,
+                        self.row_dict,
+                        {self.input_type: index},  # seed_dict
+                        self.num_neighbors,
+                        kwargs.get('node_time_dict', self.node_time_dict),
+                        self.num_hops,
+                        self.replace,
+                        self.directed,
+                    )
+                node, row, col, edge, batch = out + (None, )
+
+            return HeteroSamplerOutput(
+                node=node,
+                row=remap_keys(row, self.to_edge_type),
+                col=remap_keys(col, self.to_edge_type),
+                edge=remap_keys(edge, self.to_edge_type),
+                batch=batch,
+                metadata=index.numel(),
+            )
+
+        if issubclass(self.data_cls, Data):
+            if _WITH_PYG_LIB:
+                # TODO (matthias) Add `disjoint` option to `NeighborSampler`
+                # TODO (matthias) `return_edge_id` if edge features present
+                disjoint = self.node_time is not None
+                out = torch.ops.pyg.neighbor_sample(
+                    self.colptr,
+                    self.row,
+                    index,  # seed
+                    self.num_neighbors,
+                    kwargs.get('node_time', self.node_time),
+                    True,  # csc
+                    self.replace,
+                    self.directed,
+                    disjoint,
+                    True,  # return_edge_id
+                )
+                row, col, node, edge, batch = out + (None, )
+                if disjoint:
+                    batch, node = node.t().contiguous()
+
+            else:  # _WITH_PYTORCH_SPARSE
+                if self.node_time is not None:
+                    raise ValueError("'time_attr' not supported for "
+                                     "neighbor sampling via 'torch-sparse'")
+                out = torch.ops.torch_sparse.neighbor_sample(
+                    self.colptr,
+                    self.row,
+                    index,  # seed
+                    self.num_neighbors,
+                    self.replace,
+                    self.directed,
+                )
+                node, row, col, edge, batch = out + (None, )
+
+            return SamplerOutput(
+                node=node,
+                row=row,
+                col=col,
+                edge=edge,
+                batch=batch,
+                metadata=index.numel(),
+            )
+
+        raise TypeError(f"'{self.__class__.__name__}'' found invalid "
+                        f"type: '{type(self.data_cls)}'")
+
+    # TODO Remove once better link prediction sample support lands ############
+
+    def _sparse_neighbor_sample(self, index: torch.Tensor):
         fn = torch.ops.torch_sparse.neighbor_sample
         node, row, col, edge = fn(
             self.colptr,
@@ -174,7 +296,7 @@ def _sparse_neighbor_sample(self, index: Tensor):
 
     def _hetero_sparse_neighbor_sample(
         self,
-        index_dict: Dict[str, Tensor],
+        index_dict: Dict[str, torch.Tensor],
         **kwargs,
     ):
         if self.node_time_dict is None:
@@ -214,41 +336,6 @@ def _hetero_sparse_neighbor_sample(
             )
         return node_dict, row_dict, col_dict, edge_dict
 
-    def sample(
-        self,
-        index: SamplerInput,
-    ) -> Union[SamplerOutput, HeteroSamplerOutput]:
-        r"""Implements neighbor sampling by calling 'torch-sparse' sampling
-        routines, conditional on the type of data object."""
-
-        # Tuple[FeatureStore, GraphStore] currently only supports heterogeneous
-        # sampling:
-        if self.data_cls == 'custom' or issubclass(self.data_cls, HeteroData):
-            node, row, col, edge = self._hetero_sparse_neighbor_sample(
-                {self.input_type: index})
-
-            # Convert back from edge type strings to PyG EdgeType, as required
-            # by SamplerOutput:
-            return HeteroSamplerOutput(
-                metadata=index.numel(),
-                node=node,
-                row=remap_keys(row, self.to_edge_type),
-                col=remap_keys(col, self.to_edge_type),
-                edge=remap_keys(edge, self.to_edge_type),
-            )
-        elif issubclass(self.data_cls, Data):
-            node, row, col, edge = self._sparse_neighbor_sample(index)
-            return SamplerOutput(
-                metadata=index.numel(),
-                node=node,
-                row=row,
-                col=col,
-                edge=edge,
-            )
-        else:
-            raise TypeError(f'{self.__class__.__name__} found invalid type: '
-                            f'{type(self.data_cls)}')
-
 
 ###############################################################################