From 25f6d30611f827f8509a20b9793acbcd5689f112 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Fri, 18 Jan 2019 12:52:02 +0800 Subject: [PATCH 01/10] signed and unsigned integer conversion --- include/dgl/immutable_graph.h | 4 ++-- src/c_api_common.cc | 2 +- src/graph/graph.cc | 20 ++++++++++---------- src/graph/graph_apis.cc | 2 +- src/graph/immutable_graph.cc | 18 +++++++++--------- 5 files changed, 23 insertions(+), 23 deletions(-) diff --git a/include/dgl/immutable_graph.h b/include/dgl/immutable_graph.h index 849b160847e7..35133b2eb9f1 100644 --- a/include/dgl/immutable_graph.h +++ b/include/dgl/immutable_graph.h @@ -56,7 +56,7 @@ class ImmutableGraph: public GraphInterface { return indices.size(); } - int64_t GetDegree(dgl_id_t vid) const { + uint64_t GetDegree(dgl_id_t vid) const { return indptr[vid + 1] - indptr[vid]; } DegreeArray GetDegrees(IdArray vids) const; @@ -81,7 +81,7 @@ class ImmutableGraph: public GraphInterface { * we simply sort on the input edge list. We allow sorting on both end points of an edge, * which is specified by `sort_on`. */ - static CSR::Ptr FromEdges(std::vector *edges, int sort_on, int64_t num_nodes); + static CSR::Ptr FromEdges(std::vector *edges, int sort_on, uint64_t num_nodes); }; /*! \brief Construct an immutable graph from the COO format. 
*/ diff --git a/src/c_api_common.cc b/src/c_api_common.cc index 0326f87a9b8b..63184615b82c 100644 --- a/src/c_api_common.cc +++ b/src/c_api_common.cc @@ -24,7 +24,7 @@ DLManagedTensor* CreateTmpDLManagedTensor(const DGLArgValue& arg) { PackedFunc ConvertNDArrayVectorToPackedFunc(const std::vector& vec) { auto body = [vec](DGLArgs args, DGLRetValue* rv) { - const int which = args[0]; + const size_t which = args[0]; if (which >= vec.size()) { LOG(FATAL) << "invalid choice"; } else { diff --git a/src/graph/graph.cc b/src/graph/graph.cc index b64b7aa59625..f5a69ea5c8ed 100644 --- a/src/graph/graph.cc +++ b/src/graph/graph.cc @@ -20,14 +20,14 @@ Graph::Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_node CHECK(IsValidIdArray(edge_ids)); this->AddVertices(num_nodes); num_edges_ = src_ids->shape[0]; - CHECK(num_edges_ == dst_ids->shape[0]) << "vectors in COO must have the same length"; - CHECK(num_edges_ == edge_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; const dgl_id_t *src_data = static_cast(src_ids->data); const dgl_id_t *dst_data = static_cast(dst_ids->data); const dgl_id_t *edge_data = static_cast(edge_ids->data); all_edges_src_.reserve(num_edges_); all_edges_dst_.reserve(num_edges_); - for (int64_t i = 0; i < num_edges_; i++) { + for (uint64_t i = 0; i < num_edges_; i++) { auto src = src_data[i]; auto dst = dst_data[i]; auto eid = edge_data[i]; @@ -504,10 +504,10 @@ Subgraph Graph::EdgeSubgraph(IdArray eids) const { } std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const { - int64_t num_edges = NumEdges(); - int64_t num_nodes = NumVertices(); + uint64_t num_edges = NumEdges(); + uint64_t num_nodes = NumVertices(); if (fmt == "coo") { - IdArray idx = IdArray::Empty({2 * num_edges}, DLDataType{kDLInt, 64, 1}, 
DLContext{kDLCPU, 0}); + IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *idx_data = static_cast(idx->data); if (transpose) { std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data); @@ -516,17 +516,17 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const std::copy(all_edges_dst_.begin(), all_edges_dst_.end(), idx_data); std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data + num_edges); } - IdArray eid = IdArray::Empty({num_edges}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *eid_data = static_cast(eid->data); for (uint64_t eid = 0; eid < num_edges; ++eid) { eid_data[eid] = eid; } return std::vector{idx, eid}; } else if (fmt == "csr") { - IdArray indptr = IdArray::Empty({num_nodes + 1}, DLDataType{kDLInt, 64, 1}, + IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray indices = IdArray::Empty({num_edges}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray eid = IdArray::Empty({num_edges}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *indptr_data = static_cast(indptr->data); int64_t *indices_data = static_cast(indices->data); int64_t *eid_data = static_cast(eid->data); diff --git a/src/graph/graph_apis.cc b/src/graph/graph_apis.cc index dac42cd10b5b..58726437460f 100644 --- a/src/graph/graph_apis.cc +++ b/src/graph/graph_apis.cc @@ -70,7 +70,7 @@ PackedFunc ConvertSubgraphToPackedFunc(const Subgraph& sg) { // Convert Sampled Subgraph structures to PackedFunc. 
PackedFunc ConvertSubgraphToPackedFunc(const std::vector& sg) { auto body = [sg] (DGLArgs args, DGLRetValue* rv) { - const int which = args[0]; + const size_t which = args[0]; if (which < sg.size()) { GraphInterface* gptr = sg[which].graph->Reset(); GraphHandle ghandle = gptr; diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index 0fafc1797898..108c2994b6f6 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -191,7 +191,7 @@ std::pair ImmutableGraph::CSR::VertexSubgraph // Store the non-zeros in a subgraph with edge attributes of new edge ids. sub_csr->edge_ids.resize(sub_csr->indices.size()); - for (int64_t i = 0; i < sub_csr->edge_ids.size(); i++) + for (size_t i = 0; i < sub_csr->edge_ids.size(); i++) sub_csr->edge_ids[i] = i; IdArray rst_eids = IdArray::Empty({static_cast(orig_edge_ids.size())}, @@ -203,7 +203,7 @@ std::pair ImmutableGraph::CSR::VertexSubgraph } ImmutableGraph::CSR::Ptr ImmutableGraph::CSR::FromEdges(std::vector *edges, - int sort_on, int64_t num_nodes) { + int sort_on, uint64_t num_nodes) { CHECK(sort_on == 0 || sort_on == 1) << "we must sort on the first or the second vector"; int other_end = sort_on == 1 ? 0 : 1; // TODO(zhengda) we should sort in parallel. @@ -285,7 +285,7 @@ BoolArray ImmutableGraph::HasVertices(IdArray vids) const { BoolArray rst = BoolArray::Empty({len}, vids->dtype, vids->ctx); const dgl_id_t* vid_data = static_cast(vids->data); dgl_id_t* rst_data = static_cast(rst->data); - const int64_t nverts = NumVertices(); + const uint64_t nverts = NumVertices(); for (int64_t i = 0; i < len; ++i) { rst_data[i] = (vid_data[i] < nverts)? 
1 : 0; } @@ -804,7 +804,7 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, // BFS traverse the graph and sample vertices // std::unordered_set sub_ver_map; - std::vector > sub_vers; + std::vector > sub_vers; sub_vers.reserve(num_seeds * 10); // add seed vertices for (size_t i = 0; i < num_seeds; ++i) { @@ -893,20 +893,20 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, // Copy sub_ver_map to output[0] // Copy layer - int64_t num_vertices = sub_ver_map.size(); + uint64_t num_vertices = sub_ver_map.size(); std::sort(sub_vers.begin(), sub_vers.end(), [](const std::pair &a1, const std::pair &a2) { return a1.first < a2.first; }); SampledSubgraph subg; - subg.induced_vertices = IdArray::Empty({num_vertices}, + subg.induced_vertices = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.induced_edges = IdArray::Empty({num_edges}, + subg.induced_edges = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.layer_ids = IdArray::Empty({num_vertices}, + subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.sample_prob = runtime::NDArray::Empty({num_vertices}, + subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0}); dgl_id_t *out = static_cast(subg.induced_vertices->data); From 6255aab73e1fb1c633e2944f423ceac47d3c5502 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Fri, 18 Jan 2019 13:41:21 +0800 Subject: [PATCH 02/10] replace long with uint64_t --- src/graph/graph.cc | 14 +++++++------- src/graph/immutable_graph.cc | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/graph/graph.cc b/src/graph/graph.cc index f5a69ea5c8ed..0f6f6877ab26 100644 --- a/src/graph/graph.cc +++ b/src/graph/graph.cc @@ -20,8 +20,8 @@ Graph::Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_node 
CHECK(IsValidIdArray(edge_ids)); this->AddVertices(num_nodes); num_edges_ = src_ids->shape[0]; - CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; - CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; const dgl_id_t *src_data = static_cast(src_ids->data); const dgl_id_t *dst_data = static_cast(dst_ids->data); const dgl_id_t *edge_data = static_cast(edge_ids->data); @@ -507,7 +507,7 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const uint64_t num_edges = NumEdges(); uint64_t num_nodes = NumVertices(); if (fmt == "coo") { - IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *idx_data = static_cast(idx->data); if (transpose) { std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data); @@ -516,17 +516,17 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const std::copy(all_edges_dst_.begin(), all_edges_dst_.end(), idx_data); std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data + num_edges); } - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *eid_data = static_cast(eid->data); for (uint64_t eid = 0; eid < num_edges; ++eid) { eid_data[eid] = eid; } return std::vector{idx, eid}; } else if (fmt == "csr") { - IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, DLDataType{kDLInt, 64, 1}, + IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, 
DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *indptr_data = static_cast(indptr->data); int64_t *indices_data = static_cast(indices->data); int64_t *eid_data = static_cast(eid->data); diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index 108c2994b6f6..e3d9337a2239 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -900,13 +900,13 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, }); SampledSubgraph subg; - subg.induced_vertices = IdArray::Empty({static_cast(num_vertices)}, + subg.induced_vertices = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.induced_edges = IdArray::Empty({static_cast(num_edges)}, + subg.induced_edges = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, + subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, + subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0}); dgl_id_t *out = static_cast(subg.induced_vertices->data); From 353f622a7af247470f34215c3eb1b6f7ad8914b9 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Fri, 18 Jan 2019 13:50:45 +0800 Subject: [PATCH 03/10] replace uint64_t with int64_t --- src/graph/graph.cc | 14 +++++++------- src/graph/immutable_graph.cc | 8 ++++---- 2 files 
changed, 11 insertions(+), 11 deletions(-) diff --git a/src/graph/graph.cc b/src/graph/graph.cc index 0f6f6877ab26..3928f505fc72 100644 --- a/src/graph/graph.cc +++ b/src/graph/graph.cc @@ -20,8 +20,8 @@ Graph::Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_node CHECK(IsValidIdArray(edge_ids)); this->AddVertices(num_nodes); num_edges_ = src_ids->shape[0]; - CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; - CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; const dgl_id_t *src_data = static_cast(src_ids->data); const dgl_id_t *dst_data = static_cast(dst_ids->data); const dgl_id_t *edge_data = static_cast(edge_ids->data); @@ -507,7 +507,7 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const uint64_t num_edges = NumEdges(); uint64_t num_nodes = NumVertices(); if (fmt == "coo") { - IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *idx_data = static_cast(idx->data); if (transpose) { std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data); @@ -516,17 +516,17 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const std::copy(all_edges_dst_.begin(), all_edges_dst_.end(), idx_data); std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data + num_edges); } - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *eid_data = static_cast(eid->data); for (uint64_t 
eid = 0; eid < num_edges; ++eid) { eid_data[eid] = eid; } return std::vector{idx, eid}; } else if (fmt == "csr") { - IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, DLDataType{kDLInt, 64, 1}, + IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *indptr_data = static_cast(indptr->data); int64_t *indices_data = static_cast(indices->data); int64_t *eid_data = static_cast(eid->data); diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index e3d9337a2239..4fe3e17877c3 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -900,13 +900,13 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, }); SampledSubgraph subg; - subg.induced_vertices = IdArray::Empty({static_cast(num_vertices)}, + subg.induced_vertices = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.induced_edges = IdArray::Empty({static_cast(num_edges)}, + subg.induced_edges = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, + subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, + subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0}); dgl_id_t *out = static_cast(subg.induced_vertices->data); 
From 020964d6596f0e766266257a71ff39c2faee3b30 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Mon, 21 Jan 2019 11:04:47 +0800 Subject: [PATCH 04/10] layer sampler --- include/dgl/graph_interface.h | 8 ++ src/graph/immutable_graph.cc | 146 +++++++++++++++++++++++++++++++++- 2 files changed, 151 insertions(+), 3 deletions(-) diff --git a/include/dgl/graph_interface.h b/include/dgl/graph_interface.h index 7e617ac9d5c5..f7e0895e1730 100644 --- a/include/dgl/graph_interface.h +++ b/include/dgl/graph_interface.h @@ -340,6 +340,14 @@ class GraphInterface { */ virtual SampledSubgraph NeighborUniformSample(IdArray seeds, const std::string &neigh_type, int num_hops, int expand_factor) const = 0; + + /*! + * \brief Sample a subgraph from the seed vertices with layer sampling. + * The layers are sampled with a uniform distribution. + * \return a subgraph + */ + /* virtual SampledSubgraph LayerUniformSample(IdArray seeds, const std::string &neigh_type, + int n_layers, size_t layer_size) const = 0; */ }; /*! 
\brief Subgraph data structure */ diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index 4fe3e17877c3..4b9b90e3672c 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -788,7 +788,7 @@ struct neigh_list { : neighs(_neighs), edges(_edges) {} }; -SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, +SampledSubgraph ImmutableGraph::NeighborSample(IdArray seed_arr, const float* probability, const std::string &neigh_type, int num_hops, @@ -907,7 +907,7 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, - DLDataType{kDLFloat, 32, 1}, DLContext{kDLCPU, 0}); + DLDataType{kDLFloat, 8 * sizeof(float), 1}, DLContext{kDLCPU, 0}); dgl_id_t *out = static_cast(subg.induced_vertices->data); dgl_id_t *out_layer = static_cast(subg.layer_ids->data); @@ -975,6 +975,134 @@ SampledSubgraph ImmutableGraph::SampleSubgraph(IdArray seed_arr, return subg; } +SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, + const float* probability, + const std::string &neigh_type, + int n_layers, size_t layer_size) const { + unsigned int rand_seed = time(nullptr); + auto g_csr = neigh_type == "in" ? 
GetInCSR() : GetOutCSR(); + const int64_t* indptr = g_csr->indptr.data(); + const dgl_id_t* indices = g_csr->indices.data(); + const dgl_id_t* eids = g_csr->edge_ids.data(); + + std::vector nodes; + std::vector layer_ids; + std::vector probabilities; + std::vector edges; + + std::unordered_set candidate_set; + std::vector candidates; + std::map n_times; + + size_t n_seeds = seed_array->shape[0]; + const dgl_id_t* seed_data = static_cast(seed_array->data); + candidate_set.insert(seed_data, seed_data + n_seeds); + std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(nodes)); + layer_ids.insert(layer_ids.end(), nodes.size(), 0); + probabilities.insert(probabilities.end(), nodes.size(), 0); + + std::vector positions {0, nodes.size()}; + for (int i = 0; i != n_layers; i++) { + candidate_set.clear(); + for (auto j = positions.end()[-2]; j != positions.back(); ++j) { + auto src = nodes[j]; + candidate_set.insert(indices + indptr[src], indices + indptr[src + 1]); + } + + candidates.clear(); + std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(candidates)); + + auto n_candidates = candidates.size(); + n_times.clear(); + for (size_t j = 0; j != layer_size; ++j) { + auto dst = rand_r(&rand_seed) % n_candidates; + if (!n_times.insert(std::make_pair(dst, 1)).second) { + ++n_times[dst]; + } + } + + auto n_nodes = static_cast(NumVertices()); + for (auto const &pair : n_times) { + nodes.push_back(candidates[pair.first]); + layer_ids.push_back(i + 1); + probabilities.push_back(pair.second / n_nodes); + } + + positions.push_back(nodes.size()); + } + + /* + std::vector idx(nodes.size()); + iota(idx.begin(), idx.end(), 0); + auto less = [&nodes](size_t i, size_t j) { + return nodes[i] < nodes[j]; + }; + sort(idx.begin(), idx.end(), less); + + for (size_t i = 0; i < n_nodes; i++) { + subg.induced_vertices[i] = nodes[idx[i]]; + subg.layer_ids[i] = node_info[i].first; + subg.sample_prob[i] = node_info[i].second; + } + */ + + std::vector ret_indices; + std::vector ret_eids; + 
std::vector new_indptr; + std::vector new_indices; + std::vector new_eids; + new_indptr.push_back(0); + for (size_t i = 0; i < positions.size() - 2; ++i) { + auto curr_begin = positions[i]; + auto curr_end = positions[i + 1]; + // auto curr_len = curr_end - curr_begin; + auto next_begin = curr_end; + auto next_end = positions[i + 2]; + auto next_len = next_end - next_begin; + HashTableChecker checker(&nodes[next_begin], next_len); + for (size_t j = curr_begin; j != curr_end; ++j) { + ret_indices.clear(); + ret_eids.clear(); + auto src = nodes[j]; + auto dis = indptr[src]; + auto len = indptr[src + 1] - indptr[src]; + checker.CollectOnRow(indices + dis, eids + dis, len, &ret_indices, &ret_eids); + new_indptr.push_back(ret_indices.size()); + new_indices.insert(new_indices.end(), ret_indices.begin(), ret_indices.end()); + new_eids.insert(new_eids.end(), ret_eids.begin(), ret_eids.end()); + } + } + + long n_nodes = nodes.size(); + long n_edges = new_eids.size(); + auto subg_csr = std::make_shared(n_nodes, n_edges); + subg_csr->indptr.resize(n_nodes + 1); + subg_csr->indices.resize(n_edges); + subg_csr->edge_ids.resize(n_edges); + std::copy(new_indptr.begin(), new_indptr.end(), subg_csr->indptr.data()); + std::copy(new_indices.begin(), new_indices.end(), subg_csr->indices.data()); + std::copy(new_eids.begin(), new_eids.end(), subg_csr->edge_ids.data()); + + SampledSubgraph subg; + subg.induced_vertices = IdArray::Empty({n_nodes}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + subg.induced_edges = IdArray::Empty({n_edges}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + subg.layer_ids = IdArray::Empty({n_nodes}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + subg.sample_prob = runtime::NDArray::Empty({n_nodes}, + DLDataType{kDLFloat, 8 * sizeof(float), 1}, DLContext{kDLCPU, 0}); + std::copy(nodes.begin(), nodes.end(), static_cast(subg.induced_vertices->data)); + std::copy(layer_ids.begin(), layer_ids.end(), static_cast(subg.layer_ids->data)); + 
std::copy(probabilities.begin(), probabilities.end(), static_cast(subg.sample_prob->data)); + + if (neigh_type == "in") + subg.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr, IsMultigraph())); + else + subg.graph = GraphPtr(new ImmutableGraph(nullptr, subg_csr, IsMultigraph())); + + return subg; +} void CompactSubgraph(ImmutableGraph::CSR *subg, const std::unordered_map &id_map) { for (size_t i = 0; i < subg->indices.size(); i++) { @@ -1000,7 +1128,7 @@ void ImmutableGraph::CompactSubgraph(IdArray induced_vertices) { SampledSubgraph ImmutableGraph::NeighborUniformSample(IdArray seeds, const std::string &neigh_type, int num_hops, int expand_factor) const { - auto ret = SampleSubgraph(seeds, // seed vector + auto ret = NeighborSample(seeds, // seed vector nullptr, // sample_id_probability neigh_type, num_hops, @@ -1009,4 +1137,16 @@ SampledSubgraph ImmutableGraph::NeighborUniformSample(IdArray seeds, return ret; } +SampledSubgraph ImmutableGraph::LayerUniformSample(IdArray seeds, + const std::string &neigh_type, + int n_layers, size_t layer_size) const { + auto ret = LayerSample(seeds, + nullptr, + neigh_type, + n_layers, + layer_size); + std::static_pointer_cast(ret.graph)->CompactSubgraph(ret.induced_vertices); + return ret; +} + } // namespace dgl From bb78184618dd0277bd2d4543ab4b9051bc2f237d Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Mon, 21 Jan 2019 11:50:28 +0800 Subject: [PATCH 05/10] c api interface --- include/dgl/immutable_graph.h | 14 ++- python/dgl/contrib/sampling/sampler.py | 128 ++++++++++++++++++++++++- python/dgl/graph_index.py | 61 +++++++++--- src/graph/graph_apis.cc | 55 +++++++++-- 4 files changed, 236 insertions(+), 22 deletions(-) diff --git a/include/dgl/immutable_graph.h b/include/dgl/immutable_graph.h index 35133b2eb9f1..4693936d1ded 100644 --- a/include/dgl/immutable_graph.h +++ b/include/dgl/immutable_graph.h @@ -464,6 +464,14 @@ class ImmutableGraph: public GraphInterface { SampledSubgraph NeighborUniformSample(IdArray seeds, 
const std::string &neigh_type, int num_hops, int expand_factor) const; + /*! + * \brief Sample a subgraph from the seed vertices with layer sampling. + * The layers are sampled with a uniform distribution. + * \return a subgraph + */ + SampledSubgraph LayerUniformSample(IdArray seeds, const std::string &neigh_type, + int n_layers, size_t layer_size) const; + /*! * \brief Get the adjacency matrix of the graph. * @@ -517,10 +525,14 @@ class ImmutableGraph: public GraphInterface { */ CSRArray GetOutCSRArray() const; - SampledSubgraph SampleSubgraph(IdArray seed_arr, const float* probability, + SampledSubgraph NeighborSample(IdArray seed_arr, const float* probability, const std::string &neigh_type, int num_hops, size_t num_neighbor) const; + SampledSubgraph LayerSample(IdArray seed_arr, const float* probability, + const std::string &neigh_type, + int n_layers, size_t layer_size) const; + /*! * \brief Compact a subgraph. * In a sampled subgraph, the vertex Id is still in the ones in the original graph. 
diff --git a/python/dgl/contrib/sampling/sampler.py b/python/dgl/contrib/sampling/sampler.py index 65ba70fd4372..62296cf1d83a 100644 --- a/python/dgl/contrib/sampling/sampler.py +++ b/python/dgl/contrib/sampling/sampler.py @@ -14,7 +14,7 @@ except ImportError: import queue -__all__ = ['NeighborSampler'] +__all__ = ['NeighborSampler', 'LayerSampler'] class NSSubgraphLoader(object): def __init__(self, g, batch_size, expand_factor, num_hops=1, @@ -79,6 +79,69 @@ def __next__(self): aux_infos['seeds'] = self._seed_ids.pop(0).tousertensor() return self._subgraphs.pop(0), aux_infos +class LSSubgraphLoader(object): + def __init__(self, g, batch_size, layer_size, n_layers=1, + neighbor_type='in', node_prob=None, seed_nodes=None, + shuffle=False, num_workers=1, return_seed_id=False): + self._g = g + if not g._graph.is_readonly(): + raise NotImplementedError("subgraph loader only support read-only graphs.") + self._batch_size = batch_size + self._layer_size = layer_size + self._n_layers = n_layers + self._node_prob = node_prob + self._return_seed_id = return_seed_id + if self._node_prob is not None: + assert self._node_prob.shape[0] == g.number_of_nodes(), \ + "We need to know the sampling probability of every node" + if seed_nodes is None: + self._seed_nodes = F.arange(0, g.number_of_nodes()) + else: + self._seed_nodes = seed_nodes + if shuffle: + self._seed_nodes = F.rand_shuffle(self._seed_nodes) + self._num_workers = num_workers + self._neighbor_type = neighbor_type + self._subgraphs = [] + self._seed_ids = [] + self._subgraph_idx = 0 + + def _prefetch(self): + seed_ids = [] + num_nodes = len(self._seed_nodes) + for i in range(self._num_workers): + start = self._subgraph_idx * self._batch_size + # if we have visited all nodes, don't do anything. 
+ if start >= num_nodes: + break + end = min((self._subgraph_idx + 1) * self._batch_size, num_nodes) + seed_ids.append(utils.toindex(self._seed_nodes[start:end])) + self._subgraph_idx += 1 + sgi = self._g._graph.layer_sampling(seed_ids, self._layer_size, + self._n_layers, self._neighbor_type, + self._node_prob) + subgraphs = [DGLSubGraph(self._g, i.induced_nodes, i.induced_edges, \ + i) for i in sgi] + self._subgraphs.extend(subgraphs) + if self._return_seed_id: + self._seed_ids.extend(seed_ids) + + def __iter__(self): + return self + + def __next__(self): + # If we don't have prefetched subgraphs, let's prefetch them. + if len(self._subgraphs) == 0: + self._prefetch() + # At this point, if we still don't have subgraphs, we must have + # iterate all subgraphs and we should stop the iterator now. + if len(self._subgraphs) == 0: + raise StopIteration + aux_infos = {} + if self._return_seed_id: + aux_infos['seeds'] = self._seed_ids.pop(0).tousertensor() + return self._subgraphs.pop(0), aux_infos + class _Prefetcher(object): """Internal shared prefetcher logic. It can be sub-classed by a Thread-based implementation or Process-based implementation.""" @@ -255,3 +318,66 @@ def NeighborSampler(g, batch_size, expand_factor, num_hops=1, return loader else: return _PrefetchingLoader(loader, num_prefetch=num_workers*2) + +def LayerSampler(g, batch_size, layer_size, n_layers=1, + neighbor_type='in', node_prob=None, seed_nodes=None, + shuffle=False, num_workers=1, + return_seed_id=False, prefetch=False): + '''Create a sampler that samples neighborhood. + + This creates a subgraph data loader that samples subgraphs from the input graph + with neighbor sampling. This sampling method is implemented in C and can perform + sampling very efficiently. + + A subgraph grows from a seed vertex. It contains sampled neighbors + of the seed vertex as well as the edges that connect neighbor nodes with + seed nodes. 
When the number of hops is k (>1), the neighbors are sampled + from the k-hop neighborhood. In this case, the sampled edges are the ones + that connect the source nodes and the sampled neighbor nodes of the source + nodes. + + The subgraph loader returns a list of subgraphs and a dictionary of additional + information about the subgraphs. The size of the subgraph list is the number of workers. + + The dictionary contains: + + - seeds: a list of 1D tensors of seed Ids, if return_seed_id is True. + + Parameters + ---------- + g: the DGLGraph where we sample subgraphs. + batch_size: The number of subgraphs in a batch. + layer_size: the number of neighbors sampled from the neighbor list + of a vertex. The value of this parameter can be + an integer: indicates the number of neighbors sampled from a neighbor list. + a floating-point: indicates the ratio of the sampled neighbors in a neighbor list. + string: indicates some common ways of calculating the number of sampled neighbors, + e.g., 'sqrt(deg)'. + n_layers: The size of the neighborhood where we sample vertices. + neighbor_type: indicates the neighbors on different types of edges. + "in" means the neighbors on the in-edges, "out" means the neighbors on + the out-edges and "both" means neighbors on both types of edges. + node_prob: the probability that a neighbor node is sampled. + 1D Tensor. None means uniform sampling. Otherwise, the number of elements + should be the same as the number of vertices in the graph. + seed_nodes: a list of nodes where we sample subgraphs from. + If it's None, the seed vertices are all vertices in the graph. + shuffle: indicates the sampled subgraphs are shuffled. + num_workers: the number of worker threads that sample subgraphs in parallel. + return_seed_id: indicates whether to return seed ids along with the subgraphs. + The seed Ids are in the parent graph. + prefetch : bool, default False + Whether to prefetch the samples in the next batch. 
+ + Returns + ------- + A subgraph iterator + The iterator returns a list of batched subgraphs and a dictionary of additional + information about the subgraphs. + ''' + loader = LSSubgraphLoader(g, batch_size, layer_size, n_layers, neighbor_type, node_prob, + seed_nodes, shuffle, num_workers, return_seed_id) + if not prefetch: + return loader + else: + return _PrefetchingLoader(loader, num_prefetch=num_workers*2) diff --git a/python/dgl/graph_index.py b/python/dgl/graph_index.py index f3326d338a14..5516a510b4f1 100644 --- a/python/dgl/graph_index.py +++ b/python/dgl/graph_index.py @@ -669,10 +669,25 @@ def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type, no seed_ids = [v.todgltensor() for v in seed_ids] num_subgs = len(seed_ids) if node_prob is None: - rst = _uniform_sampling(self, seed_ids, neighbor_type, num_hops, expand_factor) + rst = _uniform_neighbor_sampling(self, seed_ids, neighbor_type, num_hops, + expand_factor) else: - rst = _nonuniform_sampling(self, node_prob, seed_ids, neighbor_type, num_hops, - expand_factor) + rst = _nonuniform_neighbor_sampling(self, node_prob, seed_ids, neighbor_type, num_hops, expand_factor) + + return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), + utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] + + def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob): + """Neighborhood sampling""" + if len(seed_ids) == 0: + return [] + + seed_ids = [v.todgltensor() for v in seed_ids] + num_subgs = len(seed_ids) + if node_prob is None: + rst = _uniform_layer_sampling(self, seed_ids, layer_type, n_layers, layer_size) + else: + rst = _nonuniform_layer_sampling(self, node_prob, seed_ids, layer_type, n_layers, layer_size) return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] @@ -1003,19 +1018,19 @@ def create_graph_index(graph_data=None, multigraph=False, readonly=False): # 
TODO(zhengda): we'll support variable-length inputs. _NEIGHBOR_SAMPLING_APIS = { - 1: _CAPI_DGLGraphUniformSampling, - 2: _CAPI_DGLGraphUniformSampling2, - 4: _CAPI_DGLGraphUniformSampling4, - 8: _CAPI_DGLGraphUniformSampling8, - 16: _CAPI_DGLGraphUniformSampling16, - 32: _CAPI_DGLGraphUniformSampling32, - 64: _CAPI_DGLGraphUniformSampling64, - 128: _CAPI_DGLGraphUniformSampling128, + 1: _CAPI_DGLGraphNeighborUniformSampling, + 2: _CAPI_DGLGraphNeighborUniformSampling2, + 4: _CAPI_DGLGraphNeighborUniformSampling4, + 8: _CAPI_DGLGraphNeighborUniformSampling8, + 16: _CAPI_DGLGraphNeighborUniformSampling16, + 32: _CAPI_DGLGraphNeighborUniformSampling32, + 64: _CAPI_DGLGraphNeighborUniformSampling64, + 128: _CAPI_DGLGraphNeighborUniformSampling128, } _EMPTY_ARRAYS = [utils.toindex(F.ones(shape=(0), dtype=F.int64, ctx=F.cpu()))] -def _uniform_sampling(gidx, seed_ids, neigh_type, num_hops, expand_factor): +def _uniform_neighbor_sampling(gidx, seed_ids, neigh_type, num_hops, expand_factor): num_seeds = len(seed_ids) empty_ids = [] if len(seed_ids) > 1 and len(seed_ids) not in _NEIGHBOR_SAMPLING_APIS.keys(): @@ -1025,3 +1040,25 @@ def _uniform_sampling(gidx, seed_ids, neigh_type, num_hops, expand_factor): assert len(seed_ids) in _NEIGHBOR_SAMPLING_APIS.keys() return _NEIGHBOR_SAMPLING_APIS[len(seed_ids)](gidx._handle, *seed_ids, neigh_type, num_hops, expand_factor, num_seeds) + +_LAYER_SAMPLING_APIS = { + 1: _CAPI_DGLGraphLayerUniformSampling, + 2: _CAPI_DGLGraphLayerUniformSampling2, + 4: _CAPI_DGLGraphLayerUniformSampling4, + 8: _CAPI_DGLGraphLayerUniformSampling8, + 16: _CAPI_DGLGraphLayerUniformSampling16, + 32: _CAPI_DGLGraphLayerUniformSampling32, + 64: _CAPI_DGLGraphLayerUniformSampling64, + 128: _CAPI_DGLGraphLayerUniformSampling128, +} + +def _uniform_layer_sampling(gidx, seed_ids, neigh_type, n_layers, layer_size): + num_seeds = len(seed_ids) + empty_ids = [] + if len(seed_ids) > 1 and len(seed_ids) not in _LAYER_SAMPLING_APIS.keys(): + remain = 
2**int(math.ceil(math.log2(len(seed_ids)))) - len(seed_ids) + empty_ids = _EMPTY_ARRAYS[0:remain] + seed_ids.extend([empty.todgltensor() for empty in empty_ids]) + assert len(seed_ids) in _LAYER_SAMPLING_APIS.keys() + return _LAYER_SAMPLING_APIS[len(seed_ids)](gidx._handle, *seed_ids, neigh_type, + n_layers, layer_size, num_seeds) diff --git a/src/graph/graph_apis.cc b/src/graph/graph_apis.cc index 58726437460f..4b9dbf6f5f12 100644 --- a/src/graph/graph_apis.cc +++ b/src/graph/graph_apis.cc @@ -454,23 +454,62 @@ void CAPI_NeighborUniformSample(DGLArgs args, DGLRetValue* rv) { *rv = ConvertSubgraphToPackedFunc(subgs); } -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling") .set_body(CAPI_NeighborUniformSample<1>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling2") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling2") .set_body(CAPI_NeighborUniformSample<2>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling4") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling4") .set_body(CAPI_NeighborUniformSample<4>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling8") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling8") .set_body(CAPI_NeighborUniformSample<8>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling16") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling16") .set_body(CAPI_NeighborUniformSample<16>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling32") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling32") .set_body(CAPI_NeighborUniformSample<32>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling64") +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling64") .set_body(CAPI_NeighborUniformSample<64>); -DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphUniformSampling128") 
+DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphNeighborUniformSampling128") .set_body(CAPI_NeighborUniformSample<128>); +template +void CAPI_LayerUniformSample(DGLArgs args, DGLRetValue* rv) { + GraphHandle ghandle = args[0]; + std::vector seeds(num_seeds); + for (size_t i = 0; i < seeds.size(); i++) + seeds[i] = IdArray::FromDLPack(CreateTmpDLManagedTensor(args[i + 1])); + std::string neigh_type = args[num_seeds + 1]; + const int num_hops = args[num_seeds + 2]; + const int num_neighbors = args[num_seeds + 3]; + const int num_valid_seeds = args[num_seeds + 4]; + const GraphInterface *ptr = static_cast(ghandle); + const ImmutableGraph *gptr = dynamic_cast(ptr); + CHECK(gptr) << "sampling isn't implemented in mutable graph"; + CHECK(num_valid_seeds <= num_seeds); + std::vector subgs(seeds.size()); +#pragma omp parallel for + for (int i = 0; i < num_valid_seeds; i++) { + subgs[i] = gptr->LayerUniformSample(seeds[i], neigh_type, num_hops, num_neighbors); + } + *rv = ConvertSubgraphToPackedFunc(subgs); +} + +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling") +.set_body(CAPI_LayerUniformSample<1>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling2") +.set_body(CAPI_LayerUniformSample<2>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling4") +.set_body(CAPI_LayerUniformSample<4>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling8") +.set_body(CAPI_LayerUniformSample<8>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling16") +.set_body(CAPI_LayerUniformSample<16>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling32") +.set_body(CAPI_LayerUniformSample<32>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling64") +.set_body(CAPI_LayerUniformSample<64>); +DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphLayerUniformSampling128") +.set_body(CAPI_LayerUniformSample<128>); + DGL_REGISTER_GLOBAL("graph_index._CAPI_DGLGraphGetAdj") .set_body([] 
(DGLArgs args, DGLRetValue* rv) { GraphHandle ghandle = args[0]; From 358f7462b71ba974a0f2339c9c719854a91c8ace Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Mon, 21 Jan 2019 12:07:50 +0800 Subject: [PATCH 06/10] test layer sampler with sse --- examples/mxnet/sse/sse_batch.py | 36 ++++++++++++++++++------- python/dgl/contrib/sampling/__init__.py | 1 + python/dgl/contrib/sampling/sampler.py | 8 +++--- 3 files changed, 32 insertions(+), 13 deletions(-) diff --git a/examples/mxnet/sse/sse_batch.py b/examples/mxnet/sse/sse_batch.py index 808800ec2389..f964e9cdd8fe 100644 --- a/examples/mxnet/sse/sse_batch.py +++ b/examples/mxnet/sse/sse_batch.py @@ -266,9 +266,16 @@ def main(args, data): neigh_expand = args.neigh_expand # initialize graph dur = [] - sampler = dgl.contrib.sampling.NeighborSampler(g, args.batch_size, neigh_expand, - neighbor_type='in', num_workers=args.num_parallel_subgraphs, seed_nodes=train_vs, - shuffle=True, return_seed_id=True) + if args.sampler == 'neighbor': + sampler = dgl.contrib.sampling.NeighborSampler(g, args.batch_size, neigh_expand, + neighbor_type='in', num_workers=args.num_parallel_subgraphs, seed_nodes=train_vs, + shuffle=True, return_seed_id=True) + elif args.sampler == 'layer': + sampler = dgl.contrib.sampling.LayerSampler(g, args.batch_size, neigh_expand, + neighbor_type='in', num_workers=args.num_parallel_subgraphs, seed_nodes=train_vs, + shuffle=True, return_seed_id=True) + else: + raise RuntimeError("Unsupported sampler!") if args.cache_subgraph: sampler = CachedSubgraphLoader(sampler, shuffle=True) for epoch in range(args.n_epochs): @@ -313,11 +320,20 @@ def main(args, data): if args.cache_subgraph: sampler.restart() else: - sampler = dgl.contrib.sampling.NeighborSampler(g, args.batch_size, neigh_expand, - neighbor_type='in', - num_workers=args.num_parallel_subgraphs, - seed_nodes=train_vs, shuffle=True, - return_seed_id=True) + if args.sampler == 'neighbor': + sampler = dgl.contrib.sampling.NeighborSampler(g, args.batch_size, 
neigh_expand, + neighbor_type='in', + num_workers=args.num_parallel_subgraphs, + seed_nodes=train_vs, shuffle=True, + return_seed_id=True) + elif args.sampler == 'layer': + sampler = dgl.contrib.sampling.LayerSampler(g, args.batch_size, neigh_expand, + neighbor_type='in', + num_workers=args.num_parallel_subgraphs, + seed_nodes=train_vs, shuffle=True, + return_seed_id=True) + else: + raise RuntimeError("Unsupported sampler!") # test set accuracy logits = model_infer(g, eval_vs) @@ -368,7 +384,7 @@ def __init__(self, csr, num_feats): self.train_mask = None if __name__ == '__main__': - parser = argparse.ArgumentParser(description='GCN') + parser = argparse.ArgumentParser(description='SSE') register_data_args(parser) parser.add_argument("--graph-file", type=str, default="", help="graph file") @@ -400,6 +416,8 @@ def __init__(self, csr, num_feats): help="the number of subgraphs to construct in parallel.") parser.add_argument("--neigh-expand", type=int, default=16, help="the number of neighbors to sample.") + parser.add_argument("--sampler", type=str, default="neighbor", + help="neighbor/layer sampler") args = parser.parse_args() print("cache: " + str(args.cache_subgraph)) diff --git a/python/dgl/contrib/sampling/__init__.py b/python/dgl/contrib/sampling/__init__.py index 1999bf619390..18ad3f0031be 100644 --- a/python/dgl/contrib/sampling/__init__.py +++ b/python/dgl/contrib/sampling/__init__.py @@ -1 +1,2 @@ from .sampler import NeighborSampler +from .sampler import LayerSampler diff --git a/python/dgl/contrib/sampling/sampler.py b/python/dgl/contrib/sampling/sampler.py index 62296cf1d83a..dc93d5710361 100644 --- a/python/dgl/contrib/sampling/sampler.py +++ b/python/dgl/contrib/sampling/sampler.py @@ -319,10 +319,10 @@ def NeighborSampler(g, batch_size, expand_factor, num_hops=1, else: return _PrefetchingLoader(loader, num_prefetch=num_workers*2) -def NeighborSampler(g, batch_size, layer_size, n_layers=1, - neighbor_type='in', node_prob=None, seed_nodes=None, - 
shuffle=False, num_workers=1, - return_seed_id=False, prefetch=False): +def LayerSampler(g, batch_size, layer_size, n_layers=1, + neighbor_type='in', node_prob=None, seed_nodes=None, + shuffle=False, num_workers=1, + return_seed_id=False, prefetch=False): '''Create a sampler that samples neighborhood. This creates a subgraph data loader that samples subgraphs from the input graph From 49ea85700cfaa9d36c8deef337765ca70878285f Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Mon, 21 Jan 2019 15:24:17 +0800 Subject: [PATCH 07/10] test with multi-layer gcn --- examples/pytorch/gcn/gcn_ls.py | 281 +++++++++++++++++++++++++++++++++ 1 file changed, 281 insertions(+) create mode 100644 examples/pytorch/gcn/gcn_ls.py diff --git a/examples/pytorch/gcn/gcn_ls.py b/examples/pytorch/gcn/gcn_ls.py new file mode 100644 index 000000000000..760f33176b78 --- /dev/null +++ b/examples/pytorch/gcn/gcn_ls.py @@ -0,0 +1,281 @@ +''' +Adapted from Ziyue's code. +''' + + +import argparse, time, math +import numpy as np +import mxnet as mx +from mxnet import gluon +import dgl +from dgl import DGLGraph +import dgl.function as fn +from dgl.data import register_data_args, load_data +import scipy as sp +from dgl import utils + + +def generate_rand_graph(n): + arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64) + return dgl.DGLGraph(arr, readonly=True) + + +def sample_subgraph(g, seed_nodes, n_layers, layer_size): + induced_nodes = [] + seeds = seed_nodes + parent_uv_edges_per_hop = [] + for _ in range(n_layers): + for subg, aux in dgl.contrib.sampling.LayerSampler(g, 1000000, layer_size, + neighbor_type='in', + seed_nodes=np.array(seeds), + return_seed_id=True): + #seed_ids = aux['seeds'] + #print(g.in_edges(seeds, form='all')) + subg_src, subg_dst = subg.edges() + parent_nid = subg.parent_nid + src = parent_nid[subg_src] + dst = parent_nid[subg_dst] + parent_uv_edges_per_hop.append((src.asnumpy(), dst.asnumpy())) + #print((src, dst)) + seeds = 
list(np.unique(src.asnumpy())) + induced_nodes.extend(list(parent_nid.asnumpy())) + + subgraph = g.subgraph(list(np.unique(np.array(induced_nodes)))) + #print(subgraph.parent_nid) + #print(parent_uv_edges_per_hop) + subg_uv_edges_per_hop = [(subgraph.map_to_subgraph_nid(src).asnumpy(), + subgraph.map_to_subgraph_nid(dst).asnumpy()) + for src, dst in parent_uv_edges_per_hop] + #print(subg_uv_edges_per_hop) + return subgraph, subg_uv_edges_per_hop + + +def test_sample(): + g = generate_rand_graph(100) + n_layers = 3 + seeds = [10, 20] + layer_size = 2 + subgraph, subg_uv_edges_per_hop = sample_subgraph(g, seeds, n_layers, layer_size) + + +class GCNLayer(gluon.Block): + def __init__(self, + in_feats, + out_feats, + activation, + dropout, + bias=True): + super(GCNLayer, self).__init__() + with self.name_scope(): + stdv = 1. / math.sqrt(out_feats) + self.weight = self.params.get('weight', shape=(in_feats, out_feats), + init=mx.init.Uniform(stdv)) + if bias: + self.bias = self.params.get('bias', shape=(out_feats,), + init=mx.init.Uniform(stdv)) + else: + self.bias = None + self.activation = activation + self.dropout = dropout + + def forward(self, h, subg, subg_edges): + if self.dropout: + h = mx.nd.Dropout(h, p=self.dropout) + h = mx.nd.dot(h, self.weight.data(h.context)) + # TODO: the normalization term need to be tuned. + # temporarilly normalized by sqrt(layer_size). + # for an unbiased estimator, it should be normalized by n(v)/D(v), + # but might be bad in practice + h = h / math.sqrt(3.) + subg.ndata['h'] = h + if subg_edges is not None: + subg.send_and_recv(subg_edges, fn.copy_src(src='h', out='m'), + fn.sum(msg='m', out='h')) + else: + subg.update_all(fn.copy_src(src='h', out='m'), + fn.sum(msg='m', out='h')) + h = subg.ndata.pop('h') + # same as TODO above + h = h / math.sqrt(3.) 
+ # bias + if self.bias is not None: + h = h + self.bias.data(h.context) + if self.activation: + h = self.activation(h) + return h + + +class GCN(gluon.Block): + def __init__(self, + in_feats, + n_hidden, + n_classes, + n_layers, + activation, + dropout, + normalization): + super(GCN, self).__init__() + self.n_layers = n_layers + self.layers = gluon.nn.Sequential() + # input layer + self.layers.add(GCNLayer(in_feats, n_hidden, activation, 0.)) + # hidden layers + for i in range(n_layers - 1): + self.layers.add(GCNLayer(n_hidden, n_hidden, activation, dropout)) + # output layer + self.layers.add(GCNLayer(n_hidden, n_classes, None, dropout)) + + + def forward(self, subg, subg_edges_per_hop): + h = subg.ndata['in'] + + for i, layer in enumerate(self.layers): + h = layer(h, subg, subg_edges_per_hop[self.n_layers-i]) + + return h + + +def evaluate(model, g, n_layers, labels, mask): + pred = model(g, [None for i in range(n_layers)]).argmax(axis=1) + accuracy = ((pred == labels) * mask).sum() / mask.sum().asscalar() + return accuracy.asscalar() + + +def main(args): + # load and preprocess dataset + data = load_data(args) + + if args.self_loop: + data.graph.add_edges_from([(i,i) for i in range(len(data.graph))]) + + features = mx.nd.array(data.features) + labels = mx.nd.array(data.labels) + train_mask = mx.nd.array(data.train_mask) + val_mask = mx.nd.array(data.val_mask) + test_mask = mx.nd.array(data.test_mask) + in_feats = features.shape[1] + n_classes = data.num_labels + n_nodes = data.graph.number_of_nodes() + n_edges = data.graph.number_of_edges() + print("""----Data statistics------' + #Nodes %d + #Edges %d + #Classes %d + #Train samples %d + #Val samples %d + #Test samples %d""" % + (n_nodes, n_edges, n_classes, + train_mask.sum().asscalar(), + val_mask.sum().asscalar(), + test_mask.sum().asscalar())) + + if args.gpu < 0: + cuda = False + ctx = mx.cpu(0) + else: + cuda = True + ctx = mx.gpu(args.gpu) + + features = features.as_in_context(ctx) + labels = 
labels.as_in_context(ctx) + train_mask = train_mask.as_in_context(ctx) + val_mask = val_mask.as_in_context(ctx) + test_mask = test_mask.as_in_context(ctx) + + layer_size = args.layer_size + # create GCN model + g = DGLGraph(data.graph, readonly=True) + # normalization + degs = g.in_degrees().astype('float32') + degs[degs > layer_size] = layer_size + norm = mx.nd.power(degs, -0.5) + if cuda: + norm = norm.as_in_context(ctx) + g.ndata['norm'] = mx.nd.expand_dims(norm, 1) + g.ndata['in'] = features + + num_data = len(train_mask.asnumpy()) + full_idx = np.arange(0, num_data) + train_idx = full_idx[train_mask.asnumpy() == 1] + + seed_nodes = list(train_idx) + n_layers = args.n_layers + 1 + + model = GCN(in_feats, + args.n_hidden, + n_classes, + args.n_layers, + mx.nd.relu, + args.dropout, + args.normalization) + model.initialize(ctx=ctx) + n_train_samples = train_mask.sum().asscalar() + loss_fcn = gluon.loss.SoftmaxCELoss() + + # use optimizer + print(model.collect_params()) + trainer = gluon.Trainer(model.collect_params(), 'adam', + {'learning_rate': args.lr, 'wd': args.weight_decay}) + + # initialize graph + dur = [] + for epoch in range(args.n_epochs): + if epoch >= 3: + t0 = time.time() + + subg, subg_edges_per_hop = sample_subgraph(g, seed_nodes, n_layers, layer_size) + subg_train_mask = subg.map_to_subgraph_nid(train_idx) + subg.copy_from_parent() + + # forward + smask = np.zeros((len(subg.parent_nid),)) + smask[subg_train_mask.asnumpy()] = 1 + + with mx.autograd.record(): + pred = model(subg, subg_edges_per_hop) + loss = loss_fcn(pred, labels[subg.parent_nid], mx.nd.expand_dims(mx.nd.array(smask), 1)) + loss = loss.sum() / n_train_samples + + print(loss.asnumpy()) + loss.backward() + trainer.step(batch_size=1) + + if epoch >= 3: + dur.append(time.time() - t0) + acc = evaluate(model, g, n_layers, labels, val_mask) + print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " + "ETputs(KTEPS) {:.2f}". 
format( + epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000)) + + acc = evaluate(model, g, n_layers, labels, test_mask) + print("Test accuracy {:.2%}".format(acc)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='GCN') + register_data_args(parser) + parser.add_argument("--dropout", type=float, default=0.5, + help="dropout probability") + parser.add_argument("--gpu", type=int, default=-1, + help="gpu") + parser.add_argument("--lr", type=float, default=3e-2, + help="learning rate") + parser.add_argument("--n-epochs", type=int, default=200, + help="number of training epochs") + parser.add_argument("--n-hidden", type=int, default=16, + help="number of hidden gcn units") + parser.add_argument("--n-layers", type=int, default=1, + help="number of hidden gcn layers") + parser.add_argument("--layer-size", type=int, default=128, + help="number of neighbors to be sampled") + parser.add_argument("--normalization", + choices=['sym','left'], default=None, + help="graph normalization types (default=None)") + parser.add_argument("--self-loop", action='store_true', + help="graph self-loop (default=False)") + parser.add_argument("--weight-decay", type=float, default=5e-4, + help="Weight for L2 loss") + args = parser.parse_args() + + print(args) + + main(args) From 9a81b6bfc1221b6a167ce05d8bef767496264499 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Mon, 21 Jan 2019 15:38:07 +0800 Subject: [PATCH 08/10] lint --- python/dgl/graph_index.py | 6 ++++-- src/graph/graph.cc | 22 ++++++++++++++-------- src/graph/immutable_graph.cc | 19 ++++++++++++------- 3 files changed, 30 insertions(+), 17 deletions(-) diff --git a/python/dgl/graph_index.py b/python/dgl/graph_index.py index 5516a510b4f1..903468c0ae92 100644 --- a/python/dgl/graph_index.py +++ b/python/dgl/graph_index.py @@ -672,7 +672,8 @@ def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type, no rst = _uniform_neighbor_sampling(self, seed_ids, neighbor_type, 
num_hops, expand_factor) else: - rst = _nonuniform_neighbor_sampling(self, node_prob, seed_ids, neighbor_type, num_hops, expand_factor) + rst = _nonuniform_neighbor_sampling(self, node_prob, seed_ids, neighbor_type, + num_hops, expand_factor) return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] @@ -687,7 +688,8 @@ def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob): if node_prob is None: rst = _uniform_layer_sampling(self, seed_ids, layer_type, n_layers, layer_size) else: - rst = _nonuniform_layer_sampling(self, node_prob, seed_ids, layer_type, n_layers, layer_size) + rst = _nonuniform_layer_sampling(self, node_prob, seed_ids, layer_type, + n_layers, layer_size) return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] diff --git a/src/graph/graph.cc b/src/graph/graph.cc index 3928f505fc72..4865895400bc 100644 --- a/src/graph/graph.cc +++ b/src/graph/graph.cc @@ -20,8 +20,10 @@ Graph::Graph(IdArray src_ids, IdArray dst_ids, IdArray edge_ids, size_t num_node CHECK(IsValidIdArray(edge_ids)); this->AddVertices(num_nodes); num_edges_ = src_ids->shape[0]; - CHECK(static_cast(num_edges_) == dst_ids->shape[0]) << "vectors in COO must have the same length"; - CHECK(static_cast(num_edges_) == edge_ids->shape[0]) << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == dst_ids->shape[0]) + << "vectors in COO must have the same length"; + CHECK(static_cast(num_edges_) == edge_ids->shape[0]) + << "vectors in COO must have the same length"; const dgl_id_t *src_data = static_cast(src_ids->data); const dgl_id_t *dst_data = static_cast(dst_ids->data); const dgl_id_t *edge_data = static_cast(edge_ids->data); @@ -507,7 +509,8 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const uint64_t num_edges = NumEdges(); uint64_t num_nodes = NumVertices(); if 
(fmt == "coo") { - IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray idx = IdArray::Empty({2 * static_cast(num_edges)}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *idx_data = static_cast(idx->data); if (transpose) { std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data); @@ -516,17 +519,20 @@ std::vector Graph::GetAdj(bool transpose, const std::string &fmt) const std::copy(all_edges_dst_.begin(), all_edges_dst_.end(), idx_data); std::copy(all_edges_src_.begin(), all_edges_src_.end(), idx_data + num_edges); } - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *eid_data = static_cast(eid->data); for (uint64_t eid = 0; eid < num_edges; ++eid) { eid_data[eid] = eid; } return std::vector{idx, eid}; } else if (fmt == "csr") { - IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, DLDataType{kDLInt, 64, 1}, - DLContext{kDLCPU, 0}); - IdArray indices = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - IdArray eid = IdArray::Empty({static_cast(num_edges)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray indptr = IdArray::Empty({static_cast(num_nodes) + 1}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray indices = IdArray::Empty({static_cast(num_edges)}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + IdArray eid = IdArray::Empty({static_cast(num_edges)}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); int64_t *indptr_data = static_cast(indptr->data); int64_t *indices_data = static_cast(indices->data); int64_t *eid_data = static_cast(eid->data); diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index a3562a2f25aa..dd92cc2c9fe6 100644 --- a/src/graph/immutable_graph.cc +++ 
b/src/graph/immutable_graph.cc @@ -908,7 +908,8 @@ SampledSubgraph ImmutableGraph::NeighborSample(IdArray seed_arr, subg.layer_ids = IdArray::Empty({static_cast(num_vertices)}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); subg.sample_prob = runtime::NDArray::Empty({static_cast(num_vertices)}, - DLDataType{kDLFloat, 8 * sizeof(float), 1}, DLContext{kDLCPU, 0}); + DLDataType{kDLFloat, 8 * sizeof(float), 1}, + DLContext{kDLCPU, 0}); dgl_id_t *out = static_cast(subg.induced_vertices->data); dgl_id_t *out_layer = static_cast(subg.layer_ids->data); @@ -1074,8 +1075,8 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, } } - long n_nodes = nodes.size(); - long n_edges = new_eids.size(); + int64_t n_nodes = nodes.size(); + int64_t n_edges = new_eids.size(); auto subg_csr = std::make_shared(n_nodes, n_edges); subg_csr->indptr.resize(n_nodes + 1); subg_csr->indices.resize(n_edges); @@ -1092,10 +1093,14 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, subg.layer_ids = IdArray::Empty({n_nodes}, DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); subg.sample_prob = runtime::NDArray::Empty({n_nodes}, - DLDataType{kDLFloat, 8 * sizeof(float), 1}, DLContext{kDLCPU, 0}); - std::copy(nodes.begin(), nodes.end(), static_cast(subg.induced_vertices->data)); - std::copy(layer_ids.begin(), layer_ids.end(), static_cast(subg.layer_ids->data)); - std::copy(probabilities.begin(), probabilities.end(), static_cast(subg.sample_prob->data)); + DLDataType{kDLFloat, 8 * sizeof(float), 1}, + DLContext{kDLCPU, 0}); + std::copy(nodes.begin(), nodes.end(), + static_cast(subg.induced_vertices->data)); + std::copy(layer_ids.begin(), layer_ids.end(), + static_cast(subg.layer_ids->data)); + std::copy(probabilities.begin(), probabilities.end(), + static_cast(subg.sample_prob->data)); if (neigh_type == "in") subg.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr, IsMultigraph())); From 707af9ba84c9fbcada90c15233b324b810d2d410 Mon Sep 17 00:00:00 2001 From: Yu Gai Date: 
Mon, 28 Jan 2019 16:26:38 +0800 Subject: [PATCH 09/10] convergence issue in layer-wise sampling --- examples/mxnet/gcn/gcn_ls.py | 190 +++++++++++++++++ examples/pytorch/gcn/gcn_ls.py | 281 ------------------------- include/dgl/immutable_graph.h | 4 + python/dgl/contrib/sampling/sampler.py | 20 +- python/dgl/graph_index.py | 13 +- src/graph/graph_apis.cc | 1 + src/graph/immutable_graph.cc | 138 ++++++------ 7 files changed, 281 insertions(+), 366 deletions(-) create mode 100644 examples/mxnet/gcn/gcn_ls.py delete mode 100644 examples/pytorch/gcn/gcn_ls.py diff --git a/examples/mxnet/gcn/gcn_ls.py b/examples/mxnet/gcn/gcn_ls.py new file mode 100644 index 000000000000..a63183f9e184 --- /dev/null +++ b/examples/mxnet/gcn/gcn_ls.py @@ -0,0 +1,190 @@ +import argparse, time +import numpy as np +import mxnet as mx +import mxnet.ndarray as nd +from mxnet import gluon +import dgl +from dgl import DGLGraph +from dgl.contrib.sampling import LayerSampler +from dgl.data import register_data_args, load_data +import dgl.function as fn +from dgl.subgraph import DGLSubGraph + +class GCNLayer(gluon.Block): + def __init__(self, in_feats, out_feats, activation, dropout=0): + super(GCNLayer, self).__init__() + self.dropout = dropout + with self.name_scope(): + self.dense = mx.gluon.nn.Dense(out_feats, activation) + + def forward(self, sub_g, src, dst): + if self.dropout > 0: + sub_g.apply_nodes(lambda nodes: {'h' : nd.Dropout(nodes.data['h'], + p=self.dropout)}) + # TODO normalization + if src is None: + sub_g.update_all(fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='h')) + else: + sub_g.send_and_recv((src, dst), + fn.copy_src(src='h', out='m'), + fn.sum(msg='m', out='h')) + sub_g.ndata['h'] = self.dense(sub_g.ndata['h']) + +class GCN(gluon.Block): + def __init__(self, in_feats, n_hidden, n_classes, n_layers, + activation, dropout, normalization): + super(GCN, self).__init__() + self.n_layers = n_layers + self.layers = gluon.nn.Sequential() + # input layer + 
self.layers.add(GCNLayer(in_feats, n_hidden, activation, 0.)) + # hidden layers + for i in range(n_layers - 1): + self.layers.add(GCNLayer(n_hidden, n_hidden, activation, dropout)) + # output layer + self.dense = mx.gluon.nn.Dense(n_classes) + + def forward(self, sub_g): + sub_g.ndata['h'] = sub_g.ndata['x'] + if isinstance(sub_g, DGLSubGraph): + n = sub_g.number_of_nodes() + nid = np.arange(n) + src, dst, eid = sub_g.edge_ids(nid, nid) + src = src.asnumpy() + dst = dst.asnumpy() + eid = eid.asnumpy() + for i, layer in enumerate(self.layers): + mask = eid == i + src = src[mask] + dst = dst[mask] + h = sub_g.ndata['h'] + sample_prob = sub_g.sample_prob.asnumpy() + p = np.expand_dims(np.where(np.isin(nid, src), sample_prob, np.ones(n)), axis=1) + sub_g.ndata['h'] = h * nd.array(p).as_in_context(h.context) + layer(sub_g, src, dst) + else: + for layer in self.layers: + layer(sub_g, None, None) + return self.dense(sub_g.pop_n_repr('h')) + +def evaluate(model, g): + y = g.ndata['y'] + y_bar = nd.argmax(model(g), axis=1) + mask = g.ndata['val_mask'] + accuracy = nd.sum(mask * (y == y_bar)) / nd.sum(mask) + return accuracy.asscalar() + +def main(args): + # load and preprocess dataset + data = load_data(args) + if args.self_loop: + data.graph.add_edges_from([(i, i) for i in range(len(data.graph))]) + n_nodes = data.graph.number_of_nodes() + n_edges = data.graph.number_of_edges() + features = nd.array(data.features) + in_feats = features.shape[1] + labels = nd.array(data.labels) + n_classes = data.num_labels + train_mask = nd.array(data.train_mask) + val_mask = nd.array(data.val_mask) + test_mask = nd.array(data.test_mask) + print("""-----Data statistics----- + # Nodes %d + # Edges %d + # Features %d + # Classes %d + # Train samples %d + # Val samples %d + # Test samples %d""" % (n_nodes, n_edges, in_feats, n_classes, + train_mask.sum().asscalar(), + val_mask.sum().asscalar(), + test_mask.sum().asscalar())) + + train_nid = 
np.arange(n_nodes)[data.train_mask.astype(bool)].tolist() + + ctx = mx.cpu(0) if args.gpu < 0 else mx.gpu(args.gpu) + features = features.as_in_context(ctx) + labels = labels.as_in_context(ctx) + train_mask = train_mask.as_in_context(ctx) + val_mask = val_mask.as_in_context(ctx) + test_mask = test_mask.as_in_context(ctx) + + g = DGLGraph(data.graph, readonly=True) + g.ndata['x'] = features + g.ndata['y'] = labels + g.ndata['train_mask'] = train_mask + g.ndata['val_mask'] = val_mask + g.ndata['test_mask'] = test_mask + deg = g.in_degrees().astype('float32').as_in_context(ctx) + g.ndata['normalizer'] = nd.expand_dims(nd.power(deg, -0.5), 1) + assert not g.is_multigraph + + model = GCN(in_feats, args.n_hidden, n_classes, args.n_layers, + 'relu', args.dropout, args.normalization) + model.initialize(ctx=ctx) + print(model.collect_params()) + + trainer = gluon.Trainer(model.collect_params(), 'adam', + {'learning_rate': args.lr, 'wd': args.weight_decay}) + + def sampler(): + for x in LayerSampler(g, 1000000, args.layer_size, args.n_layers, + neighbor_type='in', seed_nodes=train_nid, + return_prob=True): + yield x + + dur = [] + for epoch in range(args.n_epochs): + t0 = time.time() + + sub_g, _ = next(sampler()) + sub_g.copy_from_parent() + +# print(sub_g.number_of_nodes(), sub_g.number_of_edges()) + + with mx.autograd.record(): + y = sub_g.ndata['y'] + y_bar = model(sub_g) + loss = nd.mean(gluon.loss.SoftmaxCELoss()(y_bar, y)) + + loss.backward() + trainer.step(batch_size=1) + + dur.append(time.time() - t0) + acc = evaluate(model, g) + print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " + "ETputs(KTEPS) {:.2f}".format( + epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000)) + + acc = evaluate(model, g) + print("Test accuracy {:.2%}".format(acc)) + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='GCN') + register_data_args(parser) + parser.add_argument("--dropout", type=float, default=0.5, + help="dropout 
probability") + parser.add_argument("--gpu", type=int, default=-1, + help="gpu") + parser.add_argument("--lr", type=float, default=3e-2, + help="learning rate") + parser.add_argument("--n-epochs", type=int, default=200, + help="number of training epochs") + parser.add_argument("--n-hidden", type=int, default=16, + help="number of hidden gcn units") + parser.add_argument("--n-layers", type=int, default=1, + help="number of hidden gcn layers") + parser.add_argument("--layer-size", type=int, default=128, + help="number of neighbors to be sampled") + parser.add_argument("--normalization", + choices=['sym'], default=None, + help="graph normalization types (default=None)") + parser.add_argument("--self-loop", action='store_true', + help="graph self-loop (default=False)") + parser.add_argument("--weight-decay", type=float, default=5e-4, + help="Weight for L2 loss") + args = parser.parse_args() + + print(args) + + main(args) diff --git a/examples/pytorch/gcn/gcn_ls.py b/examples/pytorch/gcn/gcn_ls.py deleted file mode 100644 index 760f33176b78..000000000000 --- a/examples/pytorch/gcn/gcn_ls.py +++ /dev/null @@ -1,281 +0,0 @@ -''' -Adapted from Ziyue's code. 
-''' - - -import argparse, time, math -import numpy as np -import mxnet as mx -from mxnet import gluon -import dgl -from dgl import DGLGraph -import dgl.function as fn -from dgl.data import register_data_args, load_data -import scipy as sp -from dgl import utils - - -def generate_rand_graph(n): - arr = (sp.sparse.random(n, n, density=0.1, format='coo') != 0).astype(np.int64) - return dgl.DGLGraph(arr, readonly=True) - - -def sample_subgraph(g, seed_nodes, n_layers, layer_size): - induced_nodes = [] - seeds = seed_nodes - parent_uv_edges_per_hop = [] - for _ in range(n_layers): - for subg, aux in dgl.contrib.sampling.LayerSampler(g, 1000000, layer_size, - neighbor_type='in', - seed_nodes=np.array(seeds), - return_seed_id=True): - #seed_ids = aux['seeds'] - #print(g.in_edges(seeds, form='all')) - subg_src, subg_dst = subg.edges() - parent_nid = subg.parent_nid - src = parent_nid[subg_src] - dst = parent_nid[subg_dst] - parent_uv_edges_per_hop.append((src.asnumpy(), dst.asnumpy())) - #print((src, dst)) - seeds = list(np.unique(src.asnumpy())) - induced_nodes.extend(list(parent_nid.asnumpy())) - - subgraph = g.subgraph(list(np.unique(np.array(induced_nodes)))) - #print(subgraph.parent_nid) - #print(parent_uv_edges_per_hop) - subg_uv_edges_per_hop = [(subgraph.map_to_subgraph_nid(src).asnumpy(), - subgraph.map_to_subgraph_nid(dst).asnumpy()) - for src, dst in parent_uv_edges_per_hop] - #print(subg_uv_edges_per_hop) - return subgraph, subg_uv_edges_per_hop - - -def test_sample(): - g = generate_rand_graph(100) - n_layers = 3 - seeds = [10, 20] - layer_size = 2 - subgraph, subg_uv_edges_per_hop = sample_subgraph(g, seeds, n_layers, layer_size) - - -class GCNLayer(gluon.Block): - def __init__(self, - in_feats, - out_feats, - activation, - dropout, - bias=True): - super(GCNLayer, self).__init__() - with self.name_scope(): - stdv = 1. 
/ math.sqrt(out_feats) - self.weight = self.params.get('weight', shape=(in_feats, out_feats), - init=mx.init.Uniform(stdv)) - if bias: - self.bias = self.params.get('bias', shape=(out_feats,), - init=mx.init.Uniform(stdv)) - else: - self.bias = None - self.activation = activation - self.dropout = dropout - - def forward(self, h, subg, subg_edges): - if self.dropout: - h = mx.nd.Dropout(h, p=self.dropout) - h = mx.nd.dot(h, self.weight.data(h.context)) - # TODO: the normalization term need to be tuned. - # temporarilly normalized by sqrt(layer_size). - # for an unbiased estimator, it should be normalized by n(v)/D(v), - # but might be bad in practice - h = h / math.sqrt(3.) - subg.ndata['h'] = h - if subg_edges is not None: - subg.send_and_recv(subg_edges, fn.copy_src(src='h', out='m'), - fn.sum(msg='m', out='h')) - else: - subg.update_all(fn.copy_src(src='h', out='m'), - fn.sum(msg='m', out='h')) - h = subg.ndata.pop('h') - # same as TODO above - h = h / math.sqrt(3.) - # bias - if self.bias is not None: - h = h + self.bias.data(h.context) - if self.activation: - h = self.activation(h) - return h - - -class GCN(gluon.Block): - def __init__(self, - in_feats, - n_hidden, - n_classes, - n_layers, - activation, - dropout, - normalization): - super(GCN, self).__init__() - self.n_layers = n_layers - self.layers = gluon.nn.Sequential() - # input layer - self.layers.add(GCNLayer(in_feats, n_hidden, activation, 0.)) - # hidden layers - for i in range(n_layers - 1): - self.layers.add(GCNLayer(n_hidden, n_hidden, activation, dropout)) - # output layer - self.layers.add(GCNLayer(n_hidden, n_classes, None, dropout)) - - - def forward(self, subg, subg_edges_per_hop): - h = subg.ndata['in'] - - for i, layer in enumerate(self.layers): - h = layer(h, subg, subg_edges_per_hop[self.n_layers-i]) - - return h - - -def evaluate(model, g, n_layers, labels, mask): - pred = model(g, [None for i in range(n_layers)]).argmax(axis=1) - accuracy = ((pred == labels) * mask).sum() / 
mask.sum().asscalar() - return accuracy.asscalar() - - -def main(args): - # load and preprocess dataset - data = load_data(args) - - if args.self_loop: - data.graph.add_edges_from([(i,i) for i in range(len(data.graph))]) - - features = mx.nd.array(data.features) - labels = mx.nd.array(data.labels) - train_mask = mx.nd.array(data.train_mask) - val_mask = mx.nd.array(data.val_mask) - test_mask = mx.nd.array(data.test_mask) - in_feats = features.shape[1] - n_classes = data.num_labels - n_nodes = data.graph.number_of_nodes() - n_edges = data.graph.number_of_edges() - print("""----Data statistics------' - #Nodes %d - #Edges %d - #Classes %d - #Train samples %d - #Val samples %d - #Test samples %d""" % - (n_nodes, n_edges, n_classes, - train_mask.sum().asscalar(), - val_mask.sum().asscalar(), - test_mask.sum().asscalar())) - - if args.gpu < 0: - cuda = False - ctx = mx.cpu(0) - else: - cuda = True - ctx = mx.gpu(args.gpu) - - features = features.as_in_context(ctx) - labels = labels.as_in_context(ctx) - train_mask = train_mask.as_in_context(ctx) - val_mask = val_mask.as_in_context(ctx) - test_mask = test_mask.as_in_context(ctx) - - layer_size = args.layer_size - # create GCN model - g = DGLGraph(data.graph, readonly=True) - # normalization - degs = g.in_degrees().astype('float32') - degs[degs > layer_size] = layer_size - norm = mx.nd.power(degs, -0.5) - if cuda: - norm = norm.as_in_context(ctx) - g.ndata['norm'] = mx.nd.expand_dims(norm, 1) - g.ndata['in'] = features - - num_data = len(train_mask.asnumpy()) - full_idx = np.arange(0, num_data) - train_idx = full_idx[train_mask.asnumpy() == 1] - - seed_nodes = list(train_idx) - n_layers = args.n_layers + 1 - - model = GCN(in_feats, - args.n_hidden, - n_classes, - args.n_layers, - mx.nd.relu, - args.dropout, - args.normalization) - model.initialize(ctx=ctx) - n_train_samples = train_mask.sum().asscalar() - loss_fcn = gluon.loss.SoftmaxCELoss() - - # use optimizer - print(model.collect_params()) - trainer = 
gluon.Trainer(model.collect_params(), 'adam', - {'learning_rate': args.lr, 'wd': args.weight_decay}) - - # initialize graph - dur = [] - for epoch in range(args.n_epochs): - if epoch >= 3: - t0 = time.time() - - subg, subg_edges_per_hop = sample_subgraph(g, seed_nodes, n_layers, layer_size) - subg_train_mask = subg.map_to_subgraph_nid(train_idx) - subg.copy_from_parent() - - # forward - smask = np.zeros((len(subg.parent_nid),)) - smask[subg_train_mask.asnumpy()] = 1 - - with mx.autograd.record(): - pred = model(subg, subg_edges_per_hop) - loss = loss_fcn(pred, labels[subg.parent_nid], mx.nd.expand_dims(mx.nd.array(smask), 1)) - loss = loss.sum() / n_train_samples - - print(loss.asnumpy()) - loss.backward() - trainer.step(batch_size=1) - - if epoch >= 3: - dur.append(time.time() - t0) - acc = evaluate(model, g, n_layers, labels, val_mask) - print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " - "ETputs(KTEPS) {:.2f}". format( - epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000)) - - acc = evaluate(model, g, n_layers, labels, test_mask) - print("Test accuracy {:.2%}".format(acc)) - -if __name__ == '__main__': - parser = argparse.ArgumentParser(description='GCN') - register_data_args(parser) - parser.add_argument("--dropout", type=float, default=0.5, - help="dropout probability") - parser.add_argument("--gpu", type=int, default=-1, - help="gpu") - parser.add_argument("--lr", type=float, default=3e-2, - help="learning rate") - parser.add_argument("--n-epochs", type=int, default=200, - help="number of training epochs") - parser.add_argument("--n-hidden", type=int, default=16, - help="number of hidden gcn units") - parser.add_argument("--n-layers", type=int, default=1, - help="number of hidden gcn layers") - parser.add_argument("--layer-size", type=int, default=128, - help="number of neighbors to be sampled") - parser.add_argument("--normalization", - choices=['sym','left'], default=None, - help="graph normalization types 
(default=None)") - parser.add_argument("--self-loop", action='store_true', - help="graph self-loop (default=False)") - parser.add_argument("--weight-decay", type=float, default=5e-4, - help="Weight for L2 loss") - args = parser.parse_args() - - print(args) - - main(args) diff --git a/include/dgl/immutable_graph.h b/include/dgl/immutable_graph.h index 4693936d1ded..5a5b55a4324b 100644 --- a/include/dgl/immutable_graph.h +++ b/include/dgl/immutable_graph.h @@ -14,6 +14,10 @@ #include "runtime/ndarray.h" #include "graph_interface.h" +// TODO remove +#include +#include + namespace dgl { /*! diff --git a/python/dgl/contrib/sampling/sampler.py b/python/dgl/contrib/sampling/sampler.py index dc93d5710361..09abf6f381ee 100644 --- a/python/dgl/contrib/sampling/sampler.py +++ b/python/dgl/contrib/sampling/sampler.py @@ -82,7 +82,7 @@ def __next__(self): class LSSubgraphLoader(object): def __init__(self, g, batch_size, layer_size, n_layers=1, neighbor_type='in', node_prob=None, seed_nodes=None, - shuffle=False, num_workers=1, return_seed_id=False): + shuffle=False, num_workers=1, return_seed_id=False, return_prob=False): self._g = g if not g._graph.is_readonly(): raise NotImplementedError("subgraph loader only support read-only graphs.") @@ -91,6 +91,7 @@ def __init__(self, g, batch_size, layer_size, n_layers=1, self._n_layers = n_layers self._node_prob = node_prob self._return_seed_id = return_seed_id + self._return_prob = return_prob if self._node_prob is not None: assert self._node_prob.shape[0] == g.number_of_nodes(), \ "We need to know the sampling probability of every node" @@ -117,11 +118,14 @@ def _prefetch(self): end = min((self._subgraph_idx + 1) * self._batch_size, num_nodes) seed_ids.append(utils.toindex(self._seed_nodes[start:end])) self._subgraph_idx += 1 - sgi = self._g._graph.neighbor_sampling(seed_ids, self._layer_size, - self._n_layers, self._neighbor_type, - self._node_prob) - subgraphs = [DGLSubGraph(self._g, i.induced_nodes, i.induced_edges, \ - i) for i 
in sgi] + sgi = self._g._graph.layer_sampling(seed_ids, self._layer_size, + self._n_layers, self._neighbor_type, + self._node_prob, self._return_prob) + subgraphs = [DGLSubGraph(self._g, i.induced_nodes, i.induced_edges, i) for i in sgi] + if self._return_prob: + for sg, i in zip(subgraphs, sgi): + setattr(sg, 'layer_ids', i.layer_ids) + setattr(sg, 'sample_prob', i.sample_prob) self._subgraphs.extend(subgraphs) if self._return_seed_id: self._seed_ids.extend(seed_ids) @@ -322,7 +326,7 @@ def NeighborSampler(g, batch_size, expand_factor, num_hops=1, def LayerSampler(g, batch_size, layer_size, n_layers=1, neighbor_type='in', node_prob=None, seed_nodes=None, shuffle=False, num_workers=1, - return_seed_id=False, prefetch=False): + return_seed_id=False, prefetch=False, return_prob=False): '''Create a sampler that samples neighborhood. This creates a subgraph data loader that samples subgraphs from the input graph @@ -376,7 +380,7 @@ def LayerSampler(g, batch_size, layer_size, n_layers=1, information about the subgraphs. 
''' loader = LSSubgraphLoader(g, batch_size, layer_size, n_layers, neighbor_type, node_prob, - seed_nodes, shuffle, num_workers, return_seed_id) + seed_nodes, shuffle, num_workers, return_seed_id, return_prob) if not prefetch: return loader else: diff --git a/python/dgl/graph_index.py b/python/dgl/graph_index.py index 903468c0ae92..8d56a44d8f55 100644 --- a/python/dgl/graph_index.py +++ b/python/dgl/graph_index.py @@ -678,7 +678,7 @@ def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type, no return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] - def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob): + def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob, return_prob=False): """Neighborhood sampling""" if len(seed_ids) == 0: return [] @@ -691,8 +691,15 @@ def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob): rst = _nonuniform_layer_sampling(self, node_prob, seed_ids, layer_type, n_layers, layer_size) - return [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), - utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] + ret = [SubgraphIndex(rst(i), self, utils.toindex(rst(num_subgs + i)), + utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] + if return_prob: + for i, si in enumerate(ret): + l = rst(num_subgs * 3 + i).to_dlpack() + setattr(si, 'layer_ids', F.unsqueeze(F.zerocopy_from_dlpack(l), 1)) + p = rst(num_subgs * 4 + i).to_dlpack() + setattr(si, 'sample_prob', F.zerocopy_from_dlpack(p)) + return ret def to_networkx(self): """Convert to networkx graph. 
diff --git a/src/graph/graph_apis.cc b/src/graph/graph_apis.cc index 4b9dbf6f5f12..2136c27a7e21 100644 --- a/src/graph/graph_apis.cc +++ b/src/graph/graph_apis.cc @@ -7,6 +7,7 @@ #include #include #include "../c_api_common.h" +#include using dgl::runtime::DGLArgs; using dgl::runtime::DGLArgValue; diff --git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index dd92cc2c9fe6..499d111a74f1 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -7,6 +7,8 @@ #include #include #include +#include +#include #ifdef _MSC_VER // rand in MS compiler works well in multi-threading. @@ -249,6 +251,7 @@ void ImmutableGraph::CSR::ReadAllEdges(std::vector *edges) const { (*edges)[indptr[i] + j] = e; } } + edges->resize(NumEdges()); } ImmutableGraph::CSR::Ptr ImmutableGraph::CSR::Transpose() const { @@ -992,6 +995,7 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, std::vector probabilities; std::vector edges; + std::queue sizes; std::unordered_set candidate_set; std::vector candidates; std::map n_times; @@ -999,116 +1003,102 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, size_t n_seeds = seed_array->shape[0]; const dgl_id_t* seed_data = static_cast(seed_array->data); candidate_set.insert(seed_data, seed_data + n_seeds); - std::copy(candidate_set.begin(), candidate_set.end(), nodes.begin()); - layer_ids.insert(layer_ids.end(), nodes.size(), 0); - probabilities.insert(probabilities.end(), nodes.size(), 0); + std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(nodes)); + sizes.push(nodes.size()); - std::vector positions {0, nodes.size()}; + size_t curr = 0; for (int i = 0; i != n_layers; i++) { candidate_set.clear(); - for (auto j = positions.end()[-2]; j != positions.back(); ++j) { + for (auto j = curr; j != sizes.back(); ++j) { auto src = nodes[j]; candidate_set.insert(indices + indptr[src], indices + indptr[src + 1]); } candidates.clear(); - std::copy(candidate_set.begin(), 
candidate_set.end(), candidates.begin()); + std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(candidates)); - auto n_candidates = candidates.size(); n_times.clear(); + auto n_candidates = candidates.size(); for (size_t j = 0; j != layer_size; ++j) { - auto dst = rand_r(&rand_seed) % n_candidates; + auto dst = candidates[rand_r(&rand_seed) % n_candidates]; if (!n_times.insert(std::make_pair(dst, 1)).second) { ++n_times[dst]; } } - auto n_nodes = static_cast(NumVertices()); for (auto const &pair : n_times) { nodes.push_back(pair.first); - layer_ids.push_back(i + 1); - probabilities.push_back(pair.second / n_nodes); + probabilities.push_back(pair.second * n_candidates / static_cast(layer_size)); } - positions.push_back(nodes.size()); + curr = sizes.back(); + sizes.push(nodes.size()); } - /* - std::vector idx(nodes.size()); - iota(idx.begin(), idx.end(), 0); - auto less = [&nodes](size_t i, size_t j) { - return nodes[i] < nodes[j]; - }; - sort(idx.begin(), idx.end(), less); - - for (size_t i = 0; i < n_nodes; i++) { - subg.induced_vertices[i] = nodes[idx[i]]; - subg.layer_ids[i] = node_info[i].first; - subg.sample_prob[i] = node_info[i].second; - } - */ - - std::vector ret_indices; - std::vector ret_eids; - std::vector new_indptr; - std::vector new_indices; - std::vector new_eids; - new_indptr.push_back(0); - for (size_t i = 0; i < positions.size() - 2; ++i) { - auto curr_begin = positions[i]; - auto curr_end = positions[i + 1]; - // auto curr_len = curr_end - curr_begin; - auto next_begin = curr_end; - auto next_end = positions[i + 2]; - auto next_len = next_end - next_begin; - HashTableChecker checker(&nodes[next_begin], next_len); - for (size_t j = curr_begin; j != curr_end; ++j) { - ret_indices.clear(); - ret_eids.clear(); + std::vector ret_idx; + std::vector ret_eid; + std::vector sub_indptr; + std::vector sub_indices; + std::vector sub_eids; + sub_indptr.push_back(0); + curr = 0; + int i = n_layers - 1; + while (sizes.size() > 1) { + auto 
next = sizes.front(); + sizes.pop(); + auto next_size = sizes.front() - next; + + HashTableChecker checker(nodes.data() + next, next_size); + for (size_t j = curr; j != next; ++j) { + ret_idx.clear(); + ret_eid.clear(); auto src = nodes[j]; - auto dis = indptr[src]; + auto d = indptr[src]; auto len = indptr[src + 1] - indptr[src]; - checker.CollectOnRow(indices + dis, eids + dis, len, &ret_indices, &ret_eids); - new_indptr.push_back(ret_indices.size()); - new_indices.insert(new_indices.end(), ret_indices.begin(), ret_indices.end()); - new_eids.insert(new_eids.end(), ret_eids.begin(), ret_eids.end()); + checker.CollectOnRow(indices + d, eids + d, len, &ret_idx, &ret_eid); + sub_indptr.push_back(sub_indptr.back() + ret_idx.size()); + for (auto k : ret_idx) { + sub_indices.push_back(nodes[next + k]); + layer_ids.push_back(); + } + std::copy(ret_eid.begin(), ret_eid.end(), std::back_inserter(sub_eids)); } + curr = next; } + sub_indptr.insert(sub_indptr.end(), nodes.size() - sub_indptr.size() + 1, sub_indptr.back()); int64_t n_nodes = nodes.size(); - int64_t n_edges = new_eids.size(); - auto subg_csr = std::make_shared(n_nodes, n_edges); - subg_csr->indptr.resize(n_nodes + 1); - subg_csr->indices.resize(n_edges); - subg_csr->edge_ids.resize(n_edges); - std::copy(new_indptr.begin(), new_indptr.end(), subg_csr->indptr.data()); - std::copy(new_indices.begin(), new_indices.end(), subg_csr->indices.data()); - std::copy(new_eids.begin(), new_eids.end(), subg_csr->edge_ids.data()); - - SampledSubgraph subg; - subg.induced_vertices = IdArray::Empty({n_nodes}, - DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.induced_edges = IdArray::Empty({n_edges}, - DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.layer_ids = IdArray::Empty({n_nodes}, - DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); - subg.sample_prob = runtime::NDArray::Empty({n_nodes}, - DLDataType{kDLFloat, 8 * sizeof(float), 1}, - DLContext{kDLCPU, 0}); + int64_t n_edges = sub_eids.size(); + auto 
sub_csr = std::make_shared(n_nodes, n_edges); + std::copy(sub_indptr.begin(), sub_indptr.end(), sub_csr->indptr.begin()); + std::copy(sub_indices.begin(), sub_indices.end(), std::back_inserter(sub_csr->indices)); + std::copy(sub_eids.begin(), sub_eids.end(), std::back_inserter(sub_csr->edge_ids)); + + SampledSubgraph sub_g; + sub_g.induced_vertices = IdArray::Empty({n_nodes}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + sub_g.induced_edges = IdArray::Empty({n_edges}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + sub_g.layer_ids = IdArray::Empty({n_nodes}, + DLDataType{kDLInt, 64, 1}, DLContext{kDLCPU, 0}); + sub_g.sample_prob = runtime::NDArray::Empty({n_nodes}, + DLDataType{kDLFloat, 8 * sizeof(float), 1}, + DLContext{kDLCPU, 0}); std::copy(nodes.begin(), nodes.end(), - static_cast(subg.induced_vertices->data)); + static_cast(sub_g.induced_vertices->data)); std::copy(layer_ids.begin(), layer_ids.end(), - static_cast(subg.layer_ids->data)); + static_cast(sub_g.layer_ids->data)); std::copy(probabilities.begin(), probabilities.end(), - static_cast(subg.sample_prob->data)); + static_cast(sub_g.sample_prob->data)); if (neigh_type == "in") - subg.graph = GraphPtr(new ImmutableGraph(subg_csr, nullptr, IsMultigraph())); + sub_g.graph = GraphPtr(new ImmutableGraph(sub_csr, nullptr, IsMultigraph())); else - subg.graph = GraphPtr(new ImmutableGraph(nullptr, subg_csr, IsMultigraph())); + sub_g.graph = GraphPtr(new ImmutableGraph(nullptr, sub_csr, IsMultigraph())); - return subg; + return sub_g; } + void CompactSubgraph(ImmutableGraph::CSR *subg, const std::unordered_map &id_map) { for (size_t i = 0; i < subg->indices.size(); i++) { From e3385db1de84f5ba6e06672488bffcaf875f496b Mon Sep 17 00:00:00 2001 From: Yu Gai Date: Sat, 9 Feb 2019 01:12:48 +0800 Subject: [PATCH 10/10] sampler checker --- examples/mxnet/gcn/gcn_ls.py | 71 +++++++++++++++------ include/dgl/immutable_graph.h | 1 + python/dgl/graph_index.py | 3 +- src/graph/immutable_graph.cc | 117 
+++++++++++++++++++++++----------- 4 files changed, 134 insertions(+), 58 deletions(-) diff --git a/examples/mxnet/gcn/gcn_ls.py b/examples/mxnet/gcn/gcn_ls.py index a63183f9e184..0a9b5f0d3f2e 100644 --- a/examples/mxnet/gcn/gcn_ls.py +++ b/examples/mxnet/gcn/gcn_ls.py @@ -1,5 +1,6 @@ import argparse, time import numpy as np +import numpy.random as npr import mxnet as mx import mxnet.ndarray as nd from mxnet import gluon @@ -10,6 +11,33 @@ import dgl.function as fn from dgl.subgraph import DGLSubGraph +def check(g, sub_g, n_layers, layer_size, train_nid): + n = sub_g.number_of_nodes() + m = sub_g.number_of_edges() +# print('# nodes: %d, # edges: %d, # seeds: %d' % (n, m, len(train_nid))) + nid = np.arange(n) + src, dst = sub_g.edges() + src = src.asnumpy() + dst = dst.asnumpy() + lid = sub_g.layer_ids.asnumpy() +# print('np.unique(lid)', np.unique(lid)) + for i in range(n_layers + 1): + nmask = lid == i + src_mask = np.isin(src, nid[nmask]) + dst_mask = np.isin(dst, nid[nmask]) + nn = np.sum(nmask) + mm_src = np.sum(src_mask) + mm_dst = np.sum(dst_mask) + assert nn <= layer_size + if i == 0: + assert mm_dst == 0 + if i == n_layers + 1: + assert mm_src == 0 +# print('[layer %d]# nodes: %d, # src: %d, # dst: %d' % (i, nn, mm_src, mm_dst)) + src_lid = lid[src] + dst_lid = lid[dst] + assert np.all(dst_lid - src_lid == 1) + class GCNLayer(gluon.Block): def __init__(self, in_feats, out_feats, activation, dropout=0): super(GCNLayer, self).__init__() @@ -19,16 +47,18 @@ def __init__(self, in_feats, out_feats, activation, dropout=0): def forward(self, sub_g, src, dst): if self.dropout > 0: - sub_g.apply_nodes(lambda nodes: {'h' : nd.Dropout(nodes.data['h'], - p=self.dropout)}) - # TODO normalization + dropout = lambda nodes: {'h' : nd.Dropout(nodes.data['h'], p=self.dropout)} + sub_g.apply_nodes(dropout) + # normalize = lambda nodes : {'h' : nodes.data['h'] * nodes.data['normalizer']} + # sub_g.apply_nodes(normalize) if src is None: sub_g.update_all(fn.copy_src(src='h', 
out='m'), fn.sum(msg='m', out='h')) else: sub_g.send_and_recv((src, dst), fn.copy_src(src='h', out='m'), fn.sum(msg='m', out='h')) - sub_g.ndata['h'] = self.dense(sub_g.ndata['h']) + # sub_g.apply_nodes(normalize) + sub_g.apply_nodes(lambda nodes : {'h' : self.dense(nodes.data['h'])}) class GCN(gluon.Block): def __init__(self, in_feats, n_hidden, n_classes, n_layers, @@ -49,28 +79,30 @@ def forward(self, sub_g): if isinstance(sub_g, DGLSubGraph): n = sub_g.number_of_nodes() nid = np.arange(n) - src, dst, eid = sub_g.edge_ids(nid, nid) + src, dst = sub_g.edges() src = src.asnumpy() dst = dst.asnumpy() - eid = eid.asnumpy() + layer_ids = sub_g.layer_ids.asnumpy() + sample_prob = sub_g.sample_prob.asnumpy() for i, layer in enumerate(self.layers): - mask = eid == i - src = src[mask] - dst = dst[mask] + nmask = layer_ids == i + emask = np.isin(src, nid[nmask]) + src = src[emask] + dst = dst[emask] h = sub_g.ndata['h'] - sample_prob = sub_g.sample_prob.asnumpy() - p = np.expand_dims(np.where(np.isin(nid, src), sample_prob, np.ones(n)), axis=1) - sub_g.ndata['h'] = h * nd.array(p).as_in_context(h.context) + p = np.expand_dims(np.where(nmask, sample_prob, np.ones(n)), axis=1) + sub_g.ndata['h'] = h +# sub_g.ndata['h'] = h * nd.array(p).as_in_context(h.context) layer(sub_g, src, dst) else: for layer in self.layers: layer(sub_g, None, None) return self.dense(sub_g.pop_n_repr('h')) -def evaluate(model, g): +def evaluate(model, g, val=False): y = g.ndata['y'] y_bar = nd.argmax(model(g), axis=1) - mask = g.ndata['val_mask'] + mask = g.ndata['val_mask'] if val else g.ndata['test_mask'] accuracy = nd.sum(mask * (y == y_bar)) / nd.sum(mask) return accuracy.asscalar() @@ -128,8 +160,9 @@ def main(args): {'learning_rate': args.lr, 'wd': args.weight_decay}) def sampler(): + seed_nodes = npr.choice(train_nid, 32, replace=False) for x in LayerSampler(g, 1000000, args.layer_size, args.n_layers, - neighbor_type='in', seed_nodes=train_nid, + neighbor_type='in', seed_nodes=seed_nodes, 
return_prob=True): yield x @@ -139,19 +172,19 @@ def sampler(): sub_g, _ = next(sampler()) sub_g.copy_from_parent() - -# print(sub_g.number_of_nodes(), sub_g.number_of_edges()) + # check(g, sub_g, args.n_layers, args.layer_size, train_nid) with mx.autograd.record(): y = sub_g.ndata['y'] y_bar = model(sub_g) - loss = nd.mean(gluon.loss.SoftmaxCELoss()(y_bar, y)) + mask = sub_g.layer_ids.as_in_context(y) == args.n_layers + loss = nd.sum(mask * gluon.loss.SoftmaxCELoss()(y_bar, y)) / nd.sum(mask) loss.backward() trainer.step(batch_size=1) dur.append(time.time() - t0) - acc = evaluate(model, g) + acc = evaluate(model, g, val=True) print("Epoch {:05d} | Time(s) {:.4f} | Loss {:.4f} | Accuracy {:.4f} | " "ETputs(KTEPS) {:.2f}".format( epoch, np.mean(dur), loss.asscalar(), acc, n_edges / np.mean(dur) / 1000)) diff --git a/include/dgl/immutable_graph.h b/include/dgl/immutable_graph.h index 5a5b55a4324b..7501af4a69ea 100644 --- a/include/dgl/immutable_graph.h +++ b/include/dgl/immutable_graph.h @@ -16,6 +16,7 @@ // TODO remove #include +#include #include namespace dgl { diff --git a/python/dgl/graph_index.py b/python/dgl/graph_index.py index 8d56a44d8f55..3c231ae6c26c 100644 --- a/python/dgl/graph_index.py +++ b/python/dgl/graph_index.py @@ -679,7 +679,6 @@ def neighbor_sampling(self, seed_ids, expand_factor, num_hops, neighbor_type, no utils.toindex(rst(num_subgs * 2 + i))) for i in range(num_subgs)] def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob, return_prob=False): - """Neighborhood sampling""" if len(seed_ids) == 0: return [] @@ -696,7 +695,7 @@ def layer_sampling(self, seed_ids, layer_size, n_layers, layer_type, node_prob, if return_prob: for i, si in enumerate(ret): l = rst(num_subgs * 3 + i).to_dlpack() - setattr(si, 'layer_ids', F.unsqueeze(F.zerocopy_from_dlpack(l), 1)) + setattr(si, 'layer_ids', F.zerocopy_from_dlpack(l)) p = rst(num_subgs * 4 + i).to_dlpack() setattr(si, 'sample_prob', F.zerocopy_from_dlpack(p)) return ret diff 
--git a/src/graph/immutable_graph.cc b/src/graph/immutable_graph.cc index 499d111a74f1..fd773ca3bdd8 100644 --- a/src/graph/immutable_graph.cc +++ b/src/graph/immutable_graph.cc @@ -991,82 +991,109 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, const dgl_id_t* eids = g_csr->edge_ids.data(); std::vector nodes; - std::vector layer_ids; - std::vector probabilities; std::vector edges; + std::vector layer_ids; + std::vector sample_prob; - std::queue sizes; + std::queue size_queue; std::unordered_set candidate_set; - std::vector candidates; - std::map n_times; + std::vector candidate_vector; + std::unordered_map n_map; size_t n_seeds = seed_array->shape[0]; const dgl_id_t* seed_data = static_cast(seed_array->data); candidate_set.insert(seed_data, seed_data + n_seeds); std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(nodes)); - sizes.push(nodes.size()); + layer_ids.insert(layer_ids.end(), nodes.size(), n_layers); + size_queue.push(nodes.size()); size_t curr = 0; - for (int i = 0; i != n_layers; i++) { + for (int i = 0; i != n_layers; ++i) { candidate_set.clear(); - for (auto j = curr; j != sizes.back(); ++j) { + for (auto j = curr; j != size_queue.back(); ++j) { auto src = nodes[j]; candidate_set.insert(indices + indptr[src], indices + indptr[src + 1]); } - candidates.clear(); - std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(candidates)); + candidate_vector.clear(); + std::copy(candidate_set.begin(), candidate_set.end(), std::back_inserter(candidate_vector)); - n_times.clear(); - auto n_candidates = candidates.size(); + n_map.clear(); + auto n_candidates = candidate_vector.size(); for (size_t j = 0; j != layer_size; ++j) { - auto dst = candidates[rand_r(&rand_seed) % n_candidates]; - if (!n_times.insert(std::make_pair(dst, 1)).second) { - ++n_times[dst]; + auto dst = candidate_vector[rand_r(&rand_seed) % n_candidates]; + if (!n_map.insert(std::make_pair(dst, 1)).second) { + ++n_map[dst]; } } - for (auto 
const &pair : n_times) { + layer_ids.insert(layer_ids.end(), n_map.size(), n_layers - i - 1); + for (auto const &pair : n_map) { nodes.push_back(pair.first); - probabilities.push_back(pair.second * n_candidates / static_cast(layer_size)); + float sp = pair.second * n_candidates / static_cast(layer_size); + sample_prob.push_back(sp); } - curr = sizes.back(); - sizes.push(nodes.size()); + curr = size_queue.back(); + size_queue.push(nodes.size()); } - std::vector ret_idx; - std::vector ret_eid; - std::vector sub_indptr; + std::vector sub_indptr = {0}; std::vector sub_indices; std::vector sub_eids; - sub_indptr.push_back(0); curr = 0; - int i = n_layers - 1; - while (sizes.size() > 1) { - auto next = sizes.front(); - sizes.pop(); - auto next_size = sizes.front() - next; - - HashTableChecker checker(nodes.data() + next, next_size); - for (size_t j = curr; j != next; ++j) { - ret_idx.clear(); - ret_eid.clear(); + while (size_queue.size() > 1) { + auto next = size_queue.front(); + size_queue.pop(); + auto next_size = size_queue.front() - next; + + // HashTableChecker checker(nodes.data() + next, next_size); + std::unordered_map checker; + for (dgl_id_t j = next; j < size_queue.front(); ++j) { + checker.insert(std::make_pair(nodes[j], j)); + } + // std::cout << __FILE__ << ' ' << __LINE__ << ' ' << checker.size() << std::endl; + for (size_t j = curr; j < next; ++j) { + auto src = nodes[j]; + for (int64_t k = indptr[src]; k < indptr[src + 1]; ++k) { + auto iterator = checker.find(indices[k]); + if (iterator != checker.end()) { + // std::cout << __FILE__ << ' ' << __LINE__ << ' ' << indices[k] << std::endl; + sub_indices.push_back(iterator->second); + sub_eids.push_back(eids[k]); + } + } + sub_indptr.push_back(sub_indices.size()); + /* auto src = nodes[j]; auto d = indptr[src]; auto len = indptr[src + 1] - indptr[src]; + std::vector ret_idx; + std::vector ret_eid; checker.CollectOnRow(indices + d, eids + d, len, &ret_idx, &ret_eid); sub_indptr.push_back(sub_indptr.back() + 
ret_idx.size()); - for (auto k : ret_idx) { + for (const auto &k : ret_idx) { sub_indices.push_back(nodes[next + k]); - layer_ids.push_back(); } std::copy(ret_eid.begin(), ret_eid.end(), std::back_inserter(sub_eids)); + + // CHECK + for (const auto &k : ret_idx) { + CHECK(layer_ids[next + k] + 1 == layer_ids[j]) + << layer_ids[next + k] << ' ' << layer_ids[j]; + } + */ } curr = next; } sub_indptr.insert(sub_indptr.end(), nodes.size() - sub_indptr.size() + 1, sub_indptr.back()); + /* + std::cout << __FILE__ << ' ' << __LINE__ << ' ' + << *std::min_element(sub_indptr.begin(), sub_indptr.end()) << ' ' + << *std::max_element(sub_indptr.begin(), sub_indptr.end()) << std::endl; + */ + int64_t n_nodes = nodes.size(); int64_t n_edges = sub_eids.size(); auto sub_csr = std::make_shared(n_nodes, n_edges); @@ -1088,7 +1115,7 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, static_cast(sub_g.induced_vertices->data)); std::copy(layer_ids.begin(), layer_ids.end(), static_cast(sub_g.layer_ids->data)); - std::copy(probabilities.begin(), probabilities.end(), + std::copy(sample_prob.begin(), sample_prob.end(), static_cast(sub_g.sample_prob->data)); if (neigh_type == "in") @@ -1096,6 +1123,22 @@ SampledSubgraph ImmutableGraph::LayerSample(IdArray seed_array, else sub_g.graph = GraphPtr(new ImmutableGraph(nullptr, sub_csr, IsMultigraph())); + // CHECK + for (size_t i = 0; i < sub_indptr.size() - 1; ++i) { + for (size_t j = sub_indptr[i]; j < sub_indptr[i + 1]; ++j) { + /* + CHECK(layer_ids[i] - 1 == layer_ids[old_nid2new_nid[sub_indices[j]]]) + << layer_ids[i] << ' ' << layer_ids[old_nid2new_nid[sub_indices[j]]]; + */ + } + } + + auto p = static_cast(sub_g.sample_prob->data); + std::cout << __FILE__ << ' ' << __LINE__ << ' ' + << *std::min_element(p, p + sample_prob.size()) << ' ' + << *std::max_element(p, p + sample_prob.size()) << ' ' + << n_nodes << ' ' << sub_g.sample_prob->shape[0] << std::endl; + return sub_g; } @@ -1141,7 +1184,7 @@ SampledSubgraph 
ImmutableGraph::LayerUniformSample(IdArray seeds, neigh_type, n_layers, layer_size); - std::static_pointer_cast(ret.graph)->CompactSubgraph(ret.induced_vertices); + // std::static_pointer_cast(ret.graph)->CompactSubgraph(ret.induced_vertices); return ret; }