From 4c71970057e11d969513fa28161cbc05511e4a68 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 19 Feb 2020 11:15:00 -0600 Subject: [PATCH 001/660] add libdeepgalois and lonestargnn --- CMakeLists.txt | 5 + libdeepgalois/CMakeLists.txt | 23 + libdeepgalois/gnn.h | 31 ++ libdeepgalois/layers.h | 8 + libdeepgalois/layers/arithmetic_layer.h | 22 + libdeepgalois/layers/graph_conv_layer.h | 186 ++++++++ libdeepgalois/layers/layer.h | 156 +++++++ libdeepgalois/layers/linear_layer.h | 28 ++ libdeepgalois/layers/relu_layer.h | 24 ++ libdeepgalois/layers/softmax_loss_layer.h | 47 ++ libdeepgalois/lgraph.h | 179 ++++++++ libdeepgalois/math_functions.hpp | 500 ++++++++++++++++++++++ libdeepgalois/net.h | 341 +++++++++++++++ libdeepgalois/node.h | 109 +++++ libdeepgalois/optimizer.h | 221 ++++++++++ libdeepgalois/random.h | 63 +++ libdeepgalois/timer.h | 21 + libdeepgalois/types.h | 34 ++ libdeepgalois/utils.h | 119 +++++ lonestargnn/CMakeLists.txt | 8 + lonestargnn/README.md | 60 +++ lonestargnn/gcn/CMakeLists.txt | 16 + lonestargnn/gcn/gcn.cpp | 47 ++ lonestargnn/graphsage/gs-mean.cpp | 41 ++ lonestargnn/lonestargnn.h | 50 +++ lonestargnn/run-citeseer.sh | 1 + 26 files changed, 2340 insertions(+) create mode 100644 libdeepgalois/CMakeLists.txt create mode 100644 libdeepgalois/gnn.h create mode 100644 libdeepgalois/layers.h create mode 100644 libdeepgalois/layers/arithmetic_layer.h create mode 100644 libdeepgalois/layers/graph_conv_layer.h create mode 100644 libdeepgalois/layers/layer.h create mode 100644 libdeepgalois/layers/linear_layer.h create mode 100644 libdeepgalois/layers/relu_layer.h create mode 100644 libdeepgalois/layers/softmax_loss_layer.h create mode 100644 libdeepgalois/lgraph.h create mode 100644 libdeepgalois/math_functions.hpp create mode 100644 libdeepgalois/net.h create mode 100644 libdeepgalois/node.h create mode 100644 libdeepgalois/optimizer.h create mode 100644 libdeepgalois/random.h create mode 100644 libdeepgalois/timer.h create mode 100644 libdeepgalois/types.h create mode 100644 libdeepgalois/utils.h create mode 100644 lonestargnn/CMakeLists.txt create mode 100644 lonestargnn/README.md create mode 100644 lonestargnn/gcn/CMakeLists.txt create mode 100644 lonestargnn/gcn/gcn.cpp create mode 100644 lonestargnn/graphsage/gs-mean.cpp create mode 100644 lonestargnn/lonestargnn.h create mode 100755 lonestargnn/run-citeseer.sh diff --git a/CMakeLists.txt b/CMakeLists.txt index 40a7a7fb7b..8ce9f7f3a9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,6 +45,7 @@ set(USE_SANITIZER OFF CACHE BOOL "Use address and memory sanatizer") set(INSTALL_APPS OFF CACHE BOOL "Install apps as well as library") set(SKIP_COMPILE_APPS OFF CACHE BOOL "Skip compilation of applications using Galois library") set(GRAPH_LOCATION "" CACHE PATH "Location of inputs for tests if downloaded/stored separately.") +set(USE_DEEPGALOIS OFF CACHE BOOL "Install gnn apps as well as the DeepGalois library") if(WIN32 AND NOT CYGWIN) set(DEFAULT_INSTALL_CMAKE_DIR "${CMAKE_INSTALL_PREFIX}/CMake") @@ -514,6 +515,10 @@ endfunction() add_subdirectory(libllvm) add_subdirectory(libgalois) +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) + add_subdirectory(lonestargnn) +endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) add_subdirectory(libcusp) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt new file mode 100644 index 0000000000..8caa65ebc9 --- /dev/null +++ b/libdeepgalois/CMakeLists.txt @@ -0,0 +1,23 @@ +set(sources + $ +# $ + $ +) + +add_library(deepgalois STATIC 
${sources}) + +target_link_libraries(deepgalois galois_shmem galois_dist_async gllvm) +target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) + +target_include_directories(deepgalois PUBLIC + ${CMAKE_SOURCE_DIR}/libllvm/include + ${CMAKE_SOURCE_DIR}/libgalois/include + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libdeepgalios/include + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +set_target_properties (deepgalois PROPERTIES + INTERFACE_POSITION_INDEPENDENT_CODE On + POSITION_INDEPENDENT_CODE On +) diff --git a/libdeepgalois/gnn.h b/libdeepgalois/gnn.h new file mode 100644 index 0000000000..d2d2bafb28 --- /dev/null +++ b/libdeepgalois/gnn.h @@ -0,0 +1,31 @@ +#ifndef _GNN_H_ +#define _GNN_H_ + +#include "galois/Galois.h" +#include "galois/Reduction.h" +#include "galois/Timer.h" +#include "galois/ParallelSTL.h" +#include "llvm/Support/CommandLine.h" +#include "galois/runtime/Profile.h" +#include + +namespace cll = llvm::cl; +static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph +static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); +static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +#define CHUNK_SIZE 256 + +#include "types.h" +#include "utils.h" +#include "net.h" + +#endif diff --git a/libdeepgalois/layers.h b/libdeepgalois/layers.h new file mode 100644 index 0000000000..9650e931a9 --- /dev/null +++ b/libdeepgalois/layers.h @@ -0,0 +1,8 @@ +#ifndef _LAYERS_H_ +#define _LAYERS_H_ +#include "layers/relu_layer.h" +#include "layers/linear_layer.h" +#include "layers/arithmetic_layer.h" +#include "layers/graph_conv_layer.h" +#include "layers/softmax_loss_layer.h" +#endif diff --git a/libdeepgalois/layers/arithmetic_layer.h b/libdeepgalois/layers/arithmetic_layer.h new file mode 100644 index 0000000000..aed91e0379 --- /dev/null +++ b/libdeepgalois/layers/arithmetic_layer.h @@ -0,0 +1,22 @@ +#pragma once +#include "layer.h" + +// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` +class elementwise_add_layer : public layer { +public: + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { return std::string("elementwise_add"); } + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override { + in_grad = out_grad; + } +}; diff --git a/libdeepgalois/layers/graph_conv_layer.h b/libdeepgalois/layers/graph_conv_layer.h new file mode 100644 index 0000000000..b81f7bc10e --- /dev/null +++ b/libdeepgalois/layers/graph_conv_layer.h @@ -0,0 +1,186 @@ +#pragma once +#include "layer.h" + +/* GraphConv Layer + Parameters + ---------- + x: int, number of samples. + y: int, Input feature size. + z: int, Output feature size. + dropout: bool, optional, if True, a dropout operation is applied before other operations. + norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. + bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. + activation: callable activation function/layer or None, optional + If not None, applies an activation function to the updated node features. Default: ``None``. +*/ +class graph_conv_layer: public layer { +public: + graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, + std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; + init(); + } + void init() { + std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); + // randomly initialize trainable parameters for conv layers + rand_init_matrix(y, z, W); + //rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + alloc_grad(); + if (dropout_) { + dropout_mask.resize(x); + for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); + } + in_temp.resize(x*y); + //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); + out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); + trans_data.resize(y*x); // y*x + //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); + if (norm_) norm_factor_counting(); + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} + ~graph_conv_layer() {} + std::string layer_type() const override { return std::string("graph_conv"); } + + // user-defined aggregate function + void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } + + // user-defined combine function + void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(mat_v, self, a); + mvmul(mat_u, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors + } + + void set_context(net_phase ctx) override { phase_ = ctx; } + + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + // input: x*y; W: y*z; output: x*z + // if y > z: + // mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + //Timer t_matmul, t_agg, t_dropout; + //t_matmul.Start(); + if (dropout_ && phase_ == net_phase::train) { + //t_dropout.Start(); + //for (size_t i = 0; i < x; ++i) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); + }, galois::loopname("dropout")); + //t_dropout.Stop(); + matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z + } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z + //t_matmul.Stop(); + //t_agg.Start(); + aggregate(graph, out_temp, out_data); // aggregate + //t_agg.Stop(); + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(out_data[i], out_data[i]); + }, galois::loopname("relu")); + } + //double dropout_time = 0; + //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); + //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() + // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; + } + + // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { + if (act_) { + //for (size_t j = 0; j < z; ++j) + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + for (size_t j = 0; j < z; ++j) + //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; + out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::loopname("d_relu")); + //} else out_temp = out_grad; // TODO: avoid copying + } else copy2D1D(out_grad, out_temp); + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z*y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); + }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); + } + } + + // calculate weight gradients + transpose2D1D(in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z + } + + void degree_counting() { + assert(x == graph->size()); + degrees.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); + }, galois::loopname("DegreeCounting")); + } + + // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v + void norm_factor_counting() { + degree_counting(); + norm_factor.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); + } + +private: + Graph *graph; + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + net_phase phase_; + size_t x; + size_t y; + size_t z; + vec_t out_temp; + vec_t in_temp; + vec_t trans_data; // y*x + std::vector degrees; + std::vector norm_factor; // normalization constant based on graph structure + std::vector > dropout_mask; + + // Glorot & Bengio (AISTATS 2010) init + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { + auto init_range = sqrt(6.0/(dim_x + dim_y)); + std::default_random_engine rng; + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i*dim_y+j] = dist(rng); + } + } + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i*dim_y+j] = 0; + } + } +}; diff --git a/libdeepgalois/layers/layer.h b/libdeepgalois/layers/layer.h new file mode 100644 index 0000000000..4a8a545738 --- /dev/null +++ b/libdeepgalois/layers/layer.h @@ -0,0 +1,156 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "../node.h" +#include "../types.h" +#include "../utils.h" +#include "../optimizer.h" +#include "../math_functions.hpp" +/** + * base class of all kind of NN layers + * + * sub-class should override these methods: + * - forward_propagation ... body of forward-pass calculation + * - back_propagation ... body of backward-pass calculation + * - in_shape ... specify input data shapes + * - out_shape ... specify output data shapes + * - layer_type ... 
name of layer + **/ + +class layer : public node { +public: + layer(unsigned level, std::vector in_dims, std::vector out_dims) : + node(in_dims.size(), out_dims.size()), + level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims) { add_edge(); } + virtual ~layer() = default; + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual std::string layer_type() const = 0; + virtual void set_context(net_phase ctx) {} + //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; + + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() + << " input[" << input_dims[0] << "," << input_dims[1] + << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; + } + void set_in_data(tensor_t data) { + prev_ = std::make_shared(this, input_dims[1]); + prev_->get_data() = data; + prev_->get_gradient().resize(input_dims[0]); + // allocate memory for intermediate gradients + //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; + for (size_t i = 0; i < input_dims[0]; ++i) + prev_->get_gradient()[i].resize(input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[1]); + // allocate memory for intermediate feature vectors + next_->get_data().resize(output_dims[0]); + for (size_t i = 0; i < output_dims[0]; ++i) + next_->get_data()[i].resize(output_dims[1]); + } + void alloc_grad() { + // allocate memory for intermediate gradients + //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; + next_->get_gradient().resize(output_dims[0]); + for (size_t i = 0; i < output_dims[0]; ++i) + next_->get_gradient()[i].resize(output_dims[1]); + } + void forward() { + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer *opt) { + //std::cout << "[debug] " << name_ << ": updating weight...\n"; + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
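+    // A sketch of what opt->update(weight_grad, W, parallel) does below (see
+    // optimizer.h): e.g. for adagrad, g[i] += dW[i]*dW[i] and then
+    // W[i] -= alpha * dW[i] / (sqrt(g[i]) + eps); the adaptive optimizers keep
+    // this per-weight state in a map keyed by &W.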
+ bool parallel = (W.size() >= 512); + //vec_t diff; + //prev()->merge_grads(&diff); + //auto in_data = prev()->get_data(); + //float_t rcp_batch_size = float_t(1.0) / in_data.size(); + //for (size_t i = 0; i < diff.size(); ++i) + // diff[i] *= rcp_batch_size; + opt->update(weight_grad, W, parallel); // W += grad + prev()->clear_grads(); + } + inline acc_t get_masked_loss() { + //acc_t total_loss = acc_t(0); + //size_t valid_sample_count = 0; + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + //for (size_t i = begin_; i < end_; i ++) { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + //} + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; + } + +protected: + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + MaskList masks_; // masks to show which samples are valid + size_t num_dims; // number of dimensions + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + vec_t loss; // error for each vertex: N x 1 +}; + +// head: layer i+1, tail: layer i +inline void connect(layer *head, layer *tail, + size_t head_index = 0, size_t tail_index = 0) { + //auto out_shape = head->out_shape()[head_index]; + //auto in_shape = tail->in_shape()[tail_index]; + //head->setup(false); + //if (in_shape.size() == 0) { + // tail->set_in_shape(out_shape); + // in_shape = out_shape; + //} + //if (out_shape.size() != in_shape.size()) + // connection_mismatch(*head, *tail); + //if (!head->next_[head_index]) + // throw nn_error("output edge must not be null"); + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); +} + diff --git a/libdeepgalois/layers/linear_layer.h b/libdeepgalois/layers/linear_layer.h new file mode 100644 index 0000000000..e4ff524f3f --- /dev/null +++ b/libdeepgalois/layers/linear_layer.h @@ -0,0 +1,28 @@ +#pragma once +#include "layer.h" + +class linear_layer : public layer { +public: + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i ++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } +protected: + float_t scale_, bias_; 
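+  // scale_ and bias_ define the element-wise affine map computed above:
+  // out = scale_ * in + bias_ (identity by default, with scale_ = 1.0 and bias_ = 0.0).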
+}; diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h new file mode 100644 index 0000000000..389e6b3c1f --- /dev/null +++ b/libdeepgalois/layers/relu_layer.h @@ -0,0 +1,24 @@ +#pragma once +#include "layer.h" + +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + std::string layer_type() const override { return std::string("relu"); } + // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0) + + negative_slope * std::min(in_data[i][j], (float_t)0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); + } + // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) + // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) override {} +}; diff --git a/libdeepgalois/layers/softmax_loss_layer.h b/libdeepgalois/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..bdd52e4d38 --- /dev/null +++ b/libdeepgalois/layers/softmax_loss_layer.h @@ -0,0 +1,47 @@ +#pragma once +#include "layer.h" + +class softmax_loss_layer: public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims, LabelList *lab) + : layer(level, in_dims, out_dims), labels(lab) { + trainable_ = false; + loss.resize(in_dims[0]); // error for each sample + name_ = layer_type() + "_" + std::to_string(level); + } + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : + softmax_loss_layer(level, in_dims, out_dims, NULL) {} + ~softmax_loss_layer() {} + std::string layer_type() const override { return std::string("softmax_loss"); } + + // TODO: need kernel fusion optimization + // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] + void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(in_data[i], out_data[i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; // one-hot + loss[i] = cross_entropy(y, out_data[i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); + } + + void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { + //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + vec_t norm_grad(output_dims[1]); + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; + d_cross_entropy(y, out_data[i], norm_grad); + d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); + } + +private: + LabelList *labels; +}; + diff --git a/libdeepgalois/lgraph.h b/libdeepgalois/lgraph.h new file mode 100644 index 0000000000..78f6f76aec --- /dev/null +++ b/libdeepgalois/lgraph.h @@ -0,0 +1,179 @@ +#ifndef __LGRAPH_HPP__ +#define __LGRAPH_HPP__ + 
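+// A hypothetical CSR example for the structure defined below: the undirected
+// triangle {0-1, 0-2, 1-2} would be stored as rowptr_ = {0,2,4,6} and
+// colidx_ = {1,2,0,2,0,1}, so the neighbors of vertex v are
+// colidx_[rowptr_[v] .. rowptr_[v+1]).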
+//defines the Learning Graph (LGraph) data structure +#include +#include +#include +#include +#include +#include +typedef unsigned IndexT; +typedef float ValueT; + +struct Edge { + IndexT src; + IndexT dst; + ValueT elabel; + Edge() : src(0), dst(0), elabel(0) {} + Edge(IndexT from, IndexT to, ValueT el) : + src(from), dst(to), elabel(el) {} + std::string to_string() const { + std::stringstream ss; + ss << "e(" << src << "," << dst << "," << elabel << ")"; + return ss.str(); + } +}; +typedef std::vector EdgeList; + +class LGraph { +public: + LGraph() : symmetrize_(false), directed_(false) {} + void clean() { + delete[] rowptr_; + delete[] colidx_; + delete[] weight_; + degrees.clear(); + el.clear(); + //labels_.clear(); + //vertices.clear(); + } + bool directed() const { return directed_; } + size_t num_vertices() const { return num_vertices_; } + size_t num_edges() const { return num_edges_; } + IndexT * out_rowptr() const { return rowptr_; } + IndexT * out_colidx() const { return colidx_; } + unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } + IndexT get_offset(IndexT n) { return rowptr_[n]; } + IndexT get_dest(IndexT n) { return colidx_[n]; } + ValueT get_weight(IndexT n) { return weight_[n]; } + unsigned get_max_degree() { return max_degree; } + //ValueT * labels() { return labels_.data(); } + //ValueT get_label(IndexT n) { return labels_[n]; } + void read_edgelist(const char *filename, bool symmetrize = false) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + IndexT max_vid = 0; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + IndexT u, v; + edge_stream >> u; + edge_stream >> v; + el.push_back(Edge(u, v, 1)); + if (symmetrize) el.push_back(Edge(v, u, 1)); + if (u > max_vid) max_vid = u; + if (v > max_vid) max_vid = v; + } + in.close(); + directed_ = true; + num_vertices_ = max_vid+1; + num_edges_ = el.size(); + std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; + MakeGraphFromEL(); + } + +private: + EdgeList el; + bool symmetrize_; // whether to symmetrize a directed graph + bool directed_; + size_t num_vertices_; + size_t num_edges_; + IndexT *rowptr_; + IndexT *colidx_; + ValueT *weight_; + unsigned max_degree; + std::vector degrees; + std::vector labels_; + std::vector > vertices; + + static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } + + void MakeGraphFromEL() { + SquishGraph(); + MakeCSR(false); + } + + void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { + std::vector neighbors; + for (size_t i = 0; i < num_vertices_; i++) + vertices.push_back(neighbors); + for (size_t i = 0; i < num_edges_; i ++) + vertices[el[i].src].push_back(el[i]); + el.clear(); + printf("Sorting the neighbor lists..."); + for (size_t i = 0; i < num_vertices_; i ++) + std::sort(vertices[i].begin(), vertices[i].end(), compare_id); + printf(" Done\n"); + //remove self loops + int num_selfloops = 0; + if(remove_selfloops) { + printf("Removing self loops..."); + for(size_t i = 0; i < num_vertices_; i ++) { + for(unsigned j = 0; j < vertices[i].size(); j ++) { + if(i == vertices[i][j].dst) { + vertices[i].erase(vertices[i].begin()+j); + num_selfloops ++; + j --; + } + } + } + printf(" %d selfloops are removed\n", num_selfloops); + num_edges_ -= num_selfloops; + } + // remove redundent + int num_redundents = 0; + if(remove_redundents) { + printf("Removing redundent edges..."); + for (size_t i = 0; i < num_vertices_; i ++) { + for (unsigned j = 1; j < 
vertices[i].size(); j ++) { + if (vertices[i][j].dst == vertices[i][j-1].dst) { + vertices[i].erase(vertices[i].begin()+j); + num_redundents ++; + j --; + } + } + } + printf(" %d redundent edges are removed\n", num_redundents); + num_edges_ -= num_redundents; + } + } + + void MakeCSR(bool transpose) { + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i ++) + degrees[i] = vertices[i].size(); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); + + std::vector offsets(degrees.size() + 1); + IndexT total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; + + assert(num_edges_ == offsets[num_vertices_]); + weight_ = new ValueT[num_edges_]; + colidx_ = new IndexT[num_edges_]; + rowptr_ = new IndexT[num_vertices_+1]; + for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i ++) { + for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { + Edge e = *it; + assert(i == e.src); + if (symmetrize_ || (!symmetrize_ && !transpose)) { + weight_[offsets[e.src]] = e.elabel; + colidx_[offsets[e.src]++] = e.dst; + } + if (symmetrize_ || (!symmetrize_ && transpose)) { + weight_[offsets[e.dst]] = e.elabel; + colidx_[offsets[e.dst]++] = e.src; + } + } + } + } +}; + +#endif diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/math_functions.hpp new file mode 100644 index 0000000000..8791416441 --- /dev/null +++ b/libdeepgalois/math_functions.hpp @@ -0,0 +1,500 @@ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include "utils.h" +#include + +#ifdef WITH_BLAS +extern "C" { +#include +//#include +} +#endif + +const float negative_slope = 0; + +// vector add +template +inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { + //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; + size_t n = out.size(); + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +} + +template +inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +} + +// vector subtract +template +inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; +} + +// vector multiply +template +inline void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; +} + +// vector divide +template +inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) { + assert(in_b[i] != 0); + out[i] = in_a[i] / in_b[i]; + } +} + +// vector add scalar +template +inline void add_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; +} + +// vector subtract scalar +template +inline void sub_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] -= 
alpha; +} + +// vector multiply scalar +template +inline void mul_scalar(const DataTy alpha, std::vector &Y) { + for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; +} + +template +inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { + for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; +} + +// vector divide scalar +template +inline void div_scalar(const DataTy alpha, std::vector &Y) { + assert(alpha != 0); + for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; +} + +// dot product +template +inline DataTy dot(const std::vector &x, const std::vector &y) { + DataTy sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; +} + +// matrix-vector multiply +inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { + size_t m = out_vector.size(); + size_t n = in_vector.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out_vector[i] += matrix[i*n+j] * in_vector[j]; + } + } +} + +// vector-vector multiply +inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { + size_t m = a.size(); + size_t n = b.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out[i][j] += a[i] * b[j]; + } + } +} + +// matrix addition +inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { + for (size_t i = 0; i < x; ++i) + for (size_t j = 0; j < y; ++j) + C[i][j] = A[i][j] + B[i][j]; +} + +// TODO: vectorize +template +inline void copy2D1D(const tensor_t &in, vec_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); +#ifdef WITH_BLAS + auto ptr = &out[0]; + for (size_t i = 0; i < x; i++) { + std::copy(in[i].begin(), in[i].end(), ptr); + ptr += y; + } +#else + assert(out.size() == x*y); + for (size_t i = 0; i < x; i ++) { + for (size_t j = 0; j < y; j ++) { + out[i*y+j] = in[i][j]; + } + } +#endif +} + +// matrix multiply: all 2D +inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(C.size() == dim_x); + assert(B.size() == dim_z); + assert(B[0].size() == dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } +} + +inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const vec_t &A, const vec_t &B, vec_t &C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); +#ifdef WITH_BLAS + const int M = dim_x; + const int N = dim_y; + const int K = dim_z; + const float alpha = 1.0; + const float beta = 0.0; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); +#else + //std::cout << "using naive matmul, slow\n"; + assert(A.size() == dim_x*dim_z); + assert(B.size() == dim_z*dim_y); + assert(C.size() == dim_x*dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i*dim_y+j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; + } + } + } +#endif + Tmatmul.stop(); +} + +inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_z = A[0].size(); + assert(B.size() == dim_z*dim_y); + assert(C.size() == dim_x*dim_y); + +#ifdef WITH_BLAS + vec_t A1D(dim_x*dim_z); + copy2D1D(A, A1D); + matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); +#else + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i*dim_y+j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i*dim_y+j] += A[i][k] * B[k][j]; + } + } + } +#endif +} + +// matrix multiply +inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = C.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(A.size() == dim_x); + assert(B.size() == dim_y*dim_z); + +#ifdef WITH_BLAS + vec_t A1D(dim_x*dim_z); + vec_t C1D(dim_x*dim_y, 0); + auto ptr = &A1D[0]; + for (size_t i = 0; i < dim_x; i++) { + std::copy(A[i].begin(), A[i].end(), ptr); + ptr += dim_z; + } + matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); + for (size_t i = 0; i < dim_x; i++) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = C1D[i*dim_y+j]; + } + } +#else + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k*dim_y+j]; + } + } + } +#endif +} + +template +inline void transpose2D(const tensor_t &in, tensor_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i][j] = in[j][i]; + } + } +} + +// TODO: vectorize +template +inline void transpose2D1D(const tensor_t &in, vec_t &out) { + size_t x = in.size(); + size_t y = in[0].size(); + assert(out.size() == x*y); + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j][i]; + } + } +} + +template +inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j*y+i]; + } + } +} + +template +inline int argmax(const size_t n, const std::vector &x) { + DataTy max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +inline void clear(vec_t &in) { + for (size_t i = 0; i < in.size(); i++) in[i] = 0; +} + +inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, 
galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +template +inline void relu(const std::vector &in, std::vector &out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); + } +} + +template +inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); + } +} + +inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { + vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff +} + +inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) + out_diff[i] = in_diff[i]; +} + +template +inline float reduce_mean(const std::vector &x) { + size_t n = x.size(); + assert(n > 0); + float sum = (float)x[0]; + for (size_t i = 1; i < n; i++) { + sum += (float)x[i]; + } + return sum / (float)n; +} + +const float scale_ = 1. / (1. - dropout_rate); + +inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { + assert(mask.size() == out.size()); + //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale_; +} + +inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale_; +} + +inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * mask[i] * scale_; +} + +template +inline DataTy sigmoid_func(DataTy x) { + return 0.5 * tanh(0.5 * x) + 0.5; +} + +// Sigmoid +template +inline void sigmoid(std::vector &fv) { + size_t count = fv.size(); + for (size_t i = 0; i < count; ++i) { + fv[i] = sigmoid_func(fv[i]); + } +} + +// Softmax function takes an N-dimensional vector (X) of real number, +// and transforms it into a vector of real number in range (0,1) which add upto 1. 
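+// e.g. for a hypothetical input {1, 2, 3}, softmax returns approximately {0.090, 0.245, 0.665}.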
+// To make softmax func numerically stable, we simply normalize the values in the vector, +// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) +// exps = np.exp(X - np.max(X)) +// exps / np.sum(exps) +template +inline void softmax(const std::vector &input, std::vector &output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +template +inline void log_softmax(const std::vector &input, std::vector &output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) + denominator += std::exp(input[i] - max); + for (size_t i = 0; i < input.size(); i++) + output[i] = input[i] - max - denominator; +} + +// Due to the desirable property of softmax function outputting a probability distribution, +// we often use it as the final layer in neural networks. +// For this we need to calculate the derivative or gradient, +// and pass it back to the previous layer during backpropagation. +template +inline void d_softmax(const std::vector &y, const std::vector &p, + std::vector &dy, const std::vector &dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + //DataTy delta_ij = i == j? 1 : 0; + //df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +/* + for (size_t j = 0; j < x.size(); j++) { + for (size_t k = 0; k < x.size(); k++) { + df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; + } + dx[j] = vectorize::dot(&dy[0], &df[0], len); + } +*/ +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +template +inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { + auto n = y.size(); + assert(n > 0); + DataTy loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) continue; + if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); + //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); + //loss -= y[i] * std::log(p[i]); + } + return loss; +} + +template +inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { + auto n = y.size(); + //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + //d[i] = p[i] - y[i]; + } +} + +#endif diff --git a/libdeepgalois/net.h b/libdeepgalois/net.h new file mode 100644 index 0000000000..fac7caee00 --- /dev/null +++ b/libdeepgalois/net.h @@ -0,0 +1,341 @@ +#ifndef _MODEL_H_ +#define _MODEL_H_ + +#include +#include "gnn.h" +#include "lgraph.h" +#include "layers.h" +#include "optimizer.h" + +#define NUM_CONV_LAYERS 2 + +// N: number of vertices, D: feature vector dimentions, +// E: number of distinct labels, i.e. 
number of vertex classes +// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) +// layer 2: features N x 16, weights 16 x E, out N x E +class Net { +public: + Net() {} + + // user-defined aggregate function + virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} + + // user-defined combine function + virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} + + void init() { + assert(dropout_rate < 1.0); + read_graph(dataset, g); + n = g.size(); // N + labels.resize(n, 0); // label for each vertex: N x 1 + num_classes = read_labels(dataset, labels); + + std::cout << "Reading label masks ... "; + train_mask.resize(n, 0); + val_mask.resize(n, 0); + if (dataset == "reddit") { + train_begin = 0, train_count = 153431, train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + } else { + train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; + + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + input_features.resize(n); // input embedding: N x D + feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); + } + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } + size_t get_nnodes() { return n; } + size_t get_nedges() { return g.sizeEdges(); } + size_t get_ft_dim() { return feature_dims[0]; } + size_t get_nclasses() { return num_classes; } + size_t get_label(size_t i) { return labels[i]; } + void construct_layers() { + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(input_features); // feed input data + } + + void set_netphase(net_phase phase) { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->set_context(phase); + } + + void print_layers_info() { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->print_layer_info(); + } + + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = n; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); + if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); + } + + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = n; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); + connect(layers[layer_id-1], layers[layer_id]); + } + + // forward propagation: [begin, end) is the range of samples used. 
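+  // Only samples with masks[i] == 1 inside [begin, end) contribute to the returned
+  // masked loss (see layer::get_masked_loss()); masked_accuracy() below reuses the same mask.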
+ acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { + // set mask for the last layer + layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i ++) + layers[i]->forward(); + return layers[num_layers-1]->get_masked_loss(); + } + + // back propogation + void bprop() { + for (size_t i = num_layers; i != 0; i --) + layers[i-1]->backward(); + } + + // update trainable weights after back-propagation + void update_weights(optimizer *opt) { + for (size_t i = 0; i < num_layers; i ++) + if (layers[i]->trainable()) layers[i]->update_weight(opt); + } + + // evaluate, i.e. inference or predict + double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks); + t_eval.Stop(); + return t_eval.Millisecs(); + } + + // training + void train(optimizer *opt) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (size_t i = 0; i < epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; + t_epoch.Start(); + + // training steps + set_netphase(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphase(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + + if (do_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } + } + +protected: + size_t n; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + std::vector feature_dims; // feature dimnesions for each layer + + Graph g; // the input graph, |V| = N + tensor_t input_features; // input features: N x D + std::vector labels; // labels for classification: N x 1 + MaskList train_mask, val_mask; // masks for traning and validation + size_t train_begin, train_end, train_count, val_begin, val_end, val_count; + + std::vector layers; // all the layers in the neural network + /* + inline void init_features(size_t dim, vec_t &x) { + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); + } + //*/ + + // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). 
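+  // e.g. (hypothetical) a one-hot row "0 0 1 0" in <dataset>-labels.txt is stored here as labels[v] = 2.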
+ // Note that labels is not one-hot encoded vector and it can be computed + // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. + size_t read_labels(std::string dataset_str, LabelList &labels) { + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == labels.size()); // number of vertices + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < n; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v ++; + } + in.close(); + t_read.Stop(); + // number of vertex classes + std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; + } + + size_t read_features(std::string dataset_str, tensor_t &feats) { + std::cout << "Reading features ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == feats.size()); // m = number of vertices + for (size_t i = 0; i < m; ++i) { + feats[i].resize(n); + for (size_t j = 0; j < n; ++j) + feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u][v] = w; + } + /* + for (size_t i = 0; i < 10; ++i) + for (size_t j = 0; j < n; ++j) + if (feats[i][j] > 0) + std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; + //*/ + in.close(); + t_read.Stop(); + std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; + } + + unsigned read_graph(std::string dataset_str, Graph &graph) { + //printf("Start readGraph\n"); + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + unsigned max_degree = 0; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); //symmetrize + genGraph(lgraph, graph); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph, filename); + /* + galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { + graph.getData(vid) = 1; + //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); + std::vector degrees(graph.size()); + galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { + degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); + }, galois::loopname("computeMaxDegree")); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); + */ + } else { printf("Unkown file format\n"); exit(1); } + if (filetype != "gr") { + max_degree = lgraph.get_max_degree(); + lgraph.clean(); + } + printf("max degree = %u\n", max_degree); + Tread.stop(); + //printf("Done readGraph\n"); + std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; + return max_degree; + } + + void genGraph(LGraph &lg, Graph &g) { + g.allocateFrom(lg.num_vertices(), 
lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i+1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset ++) + g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now + } + } + + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { + // comparing outputs with the ground truth (labels) + //acc_t accuracy_all = 0.0; + AccumF accuracy_all; + accuracy_all.reset(); + //for (size_t i = begin; i < end; i++) { + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + if ((label_t)prediction == labels[i]) accuracy_all += 1.0; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + //} + return accuracy_all.reduce() / (acc_t)count; + } +}; + +#endif diff --git a/libdeepgalois/node.h b/libdeepgalois/node.h new file mode 100644 index 0000000000..deffebad9b --- /dev/null +++ b/libdeepgalois/node.h @@ -0,0 +1,109 @@ +#pragma once +#include +class node; +class layer; +class edge; + +typedef std::shared_ptr edgeptr_t; + +// node data structure +class node : public std::enable_shared_from_this { +public: + node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + //const std::vector &prev() const { return prev_; } + const edgeptr_t next() const { return next_; } + //const std::vector &next() const { return next_; } + //std::vector prev_nodes() const; + //std::vector next_nodes() const; + +protected: + node() = delete; + friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); + //mutable std::vector prev_; + //mutable std::vector next_; + mutable edgeptr_t prev_; + mutable edgeptr_t next_; +}; + +// edges manage the input/output data and gradients between nodes +class edge { +public: + edge(node *prev, size_t len) : + ft_dim_(len), + data_({vec_t(len)}), + grad_({vec_t(len)}), + prev_(prev) {} + + void merge_grads(vec_t *dst) { + assert(!grad_.empty()); + const auto &grad_head = grad_[0]; + size_t sz = grad_head.size(); + dst->resize(sz); + float_t *pdst = &(*dst)[0]; + std::copy(grad_head.begin(), grad_head.end(), pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < grad_.size(); ++sample) { + for (size_t i = 0; i < sz; i++) + pdst[i] += grad_[sample][i]; + //vectorize::reduce(&grad_[sample][0], sz, pdst); + } + } + void clear_grads() { + for (size_t sample = 0; sample < grad_.size(); ++sample) { + auto &g = grad_[sample]; + std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize + //vectorize::fill(&g[0], g.size(), float_t{0}); + } + } + + tensor_t *get_data_ptr() { return &data_; } + tensor_t &get_data() { return data_; } + //const tensor_t *get_data() const { return &data_; } + const tensor_t &get_data() const { return data_; } + //tensor_t *get_gradient() { return &grad_; } + tensor_t &get_gradient() { return grad_; } + //const tensor_t *get_gradient() const { return &grad_; } + const tensor_t &get_gradient() const { return grad_; } + + //const std::vector &next() const { return next_; } + const node *next() const { return next_; } + node *prev() { return prev_; } + const node *prev() const { return prev_; } + //const shape3d &shape() const { return 
shape_; } + //vector_type vtype() const { return vtype_; } + //void add_next_node(node *next) { next_.push_back(next); } + void add_next_node(node *next) { next_ = next; } +private: + //shape3d shape_; + size_t ft_dim_; + //vector_type vtype_; + tensor_t data_; + tensor_t grad_; + node *prev_; // previous node, "producer" of this tensor + node *next_; // next node, "consumer" of this tensor + //std::vector next_; // next nodes, "consumers" of this tensor +}; +/* +inline std::vector node::prev_nodes() const { + std::vector vecs; + for (auto &e : prev_) { + if (e && e->prev()) { + vecs.insert(vecs.end(), e->prev()); + } + } + return vecs; +} + +inline std::vector node::next_nodes() const { + std::vector vecs; + for (auto &e : next_) { + if (e) { + auto n = e->next(); + vecs.insert(vecs.end(), n.begin(), n.end()); + } + } + return vecs; +} +*/ diff --git a/libdeepgalois/optimizer.h b/libdeepgalois/optimizer.h new file mode 100644 index 0000000000..2896881fed --- /dev/null +++ b/libdeepgalois/optimizer.h @@ -0,0 +1,221 @@ +#pragma once + +#include +#include +#include "types.h" + +// base class of optimizer +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) +struct optimizer { + optimizer() = default; + optimizer(const optimizer &) = default; + optimizer(optimizer &&) = default; + optimizer &operator=(const optimizer &) = default; + optimizer &operator=(optimizer &&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; + virtual void reset() {} // override to implement pre-learning action +}; + +// helper class to hold N values for each weight +template +struct stateful_optimizer : public optimizer { + void reset() override { for (auto &e : E_) e.clear(); } +protected: + template + vec_t &get(const vec_t &key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); + return E_[Index][&key]; + } + std::unordered_map E_[N]; +}; + +/** + * adaptive gradient method + * + * J Duchi, E Hazan and Y Singer, + * Adaptive subgradient methods for online learning and stochastic optimization + * The Journal of Machine Learning Research, pages 2121-2159, 2011. 
+ **/ +struct adagrad : public stateful_optimizer<1> { + adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } + } + float_t alpha; // learning rate + private: + float_t eps; +}; + +/** + * RMSprop + * + * T Tieleman, and G E Hinton, + * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) + **/ +struct RMSprop : public stateful_optimizer<1> { + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); + } + float_t alpha; // learning rate + float_t mu; // decay term +private: + float_t eps; // constant value to avoid zero-division +}; + +// Adam: A Method for Stochastic Optimization +// http://arxiv.org/abs/1412.6980 +struct adam : public stateful_optimizer<2> { + adam() : alpha(learning_rate), b1(float_t(0.9)), + b2(float_t(0.999)), b1_t(float_t(0.9)), + b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * @brief [a new optimizer (2015)] + * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) + * http://arxiv.org/abs/1412.6980] + * + */ +struct adamax : public stateful_optimizer<2> { + adamax() + : alpha(float_t(0.002)), + b1(float_t(0.9)), + b2(float_t(0.999)), + b1_t(b1), + eps(float_t(1e-8)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); + b1_t *= b1; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + +private: + float_t eps; // constant value to avoid zero-division +}; + +/** + * SGD without momentum + * + * slightly faster than tiny_dnn::momentum + **/ +struct gradient_descent : public optimizer { + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const 
vec_t &dW, vec_t &W, bool parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); + } + float_t alpha; // learning rate + float_t lambda; // weight decay +}; + +/** + * SGD with momentum + * + * B T Polyak, + * Some methods of speeding up the convergence of iteration methods + * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. + **/ +struct momentum : public stateful_optimizer<1> { + public: + momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + + //for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + //}); + }, galois::loopname("momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + +/** + * SGD with Nesterov momentum + * + * Y Nesterov, + * A method for unconstrained convex minimization problem with the rate of + * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. + **/ +struct nesterov_momentum : public stateful_optimizer<1> { + public: + nesterov_momentum() + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + + //for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + //}); + }, galois::loopname("nesterov_momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum +}; + diff --git a/libdeepgalois/random.h b/libdeepgalois/random.h new file mode 100644 index 0000000000..9236e9c391 --- /dev/null +++ b/libdeepgalois/random.h @@ -0,0 +1,63 @@ +#ifndef RANDOM_H +#define RANDOM_H +typedef boost::mt19937 rng_t; + +// random seeding +int64_t seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +// This random number generator facade hides boost and CUDA rng +// implementation from one another (for cross-platform compatibility). 
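[Editor's note] To make the optimizer interface defined above concrete: every optimizer receives the gradient vector dW for one parameter vector W and updates W in place; the stateful variants (adagrad, RMSprop, adam, adamax, momentum, nesterov_momentum) additionally keep per-parameter history via stateful_optimizer::get<Index>(W). The following is an illustrative, self-contained sketch of that contract using a plain SGD step; the sgd_sketch struct and the toy numbers are the editor's invention and are not part of this commit.

    #include <cstddef>
    #include <vector>
    typedef float float_t;
    typedef std::vector<float_t> vec_t;

    // Mimics optimizer::update(dW, W, parallelize): gradient in, weights updated in place.
    struct sgd_sketch {
      float_t alpha = 0.01f; // learning rate
      void update(const vec_t& dW, vec_t& W, bool /*parallelize*/) {
        for (size_t i = 0; i < W.size(); i++)
          W[i] -= alpha * dW[i]; // plain gradient step; adam/momentum would add state here
      }
    };

    int main() {
      vec_t W  = {0.5f, -0.2f}; // one layer's weights, flattened
      vec_t dW = {0.1f,  0.3f}; // gradient produced by back-propagation
      sgd_sketch opt;
      opt.update(dW, W, false); // after this, W is approximately {0.499, -0.203}
      return 0;
    }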
+class RNG { +public: + RNG() : generator_(new Generator()) { } + explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } + explicit RNG(const RNG&); + RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } + void* generator() { return static_cast(generator_->rng()); } +private: + class Generator { + public: + Generator() : rng_(new rng_t(seedgen())) {} + explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} + rng_t* rng() { return rng_.get(); } + private: + std::shared_ptr rng_; + }; + + std::shared_ptr generator_; +}; + +std::shared_ptr random_generator_; +inline static RNG& rng_stream() { + random_generator_.reset(new RNG()); + return *random_generator_; +} + +inline rng_t* rng() { + return static_cast(rng_stream().generator()); +} + +#include +template +void rng_bernoulli(const DataTy p, std::vector &r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(rng(), random_distribution); + for (size_t i = 0; i < r.size(); ++i) + r[i] = static_cast(variate_generator()); +} + +#endif diff --git a/libdeepgalois/timer.h b/libdeepgalois/timer.h new file mode 100644 index 0000000000..e6c838c37b --- /dev/null +++ b/libdeepgalois/timer.h @@ -0,0 +1,21 @@ +#ifndef TIMER_H_ +#define TIMER_H_ +#include + +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } + double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } + double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; +#endif // TIMER_H_ diff --git a/libdeepgalois/types.h b/libdeepgalois/types.h new file mode 100644 index 0000000000..bc9fe21049 --- /dev/null +++ b/libdeepgalois/types.h @@ -0,0 +1,34 @@ +#ifndef TYPES_H +#define TYPES_H +#include +#include "galois/Galois.h" +#include "galois/graphs/LCGraph.h" + +#ifdef CNN_USE_DOUBLE +typedef double float_t; +typedef double feature_t; +#else +typedef float float_t; +typedef float feature_t; // feature type +#endif +typedef std::vector vec_t; // feature vector (1D) +typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector FV; // feature vector +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test +typedef std::vector LabelList; // label list to store label for each vertex +typedef std::vector MaskList; // mask list to store mask for each vertex +typedef galois::GAccumulator AccumF; +typedef galois::GAccumulator AccumU; + +#ifdef EDGE_LABEL +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#else +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#endif + +typedef Graph::GraphNode GNode; + +#endif diff --git a/libdeepgalois/utils.h b/libdeepgalois/utils.h new file mode 100644 index 0000000000..70356654b9 --- /dev/null +++ b/libdeepgalois/utils.h @@ -0,0 +1,119 @@ +#pragma once + +#include +#include +#include 
+#include +#include +#include "gnn.h" + +std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +enum class net_phase { train, test }; + +class ResourceManager { +public: + ResourceManager() {} + ~ResourceManager(){} + //peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return + "Peak memory: " + + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } +private: + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } +}; + +class Timer { +public: + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } + double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } + double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } +private: + struct timeval start_time_; + struct timeval elapsed_time_; +}; + +class random_generator { +public: + static random_generator &get_instance() { + static random_generator instance; + return instance; + } + std::mt19937 &operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +inline bool bernoulli(float_t p) { + return uniform_rand(float_t{0}, float_t{1}) <= p; +} + +size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { + if (dataset_str != "citeseer" && dataset_str != "cora") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + //std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count ++; + } + } + i ++; + } + //std::cout << mask_type + "_mask range: [" << begin << ", " << end + // << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt new file mode 100644 index 0000000000..c03a5c6676 --- /dev/null +++ b/lonestargnn/CMakeLists.txt @@ -0,0 +1,8 @@ +include_directories(BEFORE + ${CMAKE_SOURCE_DIR}/libllvm/include + ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include +) +include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) +include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois) + 
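[Editor's note] The on-disk layout of the mask files parsed by read_masks() in utils.h above is easy to miss: the first line holds the [begin, end) range of the split, and every following line holds one 0/1 flag for the corresponding vertex id. The self-contained sketch below reproduces that parsing on an invented five-vertex file; the file name and all numbers are made up for illustration and are not part of this commit.

    #include <cstdint>
    #include <fstream>
    #include <iostream>
    #include <sstream>
    #include <string>
    #include <vector>

    int main() {
      // Write a toy mask file in the same format as <dataset>-train_mask.txt.
      std::ofstream out("toy-train_mask.txt");
      out << "0 3\n" << "1\n1\n1\n0\n0\n"; // header "begin end", then one flag per vertex
      out.close();

      // Parse it the way read_masks() does.
      std::ifstream in("toy-train_mask.txt");
      size_t begin, end, i = 0, count = 0;
      in >> begin >> end >> std::ws;          // first line: [begin, end) of the labeled range
      std::vector<uint8_t> masks(5, 0);
      std::string line;
      while (std::getline(in, line)) {        // remaining lines: one 0/1 flag per vertex id
        unsigned m = 0;
        std::istringstream(line) >> m;
        if (i >= begin && i < end && m == 1) { masks[i] = 1; count++; }
        i++;
      }
      std::cout << count << " masked vertices in [" << begin << ", " << end << ")\n";
      return 0;
    }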
+add_subdirectory(gcn)
diff --git a/lonestargnn/README.md b/lonestargnn/README.md
new file mode 100644
index 0000000000..bae49e36a0
--- /dev/null
+++ b/lonestargnn/README.md
@@ -0,0 +1,60 @@
+DESCRIPTION
+===========
+
+This application performs vertex classification in an undirected graph.
+It uses a graph neural network (GNN) to learn vertex features,
+which are then used to classify vertices into different classes.
+
+INPUT
+===========
+
+The input dataset contains three parts:
+1. the input graph file: edgelist format of a |V| x |V| sparse matrix.
+2. the vertex label file: |V| lines, each containing one integer.
+3. the input feature file: edgelist format of a |V| x |D| sparse matrix.
+
+Vertex ids are expected to be sequential integers between 0 and |V|-1.
+|V| is the number of vertices. |D| is the dimension of the input feature vectors.
+
+BUILD
+===========
+
+1. Run cmake in the build directory: `cd build; cmake -DUSE_DEEPGALOIS=1 -DUSE_BLAS=1 ../`
+
+2. Run `cd /lonestargnn/gcn; make -j`
+
+RUN
+===========
+
+The following are a few example command lines.
+
+$ export OPENBLAS_NUM_THREADS=28
+$ ./gcn cora -t=1 -k=3
+$ ./gcn citeseer -t=3 -k=30
+$ ./gcn reddit -t=56 -k=3
+
+PERFORMANCE
+===========
+- I
+- I
+- I
+
+REFERENCES
+===========
+The GCN model:
+Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017)
+http://arxiv.org/abs/1609.02907
+https://github.com/tkipf/gcn
+
+DGL:
+Deep Graph Library: Towards Efficient and Scalable Deep Learning on Graphs
+https://arxiv.org/abs/1909.01315
+https://github.com/dmlc/dgl
+
+GraphSAGE:
+Inductive Representation Learning on Large Graphs
+http://snap.stanford.edu/graphsage/
+
+NeuGraph: Parallel Deep Neural Network Computation on Large Graphs
+https://www.usenix.org/conference/atc19/presentation/ma
+
diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt
new file mode 100644
index 0000000000..f1a65740f7
--- /dev/null
+++ b/lonestargnn/gcn/CMakeLists.txt
@@ -0,0 +1,16 @@
+SET(USE_BLAS ON CACHE BOOL "Use blas")
+
+SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include)
+SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib)
+
+if (USE_BLAS)
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS")
+  include_directories(${BLAS_INC})
+  link_directories(${BLAS_LIB})
+endif()
+
+app(gcn gcn.cpp)
+
+if (USE_BLAS)
+  target_link_libraries(gcn -lopenblas)
+endif()
diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp
new file mode 100644
index 0000000000..8d1f792fd1
--- /dev/null
+++ b/lonestargnn/gcn/gcn.cpp
@@ -0,0 +1,47 @@
+// Graph Neural Networks
+// Xuhao Chen
+#include "gnn.h"
+#include "lonestargnn.h"
+
+const char* name = "Graph Convolutional Networks";
+const char* desc = "Graph convolutional neural networks on an undirected graph";
+const char* url = 0;
+
+int main(int argc, char** argv) {
+  galois::SharedMemSys G;
+  LonestarGnnStart(argc, argv, name, desc, url);
+  Net network; // the neural network to train
+  network.init();
+  network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user
+  network.print_layers_info();
+  ResourceManager rm;
+
+  // the optimizer used to update parameters, see optimizer.h for more details
+  //optimizer *opt = new gradient_descent();
+  //optimizer *opt = new adagrad();
+  optimizer *opt = new adam();
+  galois::StatTimer Ttrain("TrainAndVal");
+  Ttrain.start();
+  network.train(opt); // do training using training samples
+  Ttrain.stop();
+
+  if (do_test) {
+    // test using test samples
+    size_t n =

network.get_nnodes(); + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 0, test_end = n, test_count = n; + MaskList test_mask(n, 0); + if (dataset == "reddit") { + test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; + for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; + } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; + Ttest.stop(); + } + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; +} + diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestargnn/graphsage/gs-mean.cpp new file mode 100644 index 0000000000..b70cdc183c --- /dev/null +++ b/lonestargnn/graphsage/gs-mean.cpp @@ -0,0 +1,41 @@ +// Graph Neural Networks +// Xuhao Chen +#include "gnn.h" + +const char* name = "GraphSage"; +const char* desc = "A graph neural network variant: GraphSAGE"; +const char* url = 0; + +class GraphSageMean: public graph_conv_layer { + // user-defined combine function +}; + +int main(int argc, char** argv) { + galois::SharedMemSys G; + LonestarStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(); // default setting for now; see its implementation to find how to customize it by the user + ResourceManager rm; + + // the optimizer used to update parameters, see optimizer.h for more details + //optimizer *opt = new gradient_descent(); + //optimizer *opt = new adagrad(); + optimizer *opt = new adam(); + galois::StatTimer Ttrain("Train"); + Ttrain.start(); + network.train(opt); // do training using training samples + Ttrain.stop(); + + // test using test samples + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; + Ttest.stop(); + + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; +} + diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h new file mode 100644 index 0000000000..e53dc2e461 --- /dev/null +++ b/lonestargnn/lonestargnn.h @@ -0,0 +1,50 @@ +#pragma once + +#include +#include +#include "galois/Galois.h" +#include "galois/Version.h" +#include "llvm/Support/CommandLine.h" + +//! standard global options to the benchmarks +extern llvm::cl::opt skipVerify; +extern llvm::cl::opt numThreads; +extern llvm::cl::opt statFile; + +//! standard global options to the benchmarks +llvm::cl::opt skipVerify("noverify", llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); +llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); +llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); + +static void LonestarGnnPrintVersion() { + std::cout << "LoneStar Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; +} + +//! 
initialize lonestargnn benchmark +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + numThreads = galois::setActiveThreads(numThreads); + galois::runtime::setStatFile(statFile); + LonestarGnnPrintVersion(); + std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) std::cout << desc << "\n"; + if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; + std::cout << "\n"; + + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) cmdout << " "; + } + + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); +} + diff --git a/lonestargnn/run-citeseer.sh b/lonestargnn/run-citeseer.sh new file mode 100755 index 0000000000..a70f0bdc1f --- /dev/null +++ b/lonestargnn/run-citeseer.sh @@ -0,0 +1 @@ +./gcn citeseer -t=56 -k=3 From f175999b9a60423d9c994a7e39e5a4a3dbcf28d2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 19 Feb 2020 14:42:07 -0600 Subject: [PATCH 002/660] add gpu --- libdeepgalois/common.h | 50 +++++++++++++++++++++++++++++++ libdeepgalois/cutils.h | 40 +++++++++++++++++++++++++ libdeepgalois/gnn.h | 31 ------------------- libdeepgalois/gpu_kernels.hpp | 41 +++++++++++++++++++++++++ libdeepgalois/layers/relu_layer.h | 10 +++++-- libdeepgalois/math_functions.hpp | 26 ++++++++-------- libdeepgalois/net.h | 3 +- libdeepgalois/utils.h | 1 - lonestargnn/gcn/gcn.cpp | 1 - lonestargnn/lonestargnn.h | 31 +++++++++++++++---- 10 files changed, 179 insertions(+), 55 deletions(-) create mode 100644 libdeepgalois/common.h create mode 100644 libdeepgalois/cutils.h delete mode 100644 libdeepgalois/gnn.h create mode 100644 libdeepgalois/gpu_kernels.hpp diff --git a/libdeepgalois/common.h b/libdeepgalois/common.h new file mode 100644 index 0000000000..e1bff6901d --- /dev/null +++ b/libdeepgalois/common.h @@ -0,0 +1,50 @@ +#pragma once +#include "cutils.h" + +class DeepGalois { +public: + ~DeepGalois(); + enum Brew { CPU, GPU }; + static DeepGalois& Get() { + } + inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } + inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } + inline static Brew mode() { return Get().mode_; } + inline static void set_mode(Brew mode) { Get().mode_ = mode; } + inline static int solver_count() { return Get().solver_count_; } + inline static void set_solver_count(int val) { Get().solver_count_ = val; } + inline static int solver_rank() { return Get().solver_rank_; } + inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } + inline static bool multiprocess() { return Get().multiprocess_; } + inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } + inline static bool root_solver() { return Get().solver_rank_ == 0; } + static void SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (Get().cublas_handle_) 
CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); + if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); + CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); + } + static void DeviceQuery(); + static bool CheckDevice(const int device_id); + static int FindDevice(const int start_id = 0); + +protected: + cublasHandle_t cublas_handle_; + curandGenerator_t curand_generator_; + shared_ptr random_generator_; + Brew mode_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; + +private: + // The private constructor to avoid duplicate instantiation. + DeepGalois(); +}; + diff --git a/libdeepgalois/cutils.h b/libdeepgalois/cutils.h new file mode 100644 index 0000000000..4356ec2979 --- /dev/null +++ b/libdeepgalois/cutils.h @@ -0,0 +1,40 @@ +#pragma once + +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +// CUDA: various checks for different function calls. +#define CUDA_CHECK(condition) \ + // Code block avoids redefinition of cudaError_t error \ + do { \ + cudaError_t error = condition; \ + CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + } while (0) + +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ + << caffe::cublasGetErrorString(status); \ + } while (0) + +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ + << caffe::curandGetErrorString(status); \ + } while (0) + +// CUDA: grid stride looping +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ + i < (n); i += blockDim.x * gridDim.x) + +// CUDA: check for error after kernel execution and exit loudly if there is one. +#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + diff --git a/libdeepgalois/gnn.h b/libdeepgalois/gnn.h deleted file mode 100644 index d2d2bafb28..0000000000 --- a/libdeepgalois/gnn.h +++ /dev/null @@ -1,31 +0,0 @@ -#ifndef _GNN_H_ -#define _GNN_H_ - -#include "galois/Galois.h" -#include "galois/Reduction.h" -#include "galois/Timer.h" -#include "galois/ParallelSTL.h" -#include "llvm/Support/CommandLine.h" -#include "galois/runtime/Profile.h" -#include - -namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. 
iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 - -#include "types.h" -#include "utils.h" -#include "net.h" - -#endif diff --git a/libdeepgalois/gpu_kernels.hpp b/libdeepgalois/gpu_kernels.hpp new file mode 100644 index 0000000000..7cb1068fc6 --- /dev/null +++ b/libdeepgalois/gpu_kernels.hpp @@ -0,0 +1,41 @@ +#pragma once +#include +#include +#include +#include +#include "cutils.h" + +// flattern data into 1D before feed into the ReLU operater +__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(index, n) { + out[index] = in[index] > 0 ? in[index] : 0; + } +} + +__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + } +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float *X) { + CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); +} + diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h index 389e6b3c1f..2795fc404e 100644 --- a/libdeepgalois/layers/relu_layer.h +++ b/libdeepgalois/layers/relu_layer.h @@ -13,12 +13,16 @@ class relu_layer : public layer { void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0) + - negative_slope * std::min(in_data[i][j], (float_t)0); + out_data[i][j] = std::max(in_data[i][j], (float_t)0); }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override {} + tensor_t &out_grad, tensor_t &in_grad) override { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) ? out_grad[i][j] : float_t(0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); + } }; diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/math_functions.hpp index 8791416441..f1612aac1c 100644 --- a/libdeepgalois/math_functions.hpp +++ b/libdeepgalois/math_functions.hpp @@ -164,27 +164,27 @@ inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } } +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); -#ifdef WITH_BLAS - const int M = dim_x; - const int N = dim_y; - const int K = dim_z; - const float alpha = 1.0; - const float beta = 0.0; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); -#else - //std::cout << "using naive matmul, slow\n"; assert(A.size() == dim_x*dim_z); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); +#ifdef WITH_BLAS + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); +#else for (size_t i = 0; i < dim_x; ++i) { for (size_t j = 0; j < dim_y; ++j) { C[i*dim_y+j] = 0; diff --git a/libdeepgalois/net.h b/libdeepgalois/net.h index fac7caee00..f6d6930d5a 100644 --- a/libdeepgalois/net.h +++ b/libdeepgalois/net.h @@ -2,7 +2,8 @@ #define _MODEL_H_ #include -#include "gnn.h" +#include "galois/Galois.h" +#include "galois/Timer.h" #include "lgraph.h" #include "layers.h" #include "optimizer.h" diff --git a/libdeepgalois/utils.h b/libdeepgalois/utils.h index 70356654b9..100a997b57 100644 --- a/libdeepgalois/utils.h +++ b/libdeepgalois/utils.h @@ -5,7 +5,6 @@ #include #include #include -#include "gnn.h" std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset enum class net_phase { train, test }; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 8d1f792fd1..72fc8373fc 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -1,6 +1,5 @@ // Graph Neural Networks // Xuhao Chen -#include "gnn.h" #include "lonestargnn.h" const char* name = "Graph Convolutional Networks"; diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index e53dc2e461..efbb862fd7 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -1,10 +1,30 @@ #pragma once -#include #include +#include +#include "galois/Timer.h" #include "galois/Galois.h" #include "galois/Version.h" +#include "galois/Reduction.h" +#include "galois/ParallelSTL.h" +#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" +#include + +namespace cll = llvm::cl; +static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph +static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); +static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +#define CHUNK_SIZE 256 //! 
standard global options to the benchmarks extern llvm::cl::opt skipVerify; @@ -17,7 +37,7 @@ llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default val llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); static void LonestarGnnPrintVersion() { - std::cout << "LoneStar Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; + std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; } //! initialize lonestargnn benchmark @@ -33,18 +53,19 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, if (desc) std::cout << desc << "\n"; if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; std::cout << "\n"; - std::ostringstream cmdout; for (int i = 0; i < argc; ++i) { cmdout << argv[i]; if (i != argc - 1) cmdout << " "; } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); - char name[256]; gethostname(name, 256); galois::runtime::reportParam("(NULL)", "Hostname", name); } +#include "types.h" +#include "utils.h" +#include "net.h" + From b12af3048addd7133db6c0162abe0bc3d948c32c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 20 Feb 2020 13:29:30 -0600 Subject: [PATCH 003/660] add cpp --- libdeepgalois/CMakeLists.txt | 45 ++- libdeepgalois/gpu_kernels.hpp | 41 --- libdeepgalois/{ => include}/common.h | 8 +- libdeepgalois/{ => include}/cutils.h | 6 +- libdeepgalois/{ => include}/layers.h | 0 .../{ => include}/layers/arithmetic_layer.h | 0 .../{ => include}/layers/graph_conv_layer.h | 20 +- libdeepgalois/{ => include}/layers/layer.h | 0 .../{ => include}/layers/linear_layer.h | 0 libdeepgalois/include/layers/relu_layer.h | 15 + .../include/layers/softmax_loss_layer.h | 18 ++ libdeepgalois/{ => include}/lgraph.h | 0 libdeepgalois/include/math_functions.hpp | 43 +++ libdeepgalois/{ => include}/net.h | 6 +- libdeepgalois/{ => include}/node.h | 1 + libdeepgalois/{ => include}/optimizer.h | 6 +- libdeepgalois/{ => include}/random.h | 0 libdeepgalois/{ => include}/timer.h | 0 libdeepgalois/{ => include}/types.h | 1 + libdeepgalois/{ => include}/utils.h | 5 +- libdeepgalois/layers/relu_layer.h | 28 -- libdeepgalois/layers/softmax_loss_layer.h | 47 --- libdeepgalois/src/layers/relu_layer.cpp | 19 ++ .../src/layers/softmax_loss_layer.cpp | 34 +++ .../math_functions.cpp} | 281 ++++++------------ libdeepgalois/src/math_functions.cu | 84 ++++++ lonestargnn/CMakeLists.txt | 2 +- lonestargnn/gcn/CMakeLists.txt | 18 +- lonestargnn/lonestargnn.h | 1 - 29 files changed, 382 insertions(+), 347 deletions(-) delete mode 100644 libdeepgalois/gpu_kernels.hpp rename libdeepgalois/{ => include}/common.h (90%) rename libdeepgalois/{ => include}/cutils.h (91%) rename libdeepgalois/{ => include}/layers.h (100%) rename libdeepgalois/{ => include}/layers/arithmetic_layer.h (100%) rename libdeepgalois/{ => include}/layers/graph_conv_layer.h (94%) rename libdeepgalois/{ => include}/layers/layer.h (100%) rename libdeepgalois/{ => include}/layers/linear_layer.h (100%) create mode 100644 libdeepgalois/include/layers/relu_layer.h create mode 100644 libdeepgalois/include/layers/softmax_loss_layer.h rename libdeepgalois/{ => include}/lgraph.h (100%) create mode 100644 libdeepgalois/include/math_functions.hpp rename libdeepgalois/{ => include}/net.h (98%) rename libdeepgalois/{ => include}/node.h (99%) rename 
libdeepgalois/{ => include}/optimizer.h (97%) rename libdeepgalois/{ => include}/random.h (100%) rename libdeepgalois/{ => include}/timer.h (100%) rename libdeepgalois/{ => include}/types.h (98%) rename libdeepgalois/{ => include}/utils.h (92%) delete mode 100644 libdeepgalois/layers/relu_layer.h delete mode 100644 libdeepgalois/layers/softmax_loss_layer.h create mode 100644 libdeepgalois/src/layers/relu_layer.cpp create mode 100644 libdeepgalois/src/layers/softmax_loss_layer.cpp rename libdeepgalois/{math_functions.hpp => src/math_functions.cpp} (60%) create mode 100644 libdeepgalois/src/math_functions.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 8caa65ebc9..4f51532898 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,23 +1,52 @@ +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) + +#SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) +#SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) +#SET(ENABLE_GPU OFF CACHE BOOL "Use GPU for DeepGalois") +#if (ENABLE_GPU) +# target_compile_definitions(distbench PRIVATE __GALOIS_HET_CUDA__=1) +# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_GPU") +# include_directories(${CUDA_INC}) +# link_directories(${CUDA_LIB}) +#endif() + +#set(sources +# src/layers/relu_layer.cu +#) +#cuda_add_library(deepgalois_gpu +# ${sources} +#OPTIONS -D_FORCE_INLINES +#) +#target_include_directories(deepgalois_gpu PUBLIC +# ${CMAKE_SOURCE_DIR}/libgpu/include +#) +#set_target_properties(deepgalois_gpu PROPERTIES +# INTERFACE_POSITION_INDEPENDENT_CODE On +# POSITION_INDEPENDENT_CODE On +#) +#target_link_libraries(deepgalois -lcudart -lcublas) + set(sources - $ -# $ - $ + src/layers/relu_layer.cpp + src/layers/softmax_loss_layer.cpp + src/math_functions.cpp ) - add_library(deepgalois STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem galois_dist_async gllvm) +target_link_libraries(deepgalois galois_shmem gllvm) target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) +target_link_libraries(deepgalois -lopenblas) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include - ${CMAKE_SOURCE_DIR}/libdist/include - ${CMAKE_SOURCE_DIR}/libdeepgalios/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) -set_target_properties (deepgalois PROPERTIES +set_target_properties(deepgalois PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) diff --git a/libdeepgalois/gpu_kernels.hpp b/libdeepgalois/gpu_kernels.hpp deleted file mode 100644 index 7cb1068fc6..0000000000 --- a/libdeepgalois/gpu_kernels.hpp +++ /dev/null @@ -1,41 +0,0 @@ -#pragma once -#include -#include -#include -#include -#include "cutils.h" - -// flattern data into 1D before feed into the ReLU operater -__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : 0; - } -} - -__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; - } -} - -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order. 
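[Editor's note] The comment just above ("cublas follows fortran order") and the operand swap a few lines below in the removed sgemm_gpu are the standard trick for driving a column-major BLAS from row-major buffers: a row-major M x N matrix reinterpreted as column-major is its transpose, so the call computes

    C^T = B^T * A^T   (what cuBLAS sees, column-major)

which leaves exactly C = A * B laid out row-major. That is why B is passed before A, the dimensions appear as N, M, K, and the leading dimensions lda/ldb are chosen from the transpose flags on the next lines.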
- int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); -} - -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); -} - -void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); -} - diff --git a/libdeepgalois/common.h b/libdeepgalois/include/common.h similarity index 90% rename from libdeepgalois/common.h rename to libdeepgalois/include/common.h index e1bff6901d..0c3023c3f2 100644 --- a/libdeepgalois/common.h +++ b/libdeepgalois/include/common.h @@ -1,4 +1,6 @@ #pragma once +#include "types.h" +#include "utils.h" #include "cutils.h" class DeepGalois { @@ -29,9 +31,9 @@ class DeepGalois { CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); } - static void DeviceQuery(); - static bool CheckDevice(const int device_id); - static int FindDevice(const int start_id = 0); + static void DeviceQuery() {} + static bool CheckDevice(const int device_id) { return true; } + static int FindDevice(const int start_id = 0) { return 0; } protected: cublasHandle_t cublas_handle_; diff --git a/libdeepgalois/cutils.h b/libdeepgalois/include/cutils.h similarity index 91% rename from libdeepgalois/cutils.h rename to libdeepgalois/include/cutils.h index 4356ec2979..cda8d23cba 100644 --- a/libdeepgalois/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -1,4 +1,9 @@ #pragma once +#include +#include +#include +#include +#include // CUDA: use 256 threads per block const int CUDA_NUM_THREADS = 256; @@ -10,7 +15,6 @@ inline int CUDA_GET_BLOCKS(const int N) { // CUDA: various checks for different function calls. 
#define CUDA_CHECK(condition) \ - // Code block avoids redefinition of cudaError_t error \ do { \ cudaError_t error = condition; \ CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ diff --git a/libdeepgalois/layers.h b/libdeepgalois/include/layers.h similarity index 100% rename from libdeepgalois/layers.h rename to libdeepgalois/include/layers.h diff --git a/libdeepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/layers/arithmetic_layer.h similarity index 100% rename from libdeepgalois/layers/arithmetic_layer.h rename to libdeepgalois/include/layers/arithmetic_layer.h diff --git a/libdeepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h similarity index 94% rename from libdeepgalois/layers/graph_conv_layer.h rename to libdeepgalois/include/layers/graph_conv_layer.h index b81f7bc10e..2e304a0c98 100644 --- a/libdeepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -16,8 +16,9 @@ class graph_conv_layer: public layer { public: graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { + float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices x = input_dims[0]; y = input_dims[1]; @@ -26,7 +27,11 @@ class graph_conv_layer: public layer { name_ = layer_type() + "_" + std::to_string(level); //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; init(); + scale_ = 1. / (1. - dropout_rate_); } + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + ~graph_conv_layer() {} void init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; Timer t_alloc; @@ -50,9 +55,6 @@ class graph_conv_layer: public layer { t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} - ~graph_conv_layer() {} std::string layer_type() const override { return std::string("graph_conv"); } // user-defined aggregate function @@ -78,12 +80,10 @@ class graph_conv_layer: public layer { //Timer t_matmul, t_agg, t_dropout; //t_matmul.Start(); if (dropout_ && phase_ == net_phase::train) { - //t_dropout.Start(); //for (size_t i = 0; i < x; ++i) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); + dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); }, galois::loopname("dropout")); - //t_dropout.Stop(); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z //t_matmul.Stop(); @@ -119,7 +119,7 @@ class graph_conv_layer: public layer { update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); + d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); } } @@ -154,6 +154,8 @@ class graph_conv_layer: public layer { bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards bool dropout_; // whether to use dropout at first + const float dropout_rate_; + float scale_; net_phase phase_; size_t x; size_t y; diff --git a/libdeepgalois/layers/layer.h b/libdeepgalois/include/layers/layer.h similarity index 100% rename from libdeepgalois/layers/layer.h rename to libdeepgalois/include/layers/layer.h diff --git a/libdeepgalois/layers/linear_layer.h b/libdeepgalois/include/layers/linear_layer.h similarity index 100% rename from libdeepgalois/layers/linear_layer.h rename to libdeepgalois/include/layers/linear_layer.h diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h new file mode 100644 index 0000000000..c4acdd50ac --- /dev/null +++ b/libdeepgalois/include/layers/relu_layer.h @@ -0,0 +1,15 @@ +#pragma once +#include "layer.h" + +// ReLU Layer +class relu_layer : public layer { +public: + relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); +}; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h new file mode 100644 index 0000000000..6375f72121 --- /dev/null +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +class softmax_loss_layer: public layer { +public: + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims, LabelList *lab); + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : softmax_loss_layer(level, in_dims, 
out_dims, NULL) {} + ~softmax_loss_layer() {} + std::string layer_type() const override { return std::string("softmax_loss"); } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + +private: + LabelList *labels; +}; + diff --git a/libdeepgalois/lgraph.h b/libdeepgalois/include/lgraph.h similarity index 100% rename from libdeepgalois/lgraph.h rename to libdeepgalois/include/lgraph.h diff --git a/libdeepgalois/include/math_functions.hpp b/libdeepgalois/include/math_functions.hpp new file mode 100644 index 0000000000..d3d08b10b2 --- /dev/null +++ b/libdeepgalois/include/math_functions.hpp @@ -0,0 +1,43 @@ +#ifndef _MATH_FUNCTIONS_ +#define _MATH_FUNCTIONS_ +#include +#include "types.h" +#include + +const float negative_slope = 0; + +void vadd(const vec_t &a, const vec_t &b, vec_t &out); +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); +void vsub(const vec_t &a, const vec_t &b, vec_t &out); +void vmul(const vec_t &a, const vec_t &b, vec_t &out); +void vdiv(const vec_t &a, const vec_t &b, vec_t &out); +void add_scalar(const float_t alpha, vec_t &Y); +void sub_scalar(const float_t alpha, vec_t &Y); +void mul_scalar(const float_t alpha, vec_t &Y); +void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out); +void div_scalar(const float_t alpha, vec_t &Y); +float_t dot(const vec_t &x, const vec_t &y); +void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); +void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); +void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); +void copy2D1D(const tensor_t &in, vec_t &out); +void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); +void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); +void transpose2D(const tensor_t &in, tensor_t &out); +void transpose2D1D(const tensor_t &in, vec_t &out); +void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); +int argmax(const size_t n, const vec_t &x); +void clear(vec_t &in); +void relu(const vec_t &in, vec_t &out); +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); +void softmax(const vec_t &input, vec_t &output); +void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); +float_t cross_entropy(const vec_t &y, const vec_t &p); +void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); + +#endif diff --git a/libdeepgalois/net.h b/libdeepgalois/include/net.h similarity index 98% rename from libdeepgalois/net.h rename to libdeepgalois/include/net.h index f6d6930d5a..f845eed82e 100644 --- a/libdeepgalois/net.h +++ b/libdeepgalois/include/net.h @@ -25,7 +25,6 @@ class Net { virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} void init() { - assert(dropout_rate < 1.0); read_graph(dataset, g); n = 
g.size(); // N labels.resize(n, 0); // label for each vertex: N x 1 @@ -79,13 +78,14 @@ class Net { layers[i]->print_layer_info(); } - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float dropout_rate = 0.5) { + assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } diff --git a/libdeepgalois/node.h b/libdeepgalois/include/node.h similarity index 99% rename from libdeepgalois/node.h rename to libdeepgalois/include/node.h index deffebad9b..1a50080934 100644 --- a/libdeepgalois/node.h +++ b/libdeepgalois/include/node.h @@ -1,5 +1,6 @@ #pragma once #include +#include "types.h" class node; class layer; class edge; diff --git a/libdeepgalois/optimizer.h b/libdeepgalois/include/optimizer.h similarity index 97% rename from libdeepgalois/optimizer.h rename to libdeepgalois/include/optimizer.h index 2896881fed..d0f35eac11 100644 --- a/libdeepgalois/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -39,7 +39,7 @@ struct stateful_optimizer : public optimizer { * The Journal of Machine Learning Research, pages 2121-2159, 2011. **/ struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t &dW, vec_t &W, bool parallelize) { vec_t &g = get<0>(W); if (parallelize) { @@ -83,7 +83,7 @@ struct RMSprop : public stateful_optimizer<1> { // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(learning_rate), b1(float_t(0.9)), + adam() : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} @@ -96,7 +96,7 @@ struct adam : public stateful_optimizer<2> { // L2 norm based update rule W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } diff --git a/libdeepgalois/random.h b/libdeepgalois/include/random.h similarity index 100% rename from libdeepgalois/random.h rename to libdeepgalois/include/random.h diff --git a/libdeepgalois/timer.h b/libdeepgalois/include/timer.h similarity index 100% rename from libdeepgalois/timer.h rename to libdeepgalois/include/timer.h diff --git a/libdeepgalois/types.h b/libdeepgalois/include/types.h similarity index 98% rename from libdeepgalois/types.h rename to libdeepgalois/include/types.h index bc9fe21049..0aa80cce4f 100644 --- a/libdeepgalois/types.h +++ b/libdeepgalois/include/types.h @@ -30,5 +30,6 @@ typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::typ #endif typedef Graph::GraphNode GNode; +#define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/utils.h b/libdeepgalois/include/utils.h similarity index 92% rename from libdeepgalois/utils.h rename to 
libdeepgalois/include/utils.h index 100a997b57..ceb49b0e41 100644 --- a/libdeepgalois/utils.h +++ b/libdeepgalois/include/utils.h @@ -3,10 +3,11 @@ #include #include #include +#include #include #include -std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +const std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { @@ -85,7 +86,7 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t{0}, float_t{1}) <= p; } -size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { +inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { if (dataset_str != "citeseer" && dataset_str != "cora") { std::cout << "Dataset currently not supported\n"; exit(1); diff --git a/libdeepgalois/layers/relu_layer.h b/libdeepgalois/layers/relu_layer.h deleted file mode 100644 index 2795fc404e..0000000000 --- a/libdeepgalois/layers/relu_layer.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include "layer.h" - -// ReLU Layer -class relu_layer : public layer { -public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("relu"); } - // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); - } - // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) - // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); - } -}; diff --git a/libdeepgalois/layers/softmax_loss_layer.h b/libdeepgalois/layers/softmax_loss_layer.h deleted file mode 100644 index bdd52e4d38..0000000000 --- a/libdeepgalois/layers/softmax_loss_layer.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include "layer.h" - -class softmax_loss_layer: public layer { -public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { - trainable_ = false; - loss.resize(in_dims[0]); // error for each sample - name_ = layer_type() + "_" + std::to_string(level); - } - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : - softmax_loss_layer(level, in_dims, out_dims, NULL) {} - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - - // TODO: need kernel fusion optimization - // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); - } - - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); - } - -private: - LabelList *labels; -}; - diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp new file mode 100644 index 0000000000..ccabc8a090 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -0,0 +1,19 @@ +#include "layers/relu_layer.h" + +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, + tensor_t &out_grad, tensor_t &in_grad) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); +} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp new file mode 100644 index 0000000000..61f63f6f0e --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -0,0 +1,34 @@ +#include "layers/softmax_loss_layer.h" + +softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims, LabelList *lab) + : layer(level, in_dims, out_dims), labels(lab) { + trainable_ = false; + loss.resize(in_dims[0]); // error for each sample + name_ = layer_type() + "_" + std::to_string(level); +} + +// TODO: need kernel fusion optimization +// ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] +void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(in_data[i], out_data[i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; // one-hot + loss[i] = cross_entropy(y, out_data[i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); +} + +void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { + //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + vec_t norm_grad(output_dims[1]); + std::vector y(output_dims[1], 0.0); // ground truth + y[(*labels)[i]] = 1.0; + d_cross_entropy(y, out_data[i], norm_grad); + d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); +} + diff --git a/libdeepgalois/math_functions.hpp b/libdeepgalois/src/math_functions.cpp similarity index 60% rename from libdeepgalois/math_functions.hpp rename to libdeepgalois/src/math_functions.cpp index f1612aac1c..a4d1d77719 100644 --- a/libdeepgalois/math_functions.hpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,21 +1,13 @@ -#ifndef _MATH_FUNCTIONS_ -#define _MATH_FUNCTIONS_ -#include +#include "math_functions.hpp" #include "utils.h" -#include -#ifdef WITH_BLAS extern "C" { #include //#include } -#endif - -const float negative_slope = 0; // vector add -template -inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { +void vadd(const vec_t &a, const vec_t &b, vec_t &out) { //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; size_t n = out.size(); size_t vec_len = 8; @@ -25,8 +17,7 @@ inline void vadd(const std::vector &a, const std::vector &b, std for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } -template -inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) @@ -35,20 +26,17 @@ inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { } // vector subtract -template -inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; } // vector multiply -template -inline 
void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vmul(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; } // vector divide -template -inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { +void vdiv(const vec_t &in_a, const vec_t &in_b, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { assert(in_b[i] != 0); out[i] = in_a[i] / in_b[i]; @@ -56,46 +44,40 @@ inline void vdiv(const std::vector &in_a, const std::vector &in_ } // vector add scalar -template -inline void add_scalar(const DataTy alpha, std::vector &Y) { +void add_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; } // vector subtract scalar -template -inline void sub_scalar(const DataTy alpha, std::vector &Y) { +void sub_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; } // vector multiply scalar -template -inline void mul_scalar(const DataTy alpha, std::vector &Y) { +void mul_scalar(const float_t alpha, vec_t &Y) { for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; } -template -inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { +void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out) { for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; } // vector divide scalar -template -inline void div_scalar(const DataTy alpha, std::vector &Y) { +void div_scalar(const float_t alpha, vec_t &Y) { assert(alpha != 0); for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; } // dot product -template -inline DataTy dot(const std::vector &x, const std::vector &y) { - DataTy sum = 0; +float_t dot(const vec_t &x, const vec_t &y) { + float_t sum = 0; for (size_t i = 0; i < x.size(); ++i) sum += x[i] * y[i]; return sum; } // matrix-vector multiply -inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { +void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { size_t m = out_vector.size(); size_t n = in_vector.size(); for (size_t i = 0; i < m; ++i) { @@ -106,7 +88,7 @@ inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector } // vector-vector multiply -inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { +void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { size_t m = a.size(); size_t n = b.size(); for (size_t i = 0; i < m; ++i) { @@ -117,35 +99,32 @@ inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { } // matrix addition -inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { +void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { for (size_t i = 0; i < x; ++i) for (size_t j = 0; j < y; ++j) C[i][j] = A[i][j] + B[i][j]; } // TODO: vectorize -template -inline void copy2D1D(const tensor_t &in, vec_t &out) { +void copy2D1D(const tensor_t &in, vec_t &out) { size_t x = in.size(); size_t y = in[0].size(); -#ifdef WITH_BLAS auto ptr = &out[0]; for (size_t i = 0; i < x; i++) { std::copy(in[i].begin(), in[i].end(), ptr); ptr += y; } -#else - assert(out.size() == x*y); - for (size_t i = 0; i < x; i ++) { - for (size_t j = 0; j < y; j ++) { - out[i*y+j] = in[i][j]; - } - } -#endif } -// matrix multiply: all 2D -inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, 
const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); +} + +void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = A.size(); size_t dim_y = C[0].size(); @@ -164,72 +143,37 @@ inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); -} - -inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); assert(A.size() == dim_x*dim_z); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; - } - } - } -#endif Tmatmul.stop(); } -inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { +void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = A.size(); size_t dim_z = A[0].size(); assert(B.size() == dim_z*dim_y); assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS vec_t A1D(dim_x*dim_z); copy2D1D(A, A1D); matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i][k] * B[k][j]; - } - } - } -#endif } -// matrix multiply -inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { +void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { // A: x*z; B: z*y; C: x*y size_t dim_x = C.size(); size_t dim_y = C[0].size(); size_t dim_z = A[0].size(); assert(A.size() == dim_x); assert(B.size() == dim_y*dim_z); - -#ifdef WITH_BLAS vec_t A1D(dim_x*dim_z); vec_t C1D(dim_x*dim_y, 0); auto ptr = &A1D[0]; @@ -243,20 +187,9 @@ inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { C[i][j] = C1D[i*dim_y+j]; } } -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k*dim_y+j]; - } - } - } -#endif } -template -inline void transpose2D(const tensor_t &in, tensor_t &out) { +void transpose2D(const tensor_t &in, tensor_t &out) { size_t x = in.size(); size_t y = in[0].size(); for (size_t i = 0; i < y; i ++) { @@ -267,8 +200,7 @@ inline void transpose2D(const tensor_t &in, tensor_t &out) { } // TODO: vectorize -template -inline void transpose2D1D(const tensor_t &in, vec_t &out) { +void transpose2D1D(const tensor_t &in, vec_t &out) { size_t x = in.size(); size_t y = in[0].size(); assert(out.size() == x*y); @@ 
-279,18 +211,15 @@ inline void transpose2D1D(const tensor_t &in, vec_t &out) { } } -template -inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { +void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { for (size_t i = 0; i < y; i ++) { for (size_t j = 0; j < x; j ++) { out[i*x+j] = in[j*y+i]; } } } - -template -inline int argmax(const size_t n, const std::vector &x) { - DataTy max = x[0]; +int argmax(const size_t n, const vec_t &x) { + float_t max = x[0]; int max_ind = 0; for (size_t i = 1; i < n; i++) { if (x[i] > max) { @@ -301,72 +230,32 @@ inline int argmax(const size_t n, const std::vector &x) { return max_ind; } -inline void clear(vec_t &in) { +void clear(vec_t &in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; } -inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -template -inline void relu(const std::vector &in, std::vector &out) { +void relu(const vec_t &in, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); + out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); } } -template -inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { +void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); } } -inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { +void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } -inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { +void d_vadd(vec_t &in_diff, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) out_diff[i] = in_diff[i]; } -template -inline float reduce_mean(const std::vector &x) { +float reduce_mean(const vec_t &x) { size_t n = x.size(); assert(n > 0); float sum = (float)x[0]; @@ -376,51 
+265,83 @@ inline float reduce_mean(const std::vector &x) { return sum / (float)n; } -const float scale_ = 1. / (1. - dropout_rate); - -inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out) { assert(mask.size() == out.size()); //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers for (size_t i = 0; i < in.size(); ++i) mask[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; + out[i] = in[i] * mask[i] * scale; } -inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out) { for (size_t i = 0; i < in.size(); ++i) mask[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; + out[i] = in[i] * mask[i] * scale; } -inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale_; + out_diff[i] = in_diff[i] * mask[i] * scale; } -template -inline DataTy sigmoid_func(DataTy x) { +float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } // Sigmoid -template -inline void sigmoid(std::vector &fv) { +void sigmoid(vec_t &fv) { size_t count = fv.size(); for (size_t i = 0; i < count; ++i) { fv[i] = sigmoid_func(fv[i]); } } +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + // Softmax function takes an N-dimensional vector (X) of real number, // and transforms it into a vector of real number in range (0,1) which add upto 1. 
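For reference, the softmax/cross-entropy pair implemented below can be summarized by the following minimal, self-contained sketch. It uses plain std::vector<float> instead of the vec_t/float_t typedefs from this patch (an assumption made only for illustration) and already applies the max-subtraction trick that the next comments describe:

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax: shifting every logit by max(x) leaves the result
// unchanged, since exp(x_i - m) / sum_j exp(x_j - m) == exp(x_i) / sum_j exp(x_j),
// but keeps exp() from overflowing for large inputs.
std::vector<float> softmax_ref(const std::vector<float>& x) {
  const float m = *std::max_element(x.begin(), x.end());
  std::vector<float> p(x.size());
  float denom = 0.0f;
  for (std::size_t i = 0; i < x.size(); ++i) { p[i] = std::exp(x[i] - m); denom += p[i]; }
  for (std::size_t i = 0; i < x.size(); ++i) p[i] /= denom;
  return p;
}

// Cross-entropy against a one-hot label vector y; clamping p[i] at 1e-10 mirrors
// the log(1e-10) guard used in cross_entropy() below.
float cross_entropy_ref(const std::vector<float>& y, const std::vector<float>& p) {
  assert(y.size() == p.size());
  float loss = 0.0f;
  for (std::size_t i = 0; i < y.size(); ++i)
    if (y[i] != 0.0f) loss -= y[i] * std::log(std::max(p[i], 1e-10f));
  return loss;
}

When the two are composed, the gradient of the loss with respect to the logits collapses to p - y for one-hot y, which is one reason the kernel-fusion TODO in softmax_loss_layer is attractive: a fused backward pass can skip the per-sample Jacobian built in d_softmax.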
// To make softmax func numerically stable, we simply normalize the values in the vector, // by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) // exps = np.exp(X - np.max(X)) // exps / np.sum(exps) -template -inline void softmax(const std::vector &input, std::vector &output) { +void softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); for (size_t i = 0; i < input.size(); i++) { @@ -431,8 +352,7 @@ inline void softmax(const std::vector &input, std::vector &outpu output[i] /= denominator; } -template -inline void log_softmax(const std::vector &input, std::vector &output) { +void log_softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); for (size_t i = 0; i < input.size(); i++) @@ -445,38 +365,27 @@ inline void log_softmax(const std::vector &input, std::vector &o // we often use it as the final layer in neural networks. // For this we need to calculate the derivative or gradient, // and pass it back to the previous layer during backpropagation. -template -inline void d_softmax(const std::vector &y, const std::vector &p, - std::vector &dy, const std::vector &dp) { +void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { auto n = y.size(); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { - //DataTy delta_ij = i == j? 1 : 0; + //float_t delta_ij = i == j? 1 : 0; //df[i] += p[j] * (delta_ij - p[i]); df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; } // dy = dp * (gradient of softmax) dy[i] = dot(dp, df); } -/* - for (size_t j = 0; j < x.size(); j++) { - for (size_t k = 0; k < x.size(); k++) { - df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; - } - dx[j] = vectorize::dot(&dy[0], &df[0], len); - } -*/ } // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -template -inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { +float_t cross_entropy(const vec_t &y, const vec_t &p) { auto n = y.size(); assert(n > 0); - DataTy loss = 0.0; + float_t loss = 0.0; for (size_t i = 0; i < n; i++) { if (y[i] == float_t(0)) continue; if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); @@ -487,8 +396,7 @@ inline DataTy cross_entropy(const std::vector &y, const std::vector -inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { +void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { auto n = y.size(); //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); for (size_t i = 0; i < n; i++) { @@ -497,4 +405,3 @@ inline void d_cross_entropy(const std::vector &y, const std::vector 0 ? in[index] : 0; + } +} + +__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + } +} + +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +} + +void scal_gpu(const int N, const float alpha, float *X) { + CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); +} + +void dot_gpu(const int n, const float* x, const float* y, float* out) { + CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); +} + +void asum_gpu(const int n, const float* x, float* y) { + CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); +} + +void scale_gpu(const int n, const float alpha, const float *x, float* y) { + CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); +} + +__global__ void set_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = alpha; + } +} + +void set_gpu(const int N, const float_t alpha, float_t* Y) { + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + return; + } + set_kernel<<>>(N, alpha, Y); +} + +__global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] += alpha; + } +} + +void add_scalar_gpu(const int N, const float alpha, float* Y) { + add_scalar_kernel<<>>(N, alpha, Y); +} + +__global__ void add_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { + CUDA_KERNEL_LOOP(index, n) { + y[index] = a[index] + b[index]; + } +} + +void add_gpu(const int N, const float* a, const float* b, float* y) { + add_kernel<<>>(N, a, b, y); +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index c03a5c6676..9e2597dffb 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -3,6 +3,6 @@ include_directories(BEFORE ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) -include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois) +include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index f1a65740f7..05484252b8 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,16 +1,8 @@ -SET(USE_BLAS ON CACHE BOOL "Use blas") - -SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) - -if (USE_BLAS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS") - include_directories(${BLAS_INC}) - link_directories(${BLAS_LIB}) -endif() +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) app(gcn gcn.cpp) +target_link_libraries(gcn deepgalois) -if (USE_BLAS) - target_link_libraries(gcn -lopenblas) -endif() diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index efbb862fd7..cbf3c1ae2a 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -24,7 +24,6 @@ static cll::opt early_stopping("es", cll::desc("Tolerance for early stopp static cll::opt max_degree("md", 
cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 55f96412a8bdebddf572625a22d4e5f6c305c13f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 09:51:17 -0600 Subject: [PATCH 004/660] add cuda in cmake --- libdeepgalois/CMakeLists.txt | 37 ++++++---- libdeepgalois/include/aggregator.h | 6 ++ libdeepgalois/include/common.h | 26 +++++-- libdeepgalois/include/cutils.h | 14 ++-- libdeepgalois/include/gtypes.h | 15 ++++ .../include/layers/graph_conv_layer.h | 72 +++---------------- libdeepgalois/include/layers/layer.h | 10 +-- libdeepgalois/include/layers/relu_layer.h | 2 + .../include/layers/softmax_loss_layer.h | 2 + .../{math_functions.hpp => math_functions.hh} | 27 ++++--- libdeepgalois/include/net.h | 8 +-- libdeepgalois/include/node.h | 55 +++----------- libdeepgalois/include/types.h | 13 +--- libdeepgalois/include/utils.h | 1 + libdeepgalois/src/aggregator.cpp | 40 +++++++++++ libdeepgalois/src/aggregator.cu | 7 ++ libdeepgalois/src/layers/graph_conv_layer.cpp | 54 ++++++++++++++ libdeepgalois/src/layers/relu_layer.cpp | 16 ++++- .../src/layers/softmax_loss_layer.cpp | 3 + libdeepgalois/src/math_functions.cpp | 43 ++--------- libdeepgalois/src/math_functions.cu | 60 +++++++++++----- 21 files changed, 285 insertions(+), 226 deletions(-) create mode 100644 libdeepgalois/include/aggregator.h create mode 100644 libdeepgalois/include/gtypes.h rename libdeepgalois/include/{math_functions.hpp => math_functions.hh} (58%) create mode 100644 libdeepgalois/src/aggregator.cpp create mode 100644 libdeepgalois/src/aggregator.cu create mode 100644 libdeepgalois/src/layers/graph_conv_layer.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 4f51532898..7e558221f6 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -3,23 +3,15 @@ SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) -#SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -#SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) -#SET(ENABLE_GPU OFF CACHE BOOL "Use GPU for DeepGalois") -#if (ENABLE_GPU) -# target_compile_definitions(distbench PRIVATE __GALOIS_HET_CUDA__=1) -# set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DENABLE_GPU") -# include_directories(${CUDA_INC}) -# link_directories(${CUDA_LIB}) -#endif() +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) +include_directories(${CUDA_INC}) +link_directories(${CUDA_LIB}) +link_directories(${CMAKE_SOURCE_DIR}/libgpu) #set(sources # src/layers/relu_layer.cu #) -#cuda_add_library(deepgalois_gpu -# ${sources} -#OPTIONS -D_FORCE_INLINES -#) #target_include_directories(deepgalois_gpu PUBLIC # ${CMAKE_SOURCE_DIR}/libgpu/include #) @@ -27,18 +19,33 @@ link_directories(${OPENBLAS_LIB}) # INTERFACE_POSITION_INDEPENDENT_CODE On # POSITION_INDEPENDENT_CODE On #) -#target_link_libraries(deepgalois -lcudart -lcublas) +cmake_minimum_required(VERSION 2.8) +find_package(CUDA REQUIRED) +set(CUDA_SEPARABLE_COMPILATION ON) +set(CUDA_PROPAGATE_HOST_FLAGS OFF) +#set(CUDA_HOST_COMPILER g++) +list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") 
+include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +file(GLOB CUDA_FILES "src/" *.cu) +#CUDA_COMPILE(CU_O src/math_functions.cu) +CUDA_COMPILE(CU_O ${CUDA_FILES}) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/relu_layer.cpp + src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/math_functions.cpp + src/aggregator.cpp + ${CU_O} ) add_library(deepgalois STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem gllvm) +target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) target_link_libraries(deepgalois -lopenblas) +target_link_libraries(deepgalois -lcudart -lcublas) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h new file mode 100644 index 0000000000..4e178d89b8 --- /dev/null +++ b/libdeepgalois/include/aggregator.h @@ -0,0 +1,6 @@ +#pragma once +#include "types.h" +#include "gtypes.h" + +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); diff --git a/libdeepgalois/include/common.h b/libdeepgalois/include/common.h index 0c3023c3f2..f942fd106c 100644 --- a/libdeepgalois/include/common.h +++ b/libdeepgalois/include/common.h @@ -2,6 +2,7 @@ #include "types.h" #include "utils.h" #include "cutils.h" +//#include "random.h" class DeepGalois { public: @@ -28,17 +29,17 @@ class DeepGalois { if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); + //CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + //CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); } static void DeviceQuery() {} static bool CheckDevice(const int device_id) { return true; } static int FindDevice(const int start_id = 0) { return 0; } protected: - cublasHandle_t cublas_handle_; - curandGenerator_t curand_generator_; - shared_ptr random_generator_; + cublasHandle_t cublas_handle_; // used to call cuBLAS + curandGenerator_t curand_generator_; // used to generate random numbers on GPU + //shared_ptr random_generator_; Brew mode_; // Parallel training int solver_count_; @@ -47,6 +48,19 @@ class DeepGalois { private: // The private constructor to avoid duplicate instantiation. - DeepGalois(); + DeepGalois() : cublas_handle_(NULL), curand_generator_(NULL), + //random_generator_(NULL), mode_(DeepGalois::CPU), + mode_(DeepGalois::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. 
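+  // (As with the cuBLAS handle above, curandCreateGenerator and
+  //  curandSetPseudoRandomGeneratorSeed return a curandStatus_t to compare against
+  //  CURAND_STATUS_SUCCESS; a failure here only means the GPU RNG is unavailable,
+  //  so CPU-only runs can still proceed.)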
+ //if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + // curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + // std::cout << "Cannot create Curand generator. Curand won't be available."; + } }; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index cda8d23cba..8a0fcaa3a1 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -17,21 +17,23 @@ inline int CUDA_GET_BLOCKS(const int N) { #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ - CHECK_EQ(error, cudaSuccess) << " " << cudaGetErrorString(error); \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) #define CUBLAS_CHECK(condition) \ do { \ - cublasStatus_t status = condition; \ - CHECK_EQ(status, CUBLAS_STATUS_SUCCESS) << " " \ - << caffe::cublasGetErrorString(status); \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) \ + ; \ } while (0) #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ - CHECK_EQ(status, CURAND_STATUS_SUCCESS) << " " \ - << caffe::curandGetErrorString(status); \ } while (0) // CUDA: grid stride looping diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/gtypes.h new file mode 100644 index 0000000000..a30468b0f9 --- /dev/null +++ b/libdeepgalois/include/gtypes.h @@ -0,0 +1,15 @@ +#pragma once +#include "galois/Galois.h" +#include "galois/graphs/LCGraph.h" + +typedef galois::GAccumulator AccumF; +typedef galois::GAccumulator AccumU; + +#ifdef EDGE_LABEL +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#else +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +#endif + +typedef Graph::GraphNode GNode; + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 2e304a0c98..0633ec63e9 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,5 +1,6 @@ #pragma once #include "layer.h" +#include "gtypes.h" /* GraphConv Layer Parameters @@ -56,12 +57,15 @@ class graph_conv_layer: public layer { std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } std::string layer_type() const override { return std::string("graph_conv"); } - + void set_context(net_phase ctx) override { phase_ = ctx; } + virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function - void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } - + virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); // user-defined combine function - void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { + virtual void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { vec_t a(out.size(), 0); vec_t b(out.size(), 0); 
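    // combine() keeps the two terms explicit: a = mat_v * self (the vertex's own
    // embedding) and b = mat_u * neighbors (the aggregated neighbor embedding),
    // then out = a + b.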
mvmul(mat_v, self, a); @@ -69,66 +73,6 @@ class graph_conv_layer: public layer { vadd(a, b, out); // out = W*self + Q*neighbors } - void set_context(net_phase ctx) override { phase_ = ctx; } - - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - // input: x*y; W: y*z; output: x*z - // if y > z: - // mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - //Timer t_matmul, t_agg, t_dropout; - //t_matmul.Start(); - if (dropout_ && phase_ == net_phase::train) { - //for (size_t i = 0; i < x; ++i) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - //t_matmul.Stop(); - //t_agg.Start(); - aggregate(graph, out_temp, out_data); // aggregate - //t_agg.Stop(); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); - }, galois::loopname("relu")); - } - //double dropout_time = 0; - //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); - //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() - // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; - } - - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - if (act_) { - //for (size_t j = 0; j < z; ++j) - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) - //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::loopname("d_relu")); - //} else out_temp = out_grad; // TODO: avoid copying - } else copy2D1D(out_grad, out_temp); - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - - // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z - } - void degree_counting() { assert(x == graph->size()); degrees.resize(x); diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 4a8a545738..076253fe82 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -15,8 +16,9 @@ #include "../node.h" #include "../types.h" #include "../utils.h" +#include "../gtypes.h" #include "../optimizer.h" -#include "../math_functions.hpp" +#include "../math_functions.hh" /** * base class of all kind of NN layers * @@ -36,11 +38,11 @@ class layer : public node { input_dims(in_dims), output_dims(out_dims) { add_edge(); } virtual ~layer() = default; virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; + virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; virtual void set_context(net_phase ctx) {} - //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h index c4acdd50ac..285e09b472 100644 --- a/libdeepgalois/include/layers/relu_layer.h +++ b/libdeepgalois/include/layers/relu_layer.h @@ -11,5 +11,7 @@ class relu_layer : public layer { ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 6375f72121..236fd35118 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -10,7 +10,9 @@ class 
softmax_loss_layer: public layer { ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); private: LabelList *labels; diff --git a/libdeepgalois/include/math_functions.hpp b/libdeepgalois/include/math_functions.hh similarity index 58% rename from libdeepgalois/include/math_functions.hpp rename to libdeepgalois/include/math_functions.hh index d3d08b10b2..86363f4ba3 100644 --- a/libdeepgalois/include/math_functions.hpp +++ b/libdeepgalois/include/math_functions.hh @@ -1,12 +1,14 @@ #ifndef _MATH_FUNCTIONS_ #define _MATH_FUNCTIONS_ #include +#include +#include +#include #include "types.h" -#include const float negative_slope = 0; -void vadd(const vec_t &a, const vec_t &b, vec_t &out); +void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); void vsub(const vec_t &a, const vec_t &b, vec_t &out); void vmul(const vec_t &a, const vec_t &b, vec_t &out); @@ -22,22 +24,29 @@ void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); void copy2D1D(const tensor_t &in, vec_t &out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); -int argmax(const size_t n, const vec_t &x); +int argmax(const size_t n, const vec_t &x); // the arguments of the maxima void clear(vec_t &in); -void relu(const vec_t &in, vec_t &out); -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); +void relu(const vec_t &in, vec_t &out); // ReLU +void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative void softmax(const vec_t &input, vec_t &output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); float_t cross_entropy(const vec_t &y, const vec_t &p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); +void vadd_gpu(const size_t n, const float_t *a, const float_t 
*b, float_t *out); // vector add +void relu_gpu(const size_t n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const size_t n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout +void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply +int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima + #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index f845eed82e..8b51b6d350 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,7 +2,7 @@ #define _MODEL_H_ #include -#include "galois/Galois.h" +#include "gtypes.h" #include "galois/Timer.h" #include "lgraph.h" #include "layers.h" @@ -18,12 +18,6 @@ class Net { public: Net() {} - // user-defined aggregate function - virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} - - // user-defined combine function - virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} - void init() { read_graph(dataset, g); n = g.size(); // N diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 1a50080934..eec041e0e1 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -13,17 +13,11 @@ class node : public std::enable_shared_from_this { node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } - //const std::vector &prev() const { return prev_; } const edgeptr_t next() const { return next_; } - //const std::vector &next() const { return next_; } - //std::vector prev_nodes() const; - //std::vector next_nodes() const; protected: node() = delete; friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - //mutable std::vector prev_; - //mutable std::vector next_; mutable edgeptr_t prev_; mutable edgeptr_t next_; }; @@ -46,8 +40,7 @@ class edge { std::copy(grad_head.begin(), grad_head.end(), pdst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) - pdst[i] += grad_[sample][i]; + for (size_t i = 0; i < sz; i++) pdst[i] += grad_[sample][i]; //vectorize::reduce(&grad_[sample][0], sz, pdst); } } @@ -61,50 +54,24 @@ class edge { tensor_t *get_data_ptr() { return &data_; } tensor_t &get_data() { return data_; } - //const tensor_t *get_data() const { return &data_; } const tensor_t &get_data() const { return data_; } - //tensor_t *get_gradient() { return &grad_; } tensor_t &get_gradient() { return grad_; } - //const tensor_t *get_gradient() const { return &grad_; } const tensor_t &get_gradient() const { return grad_; } + float_t *get_gpu_data() const { return gpu_data_; } + float_t *get_gpu_gradient() { return gpu_grad_; } - //const std::vector &next() const { return next_; } const node *next() const { return next_; } node *prev() { return prev_; } const node *prev() const { return prev_; } - //const shape3d &shape() const { return shape_; } - //vector_type vtype() const { return vtype_; } - //void add_next_node(node *next) { next_.push_back(next); } void add_next_node(node 
*next) { next_ = next; } + private: - //shape3d shape_; - size_t ft_dim_; - //vector_type vtype_; - tensor_t data_; - tensor_t grad_; - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor - //std::vector next_; // next nodes, "consumers" of this tensor + size_t ft_dim_; // feature dimensions + tensor_t data_; // feature vectors on CPU + tensor_t grad_; // gradients on CPU + float_t *gpu_data_; // feature vectors on GPU + float_t *gpu_grad_; // gradients on CPU + node *prev_; // previous node, "producer" of this tensor + node *next_; // next node, "consumer" of this tensor }; -/* -inline std::vector node::prev_nodes() const { - std::vector vecs; - for (auto &e : prev_) { - if (e && e->prev()) { - vecs.insert(vecs.end(), e->prev()); - } - } - return vecs; -} -inline std::vector node::next_nodes() const { - std::vector vecs; - for (auto &e : next_) { - if (e) { - auto n = e->next(); - vecs.insert(vecs.end(), n.begin(), n.end()); - } - } - return vecs; -} -*/ diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 0aa80cce4f..8d78e03d48 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -1,8 +1,7 @@ #ifndef TYPES_H #define TYPES_H #include -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" +#include #ifdef CNN_USE_DOUBLE typedef double float_t; @@ -20,16 +19,6 @@ typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test typedef std::vector LabelList; // label list to store label for each vertex typedef std::vector MaskList; // mask list to store mask for each vertex -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; - -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#endif - -typedef Graph::GraphNode GNode; #define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index ceb49b0e41..caf27c56a3 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp new file mode 100644 index 0000000000..4b3f7cbab6 --- /dev/null +++ b/libdeepgalois/src/aggregator.cpp @@ -0,0 +1,40 @@ +#include "aggregator.h" +#include "math_functions.hh" + +void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); // TODO: vectorize clear + float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor = in[dst]; + mul_scalar(b, neighbor); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + +void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + size_t len = out[0].size(); + galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + clear(out[src]); + 
float_t a = 0.0, b = 0.0; + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst*len], neighbor.data()); + vadd(out[src], neighbor, out[src]); // out[src] += in[dst] + } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + } + }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +} + diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu new file mode 100644 index 0000000000..1cc93e6866 --- /dev/null +++ b/libdeepgalois/src/aggregator.cu @@ -0,0 +1,7 @@ +#include "csr_graph.h" +#include "aggregator.h" +#include "math_functions.hh" + +void update_all(CSRGraph *g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +} + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp new file mode 100644 index 0000000000..863b5df73c --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -0,0 +1,54 @@ +#include "layers/graph_conv_layer.h" +#include "aggregator.h" + +void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { + update_all(g, in, out, true, norm_factor); +} + +// ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) +void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); + }, galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z + } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z + aggregate(graph, out_temp, out_data); // aggregate + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(out_data[i], out_data[i]); + }, galois::loopname("relu")); + } +} + +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } + +// ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ +void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? + out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); + }, galois::loopname("d_relu")); + } else copy2D1D(out_grad, out_temp); // TODO: avoid copying + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z*y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); + }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); + } + } + // calculate weight gradients + transpose2D1D(in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z +} + +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } + diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index ccabc8a090..eb02f66d50 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -8,12 +8,24 @@ void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); } +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) { +void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { for (size_t j = 0; j < input_dims[1]; ++j) in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); } + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 61f63f6f0e..22a9d1a83c 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -21,6 +21,8 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } + void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -32,3 +34,4 @@ void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_ }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index a4d1d77719..1e3e0e1d79 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,7 @@ -#include "math_functions.hpp" +#include "math_functions.hh" #include "utils.h" +#include "galois/Timer.h" +#include extern "C" { #include @@ -242,7 +244,7 @@ void relu(const vec_t &in, vec_t &out) { void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); } } @@ -298,43 +300,6 @@ void sigmoid(vec_t &fv) { } } -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - 
vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - // Softmax function takes an N-dimensional vector (X) of real number, // and transforms it into a vector of real number in range (0,1) which add upto 1. // To make softmax func numerically stable, we simply normalize the values in the vector, diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 7e96afc0c3..0179c46d56 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,19 +1,32 @@ -#pragma once -#include "cutils.h" +#include "math_functions.hh" +#include "common.h" + +extern "C" { +#include +//#include +} // flattern data into 1D before feed into the ReLU operater -__global__ void relu_gpu(const int n, const float_t* in, float_t* out) { +__global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -__global__ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* in_data, float_t* out_diff) { +void relu_gpu(const int n, const float_t *in_data, float_t* out_data) { + relu_kernel<<>>(n, in_data, out_data); +} + +__global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = in_data[index] > 0 ? in_diff[index] : 0; + out_diff[index] = data[index] > 0 ? in_diff[index] : 0; } } +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { + d_relu_kernel<<>>(n, in_diff, data, out_diff); +} + void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -22,30 +35,41 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(DeepGalois::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +} + +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +} + +// the arguments of the maxima +int argmax_gpu(const size_t n, const float_t *x) { + return 0; } void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(DeepGalois::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Caffe::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(DeepGalois::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Caffe::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(DeepGalois::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(Caffe::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Caffe::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(DeepGalois::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -59,7 +83,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); return; } - set_kernel<<>>(N, alpha, Y); + set_kernel<<>>(N, alpha, Y); } __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { @@ -68,17 +92,17 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) } } -void add_scalar_gpu(const int N, const float alpha, float* Y) { - add_scalar_kernel<<>>(N, alpha, Y); +void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(N, alpha, Y); } -__global__ void add_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void add_gpu(const int N, const float* a, const float* b, float* y) { - add_kernel<<>>(N, a, b, y); +void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(N, a, b, y); } From 1f9627f32c1a93865f4ea1c3d355c47515f1c686 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 12:56:53 -0600 Subject: [PATCH 005/660] update CMakeLists --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1b0489c10..d25c2764a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -504,6 +504,10 @@ endif(USE_PANGOLIN) if(USE_DEEPGALOIS) add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) + cuda_include_directories("${CUB_ROOT}") + cuda_include_directories("${MGPU_ROOT}/src") + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) From b9c8b80566d55f6e9e315abd7fda2610f372bcce Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 21 Feb 2020 13:34:40 -0600 Subject: [PATCH 006/660] fix CMakeLists --- CMakeLists.txt | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d25c2764a4..79555a0b31 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,11 +502,26 @@ if(USE_PANGOLIN) add_subdirectory(lonestarmine) endif(USE_PANGOLIN) 
if(USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_SEPARABLE_COMPILATION on) + set(CUDA_HOST_COMPILER g++) + string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + list(APPEND CUDA_NVCC_FLAGS "-std=c++11") + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers cuda_include_directories("${CUB_ROOT}") + link_directories(${CMAKE_SOURCE_DIR}/cub) + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers cuda_include_directories("${MGPU_ROOT}/src") - cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + link_directories(${CMAKE_SOURCE_DIR}/moderngpu/src) add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) From d7801b8ae670ce8935d1d6c5a308073616f5a368 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 10:01:06 -0600 Subject: [PATCH 007/660] add CPU_ONLY flag --- libdeepgalois/CMakeLists.txt | 59 ++++++++++-------- libdeepgalois/include/aggregator.h | 11 +++- .../include/layers/graph_conv_layer.h | 60 +++++++----------- libdeepgalois/include/net.h | 53 ++++++---------- libdeepgalois/src/aggregator.cu | 3 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 61 +++++++++++++++++-- 6 files changed, 139 insertions(+), 108 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7e558221f6..0b0be6217c 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,35 +1,40 @@ +cmake_minimum_required(VERSION 2.8) + SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers +set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) +link_directories(${CMAKE_SOURCE_DIR}/libgalois) -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) -SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) -include_directories(${CUDA_INC}) -link_directories(${CUDA_LIB}) -link_directories(${CMAKE_SOURCE_DIR}/libgpu) - -#set(sources -# src/layers/relu_layer.cu -#) -#target_include_directories(deepgalois_gpu PUBLIC -# ${CMAKE_SOURCE_DIR}/libgpu/include -#) -#set_target_properties(deepgalois_gpu PROPERTIES -# INTERFACE_POSITION_INDEPENDENT_CODE On -# POSITION_INDEPENDENT_CODE On -#) -cmake_minimum_required(VERSION 2.8) -find_package(CUDA REQUIRED) -set(CUDA_SEPARABLE_COMPILATION ON) -set(CUDA_PROPAGATE_HOST_FLAGS OFF) -#set(CUDA_HOST_COMPILER g++) -list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") -include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) -file(GLOB CUDA_FILES "src/" *.cu) -#CUDA_COMPILE(CU_O src/math_functions.cu) -CUDA_COMPILE(CU_O ${CUDA_FILES}) +#deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) +set(CPU_ONLY ON CACHE BOOL "Build DeepGalois without CUDA support") +if(CPU_ONLY) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +else() + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) + SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) + 
include_directories(${CUDA_INC}) + link_directories(${CUDA_LIB}) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) + find_package(CUDA REQUIRED) + set(CUDA_SEPARABLE_COMPILATION ON) + set(CUDA_PROPAGATE_HOST_FLAGS OFF) + set(CUDA_HOST_COMPILER g++) + #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") + #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") + include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) + #include_directories(${CUB_ROOT}) + #include_directories(${MGPU_ROOT}/src) + cuda_include_directories("${CUB_ROOT}") + cuda_include_directories("${MGPU_ROOT}/src") + file(GLOB CUDA_FILES "src/" *.cu) + CUDA_COMPILE(CU_O src/math_functions.cu src/aggregator.cu) + #CUDA_COMPILE(CU_O ${CUDA_FILES}) +endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 4e178d89b8..a071781f54 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -1,6 +1,15 @@ #pragma once #include "types.h" +#ifdef CPU_ONLY #include "gtypes.h" - void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +#else +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#define TB_SIZE 256 +#define WARP_SIZE 32 +void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +#endif + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 0633ec63e9..44dfd197af 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,6 +1,6 @@ #pragma once #include "layer.h" -#include "gtypes.h" +#include "aggregator.h" /* GraphConv Layer Parameters @@ -16,22 +16,15 @@ */ class graph_conv_layer: public layer { public: +#ifdef CPU_ONLY graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; - init(); - scale_ = 1. / (1. - dropout_rate_); - } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + float dropout_rate, std::vector in_dims, std::vector out_dims); +#else + graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, + float dropout_rate, std::vector in_dims, std::vector out_dims); +#endif + graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : + graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; @@ -47,11 +40,8 @@ class graph_conv_layer: public layer { for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); } in_temp.resize(x*y); - //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); trans_data.resize(y*x); // y*x - //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); if (norm_) norm_factor_counting(); t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -59,20 +49,21 @@ class graph_conv_layer: public layer { std::string layer_type() const override { return std::string("graph_conv"); } void set_context(net_phase ctx) override { phase_ = ctx; } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function +#ifdef CPU_ONLY virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); +#else + virtual void aggregate(CSRGraph g, const float_t *in, float_t *out); +#endif // user-defined combine function - virtual void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(mat_v, self, a); - mvmul(mat_u, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors - } + virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); + // user-defined pre-computing function, called during initialization + virtual void norm_factor_counting(); +protected: void degree_counting() { assert(x == graph->size()); degrees.resize(x); @@ -81,19 +72,12 @@ class graph_conv_layer: public layer { }, galois::loopname("DegreeCounting")); } - // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v - void norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); - } - private: +#ifdef CPU_ONLY Graph *graph; +#else + CSRGraph graph_gpu; +#endif bool act_; // whether to use activation function at the end bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 8b51b6d350..288e3aac3b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -19,8 +19,7 @@ class Net { Net() {} void init() { - read_graph(dataset, g); - n = g.size(); // N + n = read_graph_cpu(dataset, graph_cpu); labels.resize(n, 0); // label for each vertex: N x 1 num_classes = read_labels(dataset, labels); @@ -50,7 +49,6 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } size_t get_nnodes() { return n; } - size_t get_nedges() { return g.sizeEdges(); } size_t get_ft_dim() { return feature_dims[0]; } size_t get_nclasses() { return num_classes; } size_t 
get_label(size_t i) { return labels[i]; } @@ -79,7 +77,11 @@ class Net { in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#ifdef CPU_ONLY + layers[layer_id] = new graph_conv_layer(layer_id, &graph_cpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#else + layers[layer_id] = new graph_conv_layer(layer_id, &graph_gpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } @@ -177,8 +179,11 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 std::vector feature_dims; // feature dimnesions for each layer - - Graph g; // the input graph, |V| = N +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N +#else + CSRGraph graph_gpu; // the input graph, |V| = N +#endif tensor_t input_features; // input features: N x D std::vector labels; // labels for classification: N x 1 MaskList train_mask, val_mask; // masks for traning and validation @@ -265,42 +270,23 @@ class Net { return n; } - unsigned read_graph(std::string dataset_str, Graph &graph) { - //printf("Start readGraph\n"); + size_t read_graph_cpu(std::string dataset_str, Graph &graph) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); LGraph lgraph; - unsigned max_degree = 0; if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); lgraph.read_edgelist(filename.c_str(), true); //symmetrize genGraph(lgraph, graph); + lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); galois::graphs::readGraph(graph, filename); - /* - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - graph.getData(vid) = 1; - //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); - std::vector degrees(graph.size()); - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); - }, galois::loopname("computeMaxDegree")); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - */ } else { printf("Unkown file format\n"); exit(1); } - if (filetype != "gr") { - max_degree = lgraph.get_max_degree(); - lgraph.clean(); - } - printf("max degree = %u\n", max_degree); Tread.stop(); - //printf("Done readGraph\n"); - std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; - return max_degree; + return graph.size(); } void genGraph(LGraph &lg, Graph &g) { @@ -312,23 +298,20 @@ class Net { auto row_end = lg.get_offset(i+1); g.fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now + g.constructEdge(offset, lg.get_dest(offset), 0); } } + // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { - // comparing outputs with the ground truth (labels) - //acc_t accuracy_all = 0.0; AccumF accuracy_all; accuracy_all.reset(); - //for (size_t i = begin; i < end; i++) { galois::do_all(galois::iterate(begin, end), [&](const auto& 
i) { if (masks[i] == 1) { - int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)prediction == labels[i]) accuracy_all += 1.0; + int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + if ((label_t)preds == labels[i]) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} return accuracy_all.reduce() / (acc_t)count; } }; diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 1cc93e6866..44a3e59d2d 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,7 +1,6 @@ -#include "csr_graph.h" #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph *g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 863b5df73c..71d5f18f5b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,10 +1,43 @@ #include "layers/graph_conv_layer.h" -#include "aggregator.h" void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } +// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v +void graph_conv_layer::norm_factor_counting() { + degree_counting(); + norm_factor.resize(x); + galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + +void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(Q, self, a); + mvmul(W, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors +} + +#ifdef CPU_ONLY +graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, + bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), + bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. 
- dropout_rate_); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { // input: x*y; W: y*z; output: x*z @@ -24,8 +57,6 @@ void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &ou } } -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { if (act_) { @@ -49,6 +80,26 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t transpose2D1D(in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } +#else +graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, + bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : + layer(level, in_dims, out_dims), graph_gpu(*g), act_(act), norm_(norm), + bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. - dropout_rate_); +} + +// GPU forward +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } - +// GPU backward +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +} +#endif From 6af133b1cac0d57566d4f5f101ea12eeb73bac87 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 11:32:14 -0600 Subject: [PATCH 008/660] fix bug --- libdeepgalois/CMakeLists.txt | 24 +++++++++---------- libdeepgalois/src/layers/graph_conv_layer.cpp | 8 ++++++- lonestargnn/gcn/CMakeLists.txt | 3 +++ 3 files changed, 21 insertions(+), 14 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 0b0be6217c..3519694c57 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -4,35 +4,33 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) include_directories(${OPENBLAS_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) #deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) -set(CPU_ONLY ON CACHE BOOL "Build DeepGalois without CUDA support") -if(CPU_ONLY) +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() - SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) - SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) - include_directories(${CUDA_INC}) - link_directories(${CUDA_LIB}) - link_directories(${CMAKE_SOURCE_DIR}/libgpu) find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) 
set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) - include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) - include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) - #include_directories(${CUB_ROOT}) - #include_directories(${MGPU_ROOT}/src) + cuda_include_directories(${CUDA_INC}) cuda_include_directories("${CUB_ROOT}") cuda_include_directories("${MGPU_ROOT}/src") + cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + link_directories(${CUDA_LIB}) + link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) - CUDA_COMPILE(CU_O src/math_functions.cu src/aggregator.cu) + cuda_cpmpile(CU_O src/math_functions.cu src/aggregator.cu) #CUDA_COMPILE(CU_O ${CUDA_FILES}) endif() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 71d5f18f5b..98e9e14211 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -80,6 +80,9 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t transpose2D1D(in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } + +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : @@ -94,7 +97,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool n init(); scale_ = 1. / (1. 
- dropout_rate_); } - + +void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} +void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} + // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 05484252b8..ae1d2dff4b 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -2,6 +2,9 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) +if(USE_CPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +endif() app(gcn gcn.cpp) target_link_libraries(gcn deepgalois) From e4535e3ab843f57a1f9b38369b96b64edd7ef021 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 15:29:22 -0600 Subject: [PATCH 009/660] add context --- libdeepgalois/CMakeLists.txt | 8 +- libdeepgalois/include/aggregator.h | 6 +- libdeepgalois/include/common.h | 66 ------ libdeepgalois/include/context.h | 64 ++++++ .../include/layers/graph_conv_layer.h | 56 +---- libdeepgalois/include/layers/layer.h | 5 +- .../include/layers/softmax_loss_layer.h | 8 +- libdeepgalois/include/net.h | 212 ++---------------- libdeepgalois/src/aggregator.cpp | 16 +- libdeepgalois/src/context.cpp | 156 +++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 60 +++-- .../src/layers/softmax_loss_layer.cpp | 12 +- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/net.cpp | 107 +++++++++ lonestargnn/CMakeLists.txt | 10 + lonestargnn/gcn/CMakeLists.txt | 8 - lonestargnn/gcn/gcn.cpp | 6 +- 17 files changed, 423 insertions(+), 379 deletions(-) delete mode 100644 libdeepgalois/include/common.h create mode 100644 libdeepgalois/include/context.h create mode 100644 libdeepgalois/src/context.cpp create mode 100644 libdeepgalois/src/net.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3519694c57..514af263d4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -9,6 +9,7 @@ SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/) include_directories(${OPENBLAS_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) +include_directories(${CUDA_INC}) link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) @@ -23,14 +24,13 @@ else() set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - cuda_include_directories(${CUDA_INC}) cuda_include_directories("${CUB_ROOT}") cuda_include_directories("${MGPU_ROOT}/src") cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) - cuda_cpmpile(CU_O src/math_functions.cu src/aggregator.cu) + cuda_compile(CU_O src/math_functions.cu src/aggregator.cu) #CUDA_COMPILE(CU_O ${CUDA_FILES}) endif() @@ -41,6 +41,8 @@ set(sources src/layers/softmax_loss_layer.cpp src/math_functions.cpp src/aggregator.cpp + src/context.cpp + src/net.cpp ${CU_O} ) add_library(deepgalois STATIC ${sources}) @@ -48,7 +50,7 @@ add_library(deepgalois STATIC ${sources}) target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) 
target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) target_link_libraries(deepgalois -lopenblas) -target_link_libraries(deepgalois -lcudart -lcublas) +target_link_libraries(deepgalois -lcudart -lcublas -lcurand) target_include_directories(deepgalois PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index a071781f54..6fb4ec8d41 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,14 +2,14 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); #else #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" #define TB_SIZE 256 #define WARP_SIZE 32 -void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/common.h b/libdeepgalois/include/common.h deleted file mode 100644 index f942fd106c..0000000000 --- a/libdeepgalois/include/common.h +++ /dev/null @@ -1,66 +0,0 @@ -#pragma once -#include "types.h" -#include "utils.h" -#include "cutils.h" -//#include "random.h" - -class DeepGalois { -public: - ~DeepGalois(); - enum Brew { CPU, GPU }; - static DeepGalois& Get() { - } - inline static cublasHandle_t cublas_handle() { return Get().cublas_handle_; } - inline static curandGenerator_t curand_generator() { return Get().curand_generator_; } - inline static Brew mode() { return Get().mode_; } - inline static void set_mode(Brew mode) { Get().mode_ = mode; } - inline static int solver_count() { return Get().solver_count_; } - inline static void set_solver_count(int val) { Get().solver_count_ = val; } - inline static int solver_rank() { return Get().solver_rank_; } - inline static void set_solver_rank(int val) { Get().solver_rank_ = val; } - inline static bool multiprocess() { return Get().multiprocess_; } - inline static void set_multiprocess(bool val) { Get().multiprocess_ = val; } - inline static bool root_solver() { return Get().solver_rank_ == 0; } - static void SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (Get().cublas_handle_) CUBLAS_CHECK(cublasDestroy(Get().cublas_handle_)); - if (Get().curand_generator_) CURAND_CHECK(curandDestroyGenerator(Get().curand_generator_)); - CUBLAS_CHECK(cublasCreate(&Get().cublas_handle_)); - //CURAND_CHECK(curandCreateGenerator(&Get().curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - //CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(Get().curand_generator_, cluster_seedgen())); - } - static void DeviceQuery() {} - static bool CheckDevice(const int device_id) { return true; } - static int FindDevice(const int start_id = 0) { return 0; } - -protected: - cublasHandle_t cublas_handle_; // used to call cuBLAS - curandGenerator_t curand_generator_; // used to generate random numbers on GPU - //shared_ptr random_generator_; - Brew mode_; - // Parallel training - int solver_count_; - int solver_rank_; - bool multiprocess_; - -private: - // The 
private constructor to avoid duplicate instantiation. - DeepGalois() : cublas_handle_(NULL), curand_generator_(NULL), - //random_generator_(NULL), mode_(DeepGalois::CPU), - mode_(DeepGalois::CPU), - solver_count_(1), solver_rank_(0), multiprocess_(false) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - //if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - // curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - // std::cout << "Cannot create Curand generator. Curand won't be available."; - } -}; - diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h new file mode 100644 index 0000000000..1fc8b6ffc4 --- /dev/null +++ b/libdeepgalois/include/context.h @@ -0,0 +1,64 @@ +#pragma once +#include +#include +#include "types.h" +#include "utils.h" +#include "lgraph.h" +#include "gtypes.h" +#include "cutils.h" +//#include "random.h" + +class Context { +public: + Context(); + ~Context(); + enum Brew { CPU, GPU }; + //static Context& Get(); + cublasHandle_t cublas_handle() { return cublas_handle_; } + curandGenerator_t curand_generator() { return curand_generator_; } + Brew mode() { return mode_; } + void set_mode(Brew mode) { mode_ = mode; } + int solver_count() { return solver_count_; } + void set_solver_count(int val) { solver_count_ = val; } + int solver_rank() { return solver_rank_; } + void set_solver_rank(int val) { solver_rank_ = val; } + bool multiprocess() { return multiprocess_; } + void set_multiprocess(bool val) { multiprocess_ = val; } + bool root_solver() { return solver_rank_ == 0; } + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } + size_t read_graph(std::string dataset_str); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); + size_t read_graph_gpu(std::string dataset_str); + size_t read_labels(std::string dataset_str, size_t num); + label_t get_label(size_t i) { return labels[i]; } + label_t *get_labels_ptr(size_t i) { return &(labels[0]); } + void degree_counting(); + void norm_factor_counting(); +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N +#else + CSRGraph graph_gpu; // the input graph, |V| = N +#endif + std::vector labels; // labels for classification: N x 1 + std::vector norm_factor; // normalization constant based on graph structure + std::vector degrees; + +protected: + Brew mode_; + cublasHandle_t cublas_handle_; // used to call cuBLAS + curandGenerator_t curand_generator_; // used to generate random numbers on GPU + //shared_ptr random_generator_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; + void genGraph(LGraph &lg, Graph &g); + +private: + // The private constructor to avoid duplicate instantiation. 
+ //Context(); +}; + diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 44dfd197af..ff7fb82b31 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -16,68 +16,28 @@ */ class graph_conv_layer: public layer { public: -#ifdef CPU_ONLY - graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims); -#else - graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, bool dropout, + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims); -#endif graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : - graph_conv_layer(level, NULL, false, true, false, true, 0.5, in_dims, out_dims) {} + graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} - void init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... "; - Timer t_alloc; - t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } - in_temp.resize(x*y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data.resize(y*x); // y*x - if (norm_) norm_factor_counting(); - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; - } + void init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_context(net_phase ctx) override { phase_ = ctx; } + void set_netphase(net_phase ctx) override { phase_ = ctx; } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(Graph *g, const vec_t &in, tensor_t &out); + virtual void aggregate(Graph &g, const vec_t &in, tensor_t &out); #else - virtual void aggregate(CSRGraph g, const float_t *in, float_t *out); + virtual void aggregate(CSRGraph &g, const float_t *in, float_t *out); #endif // user-defined combine function virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); - // user-defined pre-computing function, called during initialization - virtual void norm_factor_counting(); - -protected: - void degree_counting() { - assert(x == graph->size()); - degrees.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); - }, galois::loopname("DegreeCounting")); - } private: -#ifdef CPU_ONLY - Graph *graph; -#else - CSRGraph graph_gpu; -#endif bool act_; // whether to use activation function at the end bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards @@ -91,11 +51,9 @@ class graph_conv_layer: public layer { vec_t out_temp; vec_t in_temp; vec_t 
trans_data; // y*x - std::vector degrees; - std::vector norm_factor; // normalization constant based on graph structure std::vector > dropout_mask; - // Glorot & Bengio (AISTATS 2010) init + // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { auto init_range = sqrt(6.0/(dim_x + dim_y)); std::default_random_engine rng; diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 076253fe82..15e7d88900 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -17,6 +17,7 @@ #include "../types.h" #include "../utils.h" #include "../gtypes.h" +#include "../context.h" #include "../optimizer.h" #include "../math_functions.hh" /** @@ -42,7 +43,8 @@ class layer : public node { virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; - virtual void set_context(net_phase ctx) {} + virtual void set_context(Context *ctx) { context = ctx; } + virtual void set_netphase(net_phase phase) {} void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } @@ -136,6 +138,7 @@ class layer : public node { vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters vec_t loss; // error for each vertex: N x 1 + Context *context; }; // head: layer i+1, tail: layer i diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 236fd35118..cb698491fc 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -3,18 +3,12 @@ class softmax_loss_layer: public layer { public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab); - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : softmax_loss_layer(level, in_dims, out_dims, NULL) {} + softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - -private: - LabelList *labels; }; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 288e3aac3b..dba2753221 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,9 +2,10 @@ #define _MODEL_H_ #include +#include "types.h" #include "gtypes.h" +#include "context.h" #include "galois/Timer.h" -#include "lgraph.h" #include "layers.h" #include "optimizer.h" @@ -17,52 +18,28 @@ class Net { public: Net() {} - - void init() { - n = read_graph_cpu(dataset, graph_cpu); - labels.resize(n, 0); // label for each vertex: N x 1 - num_classes = read_labels(dataset, labels); - - std::cout << "Reading label masks ... 
"; - train_mask.resize(n, 0); - val_mask.resize(n, 0); - if (dataset == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; - - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - } + void init(std::string dataset_str, unsigned epochs, unsigned hidden1); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return n; } size_t get_ft_dim() { return feature_dims[0]; } - size_t get_nclasses() { return num_classes; } - size_t get_label(size_t i) { return labels[i]; } + size_t read_features(std::string dataset_str, tensor_t &feats); void construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer layers[0]->set_in_data(input_features); // feed input data + set_contexts(); } - void set_netphase(net_phase phase) { + void set_contexts() { for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(phase); + layers[i]->set_context(context); + } + + void set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i ++) + layers[i]->set_netphase(phase); } void print_layers_info() { @@ -78,9 +55,9 @@ class Net { in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); #ifdef CPU_ONLY - layers[layer_id] = new graph_conv_layer(layer_id, &graph_cpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); #else - layers[layer_id] = new graph_conv_layer(layer_id, &graph_gpu, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); #endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } @@ -91,7 +68,7 @@ class Net { in_dims[0] = out_dims[0] = n; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); connect(layers[layer_id-1], layers[layer_id]); } @@ -130,65 +107,19 @@ class Net { } // training - void train(optimizer *opt) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (size_t i = 0; i < epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); - - 
// training steps - set_netphase(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphase(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - - if (do_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } - } + void train(optimizer *opt, bool need_validate); + size_t get_nnodes() { return n; } protected: + Context *context; size_t n; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N -#else - CSRGraph graph_gpu; // the input graph, |V| = N -#endif tensor_t input_features; // input features: N x D - std::vector labels; // labels for classification: N x 1 MaskList train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network /* inline void init_features(size_t dim, vec_t &x) { @@ -199,109 +130,6 @@ class Net { } //*/ - // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). - // Note that labels is not one-hot encoded vector and it can be computed - // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. - size_t read_labels(std::string dataset_str, LabelList &labels) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < n; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - /* - for (size_t i = 0; i < 10; ++i) - for (size_t j = 0; j < n; ++j) - if (feats[i][j] > 0) - std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; - //*/ - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_graph_cpu(std::string dataset_str, Graph &graph) { - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph); - lgraph.clean(); - } else if (filetype == "gr") { - std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph, filename); - } else { printf("Unkown file format\n"); exit(1); } - Tread.stop(); - return graph.size(); - } - - void genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } - } - // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { AccumF accuracy_all; @@ -309,7 +137,7 @@ class Net { galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)preds == labels[i]) accuracy_all += 1.0; + if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 4b3f7cbab6..e9fc27d04a 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,14 +1,14 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { +void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(out[src]); // TODO: vectorize clear float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); if (norm) { b = a * norm_factor[dst]; vec_t neighbor = 
in[dst]; @@ -19,15 +19,15 @@ void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const ve }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); } -void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { +void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(out[src]); float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); if (norm) { b = a * norm_factor[dst]; vec_t neighbor(len); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp new file mode 100644 index 0000000000..59895347f1 --- /dev/null +++ b/libdeepgalois/src/context.cpp @@ -0,0 +1,156 @@ +#include "context.h" +#include +#include + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::Context() : + mode_(Context::CPU), + cublas_handle_(NULL), curand_generator_(NULL), + //random_generator_(NULL), mode_(Context::CPU), + solver_count_(1), solver_rank_(0), multiprocess_(false) { +#ifndef CPU_ONLY + mode_ = Context::GPU; + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. + if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + std::cout << "Cannot create Curand generator. 
Curand won't be available."; +#endif +} + +size_t Context::read_graph(std::string dataset_str) { +#ifdef CPU_ONLY + size_t n = read_graph_cpu(dataset_str, "gr"); +#else + size_t n = read_graph_gpu(dataset_str); +#endif + return n; +} + +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); //symmetrize + genGraph(lgraph, graph_cpu); + lgraph.clean(); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph_cpu, filename); + } else { printf("Unkown file format\n"); exit(1); } + Tread.stop(); + std::cout << "num_vertices " << graph_cpu.size() << " num_edges " << graph_cpu.sizeEdges() << "\n"; + return graph_cpu.size(); +} + +size_t Context::read_graph_gpu(std::string dataset_str) { +} + +void Context::genGraph(LGraph &lg, Graph &g) { + g.allocateFrom(lg.num_vertices(), lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i+1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset ++) + g.constructEdge(offset, lg.get_dest(offset), 0); + } +} + +// user-defined pre-computing function, called during initialization +// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v +void Context::norm_factor_counting() { +#ifdef CPU_ONLY + size_t n = graph_cpu.size(); + norm_factor.resize(n); + galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +#endif +} + +void Context::degree_counting() { +#ifdef CPU_ONLY + size_t n = graph_cpu.size(); + degrees.resize(n); + galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { + degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + }, galois::loopname("DegreeCounting")); +#endif +} + +// labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). +// Note that labels is not one-hot encoded vector and it can be computed +// as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. +size_t Context::read_labels(std::string dataset_str, size_t num) { + std::cout << "Reading labels ... 
"; + labels.resize(num, 0); // label for each vertex: N x 1 + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == labels.size()); // number of vertices + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < n; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v ++; + } + in.close(); + t_read.Stop(); + // number of vertex classes + std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; +} + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 98e9e14211..0dd83b6b07 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,18 +1,7 @@ #include "layers/graph_conv_layer.h" -void graph_conv_layer::aggregate(Graph *g, const vec_t &in, tensor_t &out) { - update_all(g, in, out, true, norm_factor); -} - -// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v -void graph_conv_layer::norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); +void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { + update_all(g, in, out, true, context->norm_factor); } void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { @@ -23,10 +12,9 @@ void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t vadd(a, b, out); // out = W*self + Q*neighbors } -#ifdef CPU_ONLY -graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, +graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), + layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices x = input_dims[0]; @@ -38,6 +26,27 @@ graph_conv_layer::graph_conv_layer(unsigned level, Graph *g, bool act, bool norm scale_ = 1. / (1. - dropout_rate_); } +void graph_conv_layer::init() { + std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); + // randomly initialize trainable parameters for conv layers + rand_init_matrix(y, z, W); + //rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + alloc_grad(); + if (dropout_) { + dropout_mask.resize(x); + for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); + } + in_temp.resize(x*y); + out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data.resize(y*x); // y*x + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; +} + +#ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { // input: x*y; W: y*z; output: x*z @@ -49,7 +58,7 @@ void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &ou }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(graph, out_temp, out_data); // aggregate + aggregate(context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(out_data[i], out_data[i]); @@ -69,7 +78,8 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same + //NOTE: since graph is symmetric, the derivative is the same + update_all(context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); @@ -84,20 +94,6 @@ void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -graph_conv_layer::graph_conv_layer(unsigned level, CSRGraph *g, bool act, bool norm, bool bias, - bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph_gpu(*g), act_(act), norm_(norm), - bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - init(); - scale_ = 1. / (1. 
- dropout_rate_); -} - void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 22a9d1a83c..3d8c22bf49 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,7 +1,7 @@ #include "layers/softmax_loss_layer.h" -softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { +softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; loss.resize(in_dims[0]); // error for each sample name_ = layer_type() + "_" + std::to_string(level); @@ -15,7 +15,7 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & softmax(in_data[i], out_data[i]); // normalize using softmax // y is a one hot encoded vector for the labels std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot + y[context->get_label(i)] = 1.0; // one-hot loss[i] = cross_entropy(y, out_data[i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); @@ -24,14 +24,14 @@ void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t & void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { vec_t norm_grad(output_dims[1]); std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; + y[context->get_label(i)] = 1.0; d_cross_entropy(y, out_data[i], norm_grad); d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +} diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 0179c46d56..a7e25a7256 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ #include "math_functions.hh" -#include "common.h" +#include "context.h" extern "C" { #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp new file mode 100644 index 0000000000..aff96fde56 --- /dev/null +++ b/libdeepgalois/src/net.cpp @@ -0,0 +1,107 @@ +#include "net.h" + +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { + context = new Context(); + n = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str, n); + context->degree_counting(); + context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; + std::cout << "Reading label masks ... 
"; + train_mask.resize(n, 0); + val_mask.resize(n, 0); + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + } else { + train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; + + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + input_features.resize(n); // input embedding: N x D + feature_dims[0] = read_features(dataset_str, input_features); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); +} + +size_t Net::read_features(std::string dataset_str, tensor_t &feats) { + std::cout << "Reading features ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + assert(m == feats.size()); // m = number of vertices + for (size_t i = 0; i < m; ++i) { + feats[i].resize(n); + for (size_t j = 0; j < n; ++j) + feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u][v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; + return n; +} + +void Net::train(optimizer *opt, bool need_validate) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (unsigned i = 0; i < num_epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; + t_epoch.Start(); + + // training steps + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphases(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + if (need_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } +} + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 
9e2597dffb..338ded6c67 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,5 +4,15 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +include_directories(${CUDA_INC}) + +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) +if(USE_CPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +endif() add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index ae1d2dff4b..ccc59d83de 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,11 +1,3 @@ -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) -if(USE_CPU) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") -endif() - app(gcn gcn.cpp) target_link_libraries(gcn deepgalois) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 72fc8373fc..7540a4b0e4 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -10,8 +10,8 @@ int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train - network.init(); - network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user + network.init(dataset, epochs, hidden1); + network.construct_layers(); // default setting for now; can be customized by the user network.print_layers_info(); ResourceManager rm; @@ -21,7 +21,7 @@ int main(int argc, char** argv) { optimizer *opt = new adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); - network.train(opt); // do training using training samples + network.train(opt, do_validate); // do training using training samples Ttrain.stop(); if (do_test) { From 591cec5063ca2aba9e0ca9949d699377952a5170 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 22 Feb 2020 18:27:44 -0600 Subject: [PATCH 010/660] add graph_gpu --- libdeepgalois/CMakeLists.txt | 6 +- libdeepgalois/include/aggregator.h | 4 +- libdeepgalois/include/context.h | 31 ++-- libdeepgalois/src/aggregator.cpp | 2 + libdeepgalois/src/aggregator.cu | 5 +- libdeepgalois/src/context.cpp | 52 ++++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 + libdeepgalois/src/math_functions.cu | 14 +- libgpu/include/checker.h | 15 ++ libgpu/include/csr_graph.h | 119 +-------------- libgpu/include/gg.h | 10 +- libgpu/include/graph_gpu.h | 137 ++++++++++++++++++ lonestargnn/CMakeLists.txt | 1 + 13 files changed, 227 insertions(+), 174 deletions(-) create mode 100644 libgpu/include/checker.h create mode 100644 libgpu/include/graph_gpu.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 514af263d4..e27f822b69 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -10,6 +10,9 @@ include_directories(${OPENBLAS_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) include_directories(${CUDA_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +include_directories("${CUB_ROOT}") +include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) 
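# Note (added for clarity): CUDA_INC, OPENBLAS_INC, and OPENBLAS_LIB refer to
# machine-specific install locations (set to absolute cluster paths in these
# CMake files); on another system they would need to be overridden, e.g. on
# the cmake command line, before this library configures.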
link_directories(${CMAKE_SOURCE_DIR}/libgalois) @@ -24,9 +27,6 @@ else() set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") - cuda_include_directories("${CUB_ROOT}") - cuda_include_directories("${MGPU_ROOT}/src") - cuda_include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) file(GLOB CUDA_FILES "src/" *.cu) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 6fb4ec8d41..61befebf2d 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -5,9 +5,7 @@ void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); #else -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" +#include "graph_gpu.h" #define TB_SIZE 256 #define WARP_SIZE 32 void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 1fc8b6ffc4..884eba685b 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -4,9 +4,12 @@ #include "types.h" #include "utils.h" #include "lgraph.h" +#ifdef CPU_ONLY #include "gtypes.h" +#else +#include "graph_gpu.h" +#endif #include "cutils.h" -//#include "random.h" class Context { public: @@ -14,8 +17,10 @@ class Context { ~Context(); enum Brew { CPU, GPU }; //static Context& Get(); - cublasHandle_t cublas_handle() { return cublas_handle_; } - curandGenerator_t curand_generator() { return curand_generator_; } +#ifndef CPU_ONLY + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } +#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -25,13 +30,7 @@ class Context { bool multiprocess() { return multiprocess_; } void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } size_t read_graph(std::string dataset_str); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); size_t read_labels(std::string dataset_str, size_t num); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } @@ -39,23 +38,31 @@ class Context { void norm_factor_counting(); #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph &lg, Graph &g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else CSRGraph graph_gpu; // the input graph, |V| = N + size_t read_graph_gpu(std::string dataset_str); + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } #endif std::vector labels; // labels for classification: N x 1 std::vector norm_factor; // normalization constant based on graph structure std::vector degrees; protected: +#ifndef CPU_ONLY + static cublasHandle_t cublas_handle_; // used 
to call cuBLAS + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU +#endif Brew mode_; - cublasHandle_t cublas_handle_; // used to call cuBLAS - curandGenerator_t curand_generator_; // used to generate random numbers on GPU //shared_ptr random_generator_; // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; - void genGraph(LGraph &lg, Graph &g); private: // The private constructor to avoid duplicate instantiation. diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index e9fc27d04a..723a36e9e9 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,3 +1,5 @@ +#include "types.h" +#include "gtypes.h" #include "aggregator.h" #include "math_functions.hh" diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 44a3e59d2d..49fed1e67e 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,6 +1,9 @@ +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 59895347f1..8d7fa0e00c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,4 +1,5 @@ #include "context.h" +#include "gtypes.h" #include #include @@ -19,6 +20,31 @@ int64_t cluster_seedgen(void) { return seed; } +#ifdef CPU_ONLY +Context::Context() : mode_(Context::CPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { } +Context::~Context() {} +#else +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { + // Try to create a cublas handler, and report an error if failed (but we will + // keep the program running as one might just want to run CPU code). + if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { + std::cout << "Cannot create Cublas handle. Cublas won't be available."; + } + // Try to create a curand handler. + if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) + std::cout << "Cannot create Curand generator. Curand won't be available."; +} + +Context::~Context() { + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); @@ -30,25 +56,7 @@ void Context::SetDevice(const int device_id) { CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } - -Context::Context() : - mode_(Context::CPU), - cublas_handle_(NULL), curand_generator_(NULL), - //random_generator_(NULL), mode_(Context::CPU), - solver_count_(1), solver_rank_(0), multiprocess_(false) { -#ifndef CPU_ONLY - mode_ = Context::GPU; - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. 
Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - std::cout << "Cannot create Curand generator. Curand won't be available."; #endif -} size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY @@ -59,6 +67,7 @@ size_t Context::read_graph(std::string dataset_str) { return n; } +#ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -79,9 +88,6 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { return graph_cpu.size(); } -size_t Context::read_graph_gpu(std::string dataset_str) { -} - void Context::genGraph(LGraph &lg, Graph &g) { g.allocateFrom(lg.num_vertices(), lg.num_edges()); g.constructNodes(); @@ -94,6 +100,10 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +#else +size_t Context::read_graph_gpu(std::string dataset_str) { +} +#endif // user-defined pre-computing function, called during initialization // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 0dd83b6b07..4e27fdd9bb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,7 +1,12 @@ #include "layers/graph_conv_layer.h" +#ifdef CPU_ONLY void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, context->norm_factor); +#else +void graph_conv_layer::aggregate(CSRGraph &g, const float_t *in, float_t *out) { + update_all(g, in, out, true, NULL); +#endif } void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index a7e25a7256..064926eb58 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -35,7 +35,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, int ldb = (TransB == CblasNoTrans) ? N : K; cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(DeepGalois::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { @@ -52,24 +52,24 @@ int argmax_gpu(const size_t n, const float_t *x) { void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(DeepGalois::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(DeepGalois::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(DeepGalois::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(DeepGalois::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(DeepGalois::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h new file mode 100644 index 0000000000..7f2cf4e36e --- /dev/null +++ b/libgpu/include/checker.h @@ -0,0 +1,15 @@ +#ifndef CHECKER_H +#define CHECKER_H +#include +#include + +static void check_cuda_error(const cudaError_t e, const char* file, + const int line) { + if (e != cudaSuccess) { + fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); + exit(1); + } +} +#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) + +#endif diff --git a/libgpu/include/csr_graph.h b/libgpu/include/csr_graph.h index c9e13b88dc..04a8d90f99 100644 --- a/libgpu/include/csr_graph.h +++ b/libgpu/include/csr_graph.h @@ -14,124 +14,7 @@ #ifndef LSG_CSR_GRAPH #define LSG_CSR_GRAPH -#include - -// Adapted from LSG CSRGraph.h - -// TODO: make this template data -typedef unsigned index_type; // should be size_t, but GPU chokes on size_t -typedef int edge_data_type; -typedef int node_data_type; - -// very simple implementation -struct CSRGraph { - unsigned read(char file[], bool read_edge_data = true); - void copy_to_gpu(struct CSRGraph& copygraph); - void copy_to_cpu(struct CSRGraph& copygraph); - - CSRGraph(); - - unsigned init(); - unsigned allocOnHost(bool no_edge_data = false); - unsigned allocOnDevice(bool no_edge_data = false); - void progressPrint(unsigned maxii, unsigned ii); - unsigned readFromGR(char file[], bool read_edge_data = true); - - unsigned deallocOnHost(); - unsigned deallocOnDevice(); - void dealloc(); - - __device__ __host__ bool valid_node(index_type node) { - return (node < nnodes); - } - - __device__ __host__ bool valid_edge(index_type edge) { - return (edge < nedges); - } - - __device__ __host__ index_type getOutDegree(unsigned src) { - assert(src < nnodes); - return row_start[src + 1] - row_start[src]; - }; - - __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { - assert(src < nnodes); - assert(edge < getOutDegree(src)); - - index_type abs_edge = row_start[src] + edge; - assert(abs_edge < nedges); - - return edge_dst[abs_edge]; - }; - - __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { - assert(abs_edge < nedges); - - return edge_dst[abs_edge]; - }; - - __device__ __host__ 
index_type getFirstEdge(unsigned src) { - assert(src <= nnodes); // <= is okay - return row_start[src]; - }; - - __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { - assert(src < nnodes); - assert(edge < getOutDegree(src)); - - index_type abs_edge = row_start[src] + edge; - assert(abs_edge < nedges); - - return edge_data[abs_edge]; - }; - - __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { - assert(abs_edge < nedges); - - return edge_data[abs_edge]; - }; - - void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { - nnodes = m; - nedges = nnz; - check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); - check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); - check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); - check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); - #ifdef ENABLE_LABEL - check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); - check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); - #endif - //int *h_degrees = (int *)malloc(m * sizeof(int)); - //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; - //check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); - //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); - } - - inline __device__ __host__ index_type getEdgeDst(unsigned edge) { - assert(edge < nedges); - return edge_dst[edge]; - }; - inline __device__ __host__ node_data_type getData(unsigned vid) { - return node_data[vid]; - } - inline __device__ __host__ index_type edge_begin(unsigned src) { - assert(src <= nnodes); - return row_start[src]; - }; - inline __device__ __host__ index_type edge_end(unsigned src) { - assert(src <= nnodes); - return row_start[src+1]; - }; - - index_type nnodes, nedges; - index_type* row_start; // row_start[node] points into edge_dst, node starts at - // 0, row_start[nnodes] = nedges - index_type* edge_dst; - edge_data_type* edge_data; - node_data_type* node_data; - bool device_graph; -}; +#include "graph_gpu.h" struct CSRGraphTex : CSRGraph { cudaTextureObject_t edge_dst_tx; diff --git a/libgpu/include/gg.h b/libgpu/include/gg.h index 779aafdd84..7f4a130c23 100644 --- a/libgpu/include/gg.h +++ b/libgpu/include/gg.h @@ -34,14 +34,7 @@ unsigned const debug = GGDEBUG; #include "Timer.h" - -static void check_cuda_error(const cudaError_t e, const char* file, - const int line) { - if (e != cudaSuccess) { - fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); - exit(1); - } -} +#include "checker.h" template static void check_retval(const T retval, const T expected, const char* file, @@ -64,7 +57,6 @@ inline static __device__ __host__ int GG_MIN(int x, int y) { return x; } -#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) #define check_rv(r, x) check_retval(r, x, __FILE__, __LINE__) #include "bmk2.h" diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h new file mode 100644 index 0000000000..c197c077ec --- /dev/null +++ b/libgpu/include/graph_gpu.h @@ -0,0 +1,137 @@ +/* + csr_graph.h + + Implements a CSR Graph. Part of the GGC source code. + Interface derived from LonestarGPU. + + Copyright (C) 2014--2016, The University of Texas at Austin + + See LICENSE.TXT for copyright license. 
+ + Author: Sreepathi Pai +*/ + +#ifndef CSR_GRAPH +#define CSR_GRAPH + +#include +#include +#include "checker.h" + +// Adapted from LSG CSRGraph.h + +// TODO: make this template data +typedef unsigned index_type; // should be size_t, but GPU chokes on size_t +typedef int edge_data_type; +typedef int node_data_type; + +// very simple implementation +struct CSRGraph { + unsigned read(char file[], bool read_edge_data = true); + void copy_to_gpu(struct CSRGraph& copygraph); + void copy_to_cpu(struct CSRGraph& copygraph); + + CSRGraph(); + + unsigned init(); + unsigned allocOnHost(bool no_edge_data = false); + unsigned allocOnDevice(bool no_edge_data = false); + void progressPrint(unsigned maxii, unsigned ii); + unsigned readFromGR(char file[], bool read_edge_data = true); + + unsigned deallocOnHost(); + unsigned deallocOnDevice(); + void dealloc(); + + __device__ __host__ bool valid_node(index_type node) { + return (node < nnodes); + } + + __device__ __host__ bool valid_edge(index_type edge) { + return (edge < nedges); + } + + __device__ __host__ index_type getOutDegree(unsigned src) { + assert(src < nnodes); + return row_start[src + 1] - row_start[src]; + }; + + __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { + assert(src < nnodes); + assert(edge < getOutDegree(src)); + + index_type abs_edge = row_start[src] + edge; + assert(abs_edge < nedges); + + return edge_dst[abs_edge]; + }; + + __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { + assert(abs_edge < nedges); + + return edge_dst[abs_edge]; + }; + + __device__ __host__ index_type getFirstEdge(unsigned src) { + assert(src <= nnodes); // <= is okay + return row_start[src]; + }; + + __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { + assert(src < nnodes); + assert(edge < getOutDegree(src)); + + index_type abs_edge = row_start[src] + edge; + assert(abs_edge < nedges); + + return edge_data[abs_edge]; + }; + + __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { + assert(abs_edge < nedges); + + return edge_data[abs_edge]; + }; + + void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { + nnodes = m; + nedges = nnz; + check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); + check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); + check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); + check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); + #ifdef ENABLE_LABEL + check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); + check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); + #endif + //int *h_degrees = (int *)malloc(m * sizeof(int)); + //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; + //check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); + //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); + } + + inline __device__ __host__ index_type getEdgeDst(unsigned edge) { + assert(edge < nedges); + return edge_dst[edge]; + }; + inline __device__ __host__ node_data_type getData(unsigned vid) { + return node_data[vid]; + } + inline __device__ __host__ index_type edge_begin(unsigned src) { + assert(src <= nnodes); + return row_start[src]; + }; + inline __device__ __host__ index_type edge_end(unsigned src) { + assert(src <= nnodes); + 
return row_start[src+1]; + }; + + index_type nnodes, nedges; + index_type* row_start; // row_start[node] points into edge_dst, node starts at + // 0, row_start[nnodes] = nedges + index_type* edge_dst; + edge_data_type* edge_data; + node_data_type* node_data; + bool device_graph; +}; +#endif diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 338ded6c67..10e7288dd9 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -6,6 +6,7 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) include_directories(${CUDA_INC}) +include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) From 2388800b99785ffe3053d4c45eda1d68a48fe1bd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 08:42:40 -0600 Subject: [PATCH 011/660] fix gpu compilation --- libdeepgalois/CMakeLists.txt | 28 +++++----- libdeepgalois/include/context.h | 1 + libdeepgalois/include/cutils.h | 74 ++++++++++++++++++++++++- libdeepgalois/include/math_functions.hh | 6 +- libdeepgalois/src/aggregator.cu | 2 + libdeepgalois/src/context.cpp | 20 +++---- libdeepgalois/src/net.cpp | 3 + lonestargnn/gcn/CMakeLists.txt | 3 +- 8 files changed, 107 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e27f822b69..7ff89b086b 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,8 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -#deepgalois_option(CPU_ONLY "Build DeepGalois without CUDA support" OFF) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -26,12 +25,14 @@ else() set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") - #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60; -std=c++11") link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) - file(GLOB CUDA_FILES "src/" *.cu) - cuda_compile(CU_O src/math_functions.cu src/aggregator.cu) - #CUDA_COMPILE(CU_O ${CUDA_FILES}) + set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) + cuda_add_library(dg_gpu ${CUDA_SOURCES}) + set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") + set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) + #cuda_compile(MF_O src/math_functions.cu) + #cuda_compile(AGG_O src/aggregator.cu) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -43,22 +44,21 @@ set(sources src/aggregator.cpp src/context.cpp src/net.cpp - ${CU_O} ) -add_library(deepgalois STATIC ${sources}) +add_library(dg_cpu STATIC ${sources}) -target_link_libraries(deepgalois galois_shmem gllvm galois_gpu) -target_link_libraries(deepgalois ${MPI_CXX_LIBRARIES}) -target_link_libraries(deepgalois -lopenblas) -target_link_libraries(deepgalois -lcudart -lcublas -lcurand) +target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) +target_link_libraries(dg_cpu -lopenblas) +target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) 
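# Note (added for clarity): with this split the C++ sources build into dg_cpu
# while the CUDA kernels (math_functions.cu, aggregator.cu) build into dg_gpu
# via cuda_add_library above; applications such as lonestargnn/gcn are
# expected to link against both targets.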
-target_include_directories(deepgalois PUBLIC +target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) -set_target_properties(deepgalois PROPERTIES +set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 884eba685b..39fd817198 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -20,6 +20,7 @@ class Context { #ifndef CPU_ONLY inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } + static void create_blas_handle(); #endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 8a0fcaa3a1..924dfd06e7 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,68 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +inline const char* cublasGetErrorString(cublasStatus_t error) { + switch (error) { + case CUBLAS_STATUS_SUCCESS: + return "CUBLAS_STATUS_SUCCESS"; + case CUBLAS_STATUS_NOT_INITIALIZED: + return "CUBLAS_STATUS_NOT_INITIALIZED"; + case CUBLAS_STATUS_ALLOC_FAILED: + return "CUBLAS_STATUS_ALLOC_FAILED"; + case CUBLAS_STATUS_INVALID_VALUE: + return "CUBLAS_STATUS_INVALID_VALUE"; + case CUBLAS_STATUS_ARCH_MISMATCH: + return "CUBLAS_STATUS_ARCH_MISMATCH"; + case CUBLAS_STATUS_MAPPING_ERROR: + return "CUBLAS_STATUS_MAPPING_ERROR"; + case CUBLAS_STATUS_EXECUTION_FAILED: + return "CUBLAS_STATUS_EXECUTION_FAILED"; + case CUBLAS_STATUS_INTERNAL_ERROR: + return "CUBLAS_STATUS_INTERNAL_ERROR"; +#if CUDA_VERSION >= 6000 + case CUBLAS_STATUS_NOT_SUPPORTED: + return "CUBLAS_STATUS_NOT_SUPPORTED"; +#endif +#if CUDA_VERSION >= 6050 + case CUBLAS_STATUS_LICENSE_ERROR: + return "CUBLAS_STATUS_LICENSE_ERROR"; +#endif + } + return "Unknown cublas status"; +} + +inline const char* curandGetErrorString(curandStatus_t error) { + switch (error) { + case CURAND_STATUS_SUCCESS: + return "CURAND_STATUS_SUCCESS"; + case CURAND_STATUS_VERSION_MISMATCH: + return "CURAND_STATUS_VERSION_MISMATCH"; + case CURAND_STATUS_NOT_INITIALIZED: + return "CURAND_STATUS_NOT_INITIALIZED"; + case CURAND_STATUS_ALLOCATION_FAILED: + return "CURAND_STATUS_ALLOCATION_FAILED"; + case CURAND_STATUS_TYPE_ERROR: + return "CURAND_STATUS_TYPE_ERROR"; + case CURAND_STATUS_OUT_OF_RANGE: + return "CURAND_STATUS_OUT_OF_RANGE"; + case CURAND_STATUS_LENGTH_NOT_MULTIPLE: + return "CURAND_STATUS_LENGTH_NOT_MULTIPLE"; + case CURAND_STATUS_DOUBLE_PRECISION_REQUIRED: + return "CURAND_STATUS_DOUBLE_PRECISION_REQUIRED"; + case CURAND_STATUS_LAUNCH_FAILURE: + return "CURAND_STATUS_LAUNCH_FAILURE"; + case CURAND_STATUS_PREEXISTING_FAILURE: + return "CURAND_STATUS_PREEXISTING_FAILURE"; + case CURAND_STATUS_INITIALIZATION_FAILED: + return "CURAND_STATUS_INITIALIZATION_FAILED"; + case CURAND_STATUS_ARCH_MISMATCH: + return "CURAND_STATUS_ARCH_MISMATCH"; + case CURAND_STATUS_INTERNAL_ERROR: + return "CURAND_STATUS_INTERNAL_ERROR"; + } + return "Unknown curand status"; +} + // CUDA: various checks for different function calls. 
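// Illustrative usage of the checking macros below; d_x, n, and alpha are
// assumed to be declared by the caller, and float_t comes from types.h:
//   CUDA_CHECK(cudaMalloc((void**)&d_x, n * sizeof(float_t)));
//   CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, d_x, 1));
//   CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), d_x, n));
// On failure the cuBLAS/cuRAND variants print the translated error string
// with the file and line, then exit.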
#define CUDA_CHECK(condition) \ do { \ @@ -27,13 +89,21 @@ inline int CUDA_GET_BLOCKS(const int N) { #define CUBLAS_CHECK(condition) \ do { \ cublasStatus_t status = condition; \ - if (status != CUBLAS_STATUS_SUCCESS) \ - ; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status) ); \ + exit(EXIT_FAILURE); \ + } \ } while (0) // CUDA: grid stride looping diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 86363f4ba3..87d48fd92e 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -41,9 +41,9 @@ void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); float_t cross_entropy(const vec_t &y, const vec_t &p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); -void vadd_gpu(const size_t n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const size_t n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const size_t n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add +void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 49fed1e67e..064a01da0e 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -5,5 +5,7 @@ #include "math_functions.hh" void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + unsigned n = g.nnodes; + vadd_gpu(n, in, in, out); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d7fa0e00c..bbb68c194e 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,16 +26,15 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), Context::~Context() {} #else Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { - // Try to create a cublas handler, and report an error if failed (but we will - // keep the program running as one might just want to run CPU code). - if (cublasCreate(&cublas_handle_) != CUBLAS_STATUS_SUCCESS) { - std::cout << "Cannot create Cublas handle. Cublas won't be available."; - } - // Try to create a curand handler. - if (curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT) != CURAND_STATUS_SUCCESS || - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen()) != CURAND_STATUS_SUCCESS) - std::cout << "Cannot create Curand generator. 
Curand won't be available."; + solver_rank_(0), multiprocess_(false) { } + +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +void Context::create_blas_handle() { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { @@ -102,6 +101,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { } #else size_t Context::read_graph_gpu(std::string dataset_str) { + return 0; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index aff96fde56..eeaf5b668b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,6 +2,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); +#ifndef CPU_ONLY + Context::create_blas_handle(); +#endif n = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str, n); context->degree_counting(); diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index ccc59d83de..b71d2df5f8 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,3 +1,4 @@ app(gcn gcn.cpp) -target_link_libraries(gcn deepgalois) +target_link_libraries(gcn dg_cpu dg_gpu) +target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) From 7a45854332181d526284af79ffadcc60801eecaa Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 10:20:14 -0600 Subject: [PATCH 012/660] add gpu graph reading --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/context.h | 14 +++++--- libdeepgalois/include/net.h | 39 ++++----------------- libdeepgalois/src/context.cpp | 61 ++++++++++++++++++++++++++++++--- libdeepgalois/src/net.cpp | 52 ++++++++-------------------- libgpu/include/graph_gpu.h | 4 +-- libgpu/src/csr_graph.cu | 4 +-- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/gcn/CMakeLists.txt | 5 ++- 9 files changed, 98 insertions(+), 86 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7ff89b086b..168a022860 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -29,6 +29,7 @@ else() link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) cuda_add_library(dg_gpu ${CUDA_SOURCES}) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) #cuda_compile(MF_O src/math_functions.cu) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 39fd817198..a40b31b120 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -32,26 +32,32 @@ class Context { void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } size_t read_graph(std::string dataset_str); - size_t read_labels(std::string dataset_str, size_t num); + size_t read_labels(std::string dataset_str); + size_t 
read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } void degree_counting(); void norm_factor_counting(); + std::vector labels; // labels for classification: N x 1 + std::vector norm_factor; // normalization constant based on graph structure + std::vector degrees; + tensor_t h_feats; // input features: N x D + size_t feat_len; // input feature length: D #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph &lg, Graph &g); size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else CSRGraph graph_gpu; // the input graph, |V| = N + label_t *d_labels; // labels on device + float_t *d_norm_factor; // norm_factor on device + float_t *d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } #endif - std::vector labels; // labels for classification: N x 1 - std::vector norm_factor; // normalization constant based on graph structure - std::vector degrees; protected: #ifndef CPU_ONLY diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index dba2753221..80da9fe1ad 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -21,27 +21,17 @@ class Net { void init(std::string dataset_str, unsigned epochs, unsigned hidden1); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_ft_dim() { return feature_dims[0]; } - size_t read_features(std::string dataset_str, tensor_t &feats); - void construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(input_features); // feed input data - set_contexts(); - } - + size_t get_nnodes() { return num_samples; } + void train(optimizer *opt, bool need_validate); // training + void construct_layers(); void set_contexts() { for (size_t i = 0; i < num_layers; i ++) layers[i]->set_context(context); } - void set_netphases(net_phase phase) { for (size_t i = 0; i < num_layers; i ++) layers[i]->set_netphase(phase); } - void print_layers_info() { for (size_t i = 0; i < num_layers; i ++) layers[i]->print_layer_info(); @@ -51,21 +41,17 @@ class Net { assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; + in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); -#ifdef CPU_ONLY layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#else - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#endif if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); } void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; + in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); @@ -106,29 +92,16 @@ class Net { return t_eval.Millisecs(); } - // training - void train(optimizer 
*opt, bool need_validate); - size_t get_nnodes() { return n; } - protected: Context *context; - size_t n; // number of samples: N + size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer - tensor_t input_features; // input features: N x D MaskList train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - /* - inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); - } - //*/ // comparing outputs with the ground truth (labels) inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bbb68c194e..f0854eb403 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -101,8 +101,19 @@ void Context::genGraph(LGraph &lg, Graph &g) { } #else size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + exit(0); return 0; } + +void copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); + CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); + CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} #endif // user-defined pre-computing function, called during initialization @@ -132,18 +143,17 @@ void Context::degree_counting() { // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). // Note that labels is not one-hot encoded vector and it can be computed // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. -size_t Context::read_labels(std::string dataset_str, size_t num) { +size_t Context::read_labels(std::string dataset_str) { std::cout << "Reading labels ... "; - labels.resize(num, 0); // label for each vertex: N x 1 Timer t_read; t_read.Start(); std::string filename = path + dataset_str + "-labels.txt"; std::ifstream in; std::string line; in.open(filename, std::ios::in); - size_t m, n; + size_t m, n; // m: number of vertices; n: number of classes in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices + labels.resize(m, 0); // label for each vertex: N x 1 unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); @@ -159,8 +169,49 @@ size_t Context::read_labels(std::string dataset_str, size_t num) { } in.close(); t_read.Stop(); - // number of vertex classes + // print the number of vertex classes std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; return n; } +size_t Context::read_features(std::string dataset_str) { + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m = number of vertices + in >> m >> feat_len >> std::ws; + //assert(m == ); + h_feats.resize(m); + for (size_t i = 0; i < m; ++i) { + h_feats[i].resize(feat_len); + for (size_t j = 0; j < feat_len; ++j) + h_feats[i][j] = 0; + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u][v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; + return feat_len; +} + +/* +inline void init_features(size_t dim, vec_t &x) { + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); +} +//*/ + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index eeaf5b668b..a7cd4ba567 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -5,14 +5,15 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { #ifndef CPU_ONLY Context::create_blas_handle(); #endif - n = context->read_graph(dataset_str); - num_classes = context->read_labels(dataset_str, n); + num_samples = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str); context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; + std::cout << "Reading label masks ... "; - train_mask.resize(n, 0); - val_mask.resize(n, 0); + train_mask.resize(num_samples, 0); + val_mask.resize(num_samples, 0); if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; @@ -26,45 +27,13 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { num_layers = NUM_CONV_LAYERS + 1; feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset_str, input_features); // input feature dimension: D + feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D feature_dims[1] = hidden1; // hidden1 level embedding: 16 feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); } -size_t Net::read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; -} - void Net::train(optimizer *opt, bool need_validate) { std::cout << "\nStart training...\n"; galois::StatTimer Tupdate("Train-WeightUpdate"); @@ -108,3 +77,12 @@ void Net::train(optimizer *opt, bool need_validate) { } } +void Net::construct_layers() { + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(context->h_feats); // feed input data + set_contexts(); +} + diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index c197c077ec..2458ad8632 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -27,7 +27,7 @@ typedef int node_data_type; // very simple implementation struct CSRGraph { - unsigned read(char file[], bool read_edge_data = true); + unsigned read(const char file[], bool read_edge_data = true); void copy_to_gpu(struct CSRGraph& copygraph); void copy_to_cpu(struct CSRGraph& copygraph); @@ -37,7 +37,7 @@ struct CSRGraph { unsigned allocOnHost(bool no_edge_data = false); unsigned allocOnDevice(bool no_edge_data = false); void progressPrint(unsigned maxii, unsigned ii); - unsigned readFromGR(char file[], bool read_edge_data = true); + unsigned readFromGR(const char file[], bool read_edge_data = true); unsigned deallocOnHost(); unsigned deallocOnDevice(); diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu index 554550fe91..d00912a404 100644 --- a/libgpu/src/csr_graph.cu +++ b/libgpu/src/csr_graph.cu @@ -150,7 +150,7 @@ void CSRGraph::progressPrint(unsigned maxii, unsigned ii) { } } -unsigned CSRGraph::readFromGR(char file[], bool read_edge_data) { +unsigned CSRGraph::readFromGR(const char file[], bool read_edge_data) { std::ifstream cfile; cfile.open(file); @@ -237,7 +237,7 @@ unsigned CSRGraph::readFromGR(char file[], bool read_edge_data) { return 0; } -unsigned CSRGraph::read(char file[], bool read_edge_data) { +unsigned CSRGraph::read(const char file[], bool read_edge_data) { return readFromGR(file, read_edge_data); } diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 10e7288dd9..e270f63011 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,7 +4,7 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) +SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index b71d2df5f8..715a68d497 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,4 +1,7 @@ app(gcn gcn.cpp) -target_link_libraries(gcn dg_cpu 
dg_gpu) +target_link_libraries(gcn dg_cpu) +if(NOT USE_CPU) + target_link_libraries(gcn dg_gpu) +endif() target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) From eda1ffdabdf4513a1f0cf591399c131877f5b6d4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 10:34:07 -0600 Subject: [PATCH 013/660] add copy_data --- libdeepgalois/include/context.h | 5 ++++- libdeepgalois/src/context.cpp | 30 ++++++++++++++---------------- libdeepgalois/src/net.cpp | 7 ++++--- 3 files changed, 22 insertions(+), 20 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index a40b31b120..d03740358e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -20,7 +20,7 @@ class Context { #ifndef CPU_ONLY inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - static void create_blas_handle(); + //static void create_blas_handle(); #endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } @@ -42,6 +42,8 @@ class Context { std::vector norm_factor; // normalization constant based on graph structure std::vector degrees; tensor_t h_feats; // input features: N x D + size_t n; // number of samples: N + size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N @@ -53,6 +55,7 @@ class Context { float_t *d_norm_factor; // norm_factor on device float_t *d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); + void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f0854eb403..d71baebc9c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -25,13 +25,12 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { } Context::~Context() {} #else -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { } - cublasHandle_t Context::cublas_handle_ = 0; curandGenerator_t Context::curand_generator_ = 0; -void Context::create_blas_handle() { +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { +//void Context::create_blas_handle() { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); @@ -59,9 +58,9 @@ void Context::SetDevice(const int device_id) { size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY - size_t n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr"); #else - size_t n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str); #endif return n; } @@ -103,8 +102,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; graph_gpu.read(filename.c_str(), false); - exit(0); - return 0; + return graph_gpu.nnodes; } void copy_data_to_device() { @@ -112,7 +110,7 @@ void copy_data_to_device() { CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * 
sizeof(float_t))); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -120,7 +118,6 @@ void copy_data_to_device() { // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - size_t n = graph_cpu.size(); norm_factor.resize(n); galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { float_t temp = std::sqrt(float_t(degrees[v])); @@ -132,7 +129,6 @@ void Context::norm_factor_counting() { void Context::degree_counting() { #ifdef CPU_ONLY - size_t n = graph_cpu.size(); degrees.resize(n); galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); @@ -151,14 +147,15 @@ size_t Context::read_labels(std::string dataset_str) { std::ifstream in; std::string line; in.open(filename, std::ios::in); - size_t m, n; // m: number of vertices; n: number of classes - in >> m >> n >> std::ws; + size_t m; // m: number of samples + in >> m >> num_classes >> std::ws; + assert(m == n); labels.resize(m, 0); // label for each vertex: N x 1 unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); unsigned x; - for (size_t idx = 0; idx < n; ++idx) { + for (size_t idx = 0; idx < num_classes; ++idx) { label_stream >> x; if (x != 0) { labels[v] = idx; @@ -170,8 +167,9 @@ size_t Context::read_labels(std::string dataset_str) { in.close(); t_read.Stop(); // print the number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + return num_classes; } size_t Context::read_features(std::string dataset_str) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a7cd4ba567..9d1fe771fb 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,9 +2,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); -#ifndef CPU_ONLY - Context::create_blas_handle(); -#endif + //Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); context->degree_counting(); @@ -32,6 +30,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); +#ifndef CPU_ONLY + copy_data_to_device(); // copy labels and input features to the device +#endif } void Net::train(optimizer *opt, bool need_validate) { From 0400c451b79f062a1e627bed87876b9dcc5082f1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 18:58:55 -0600 Subject: [PATCH 014/660] convert data to 1D --- libdeepgalois/CMakeLists.txt | 1 - libdeepgalois/include/aggregator.h | 5 +- libdeepgalois/include/context.h | 4 +- libdeepgalois/include/layers.h | 6 +- .../include/layers/graph_conv_layer.h | 10 +- libdeepgalois/include/layers/layer.h | 36 +++---- .../include/layers/softmax_loss_layer.h | 4 +- libdeepgalois/include/math_functions.hh | 12 +++ libdeepgalois/include/net.h | 2 +- libdeepgalois/include/node.h | 44 ++++----- libdeepgalois/src/aggregator.cpp | 29 +----- 
libdeepgalois/src/aggregator.cu | 4 +- libdeepgalois/src/context.cpp | 15 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 39 ++++---- .../src/layers/softmax_loss_layer.cpp | 21 +++-- libdeepgalois/src/math_functions.cpp | 94 +++++++++++++++++++ libdeepgalois/src/net.cpp | 2 +- 17 files changed, 195 insertions(+), 133 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 168a022860..0fe04fab0d 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -38,7 +38,6 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources - src/layers/relu_layer.cpp src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 61befebf2d..5f818d6ce2 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,12 +2,11 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); -void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor); +void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor); #else #include "graph_gpu.h" #define TB_SIZE 256 #define WARP_SIZE 32 -void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index d03740358e..1acc9d0b0e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -39,9 +39,9 @@ class Context { void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 - std::vector norm_factor; // normalization constant based on graph structure + vec_t norm_factor; // normalization constant based on graph structure std::vector degrees; - tensor_t h_feats; // input features: N x D + vec_t h_feats; // input features: N x D size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D diff --git a/libdeepgalois/include/layers.h b/libdeepgalois/include/layers.h index 9650e931a9..432d315183 100644 --- a/libdeepgalois/include/layers.h +++ b/libdeepgalois/include/layers.h @@ -1,8 +1,8 @@ #ifndef _LAYERS_H_ #define _LAYERS_H_ -#include "layers/relu_layer.h" -#include "layers/linear_layer.h" -#include "layers/arithmetic_layer.h" +//#include "layers/relu_layer.h" +//#include "layers/linear_layer.h" +//#include "layers/arithmetic_layer.h" #include "layers/graph_conv_layer.h" #include "layers/softmax_loss_layer.h" #endif diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index ff7fb82b31..a74f45f5f0 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -24,15 +24,15 @@ class graph_conv_layer: public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void 
forward_propagation(const vec_t &in_data, vec_t &out_data); + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(Graph &g, const vec_t &in, tensor_t &out); + virtual void aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out); #else - virtual void aggregate(CSRGraph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); #endif // user-defined combine function virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); @@ -51,7 +51,7 @@ class graph_conv_layer: public layer { vec_t out_temp; vec_t in_temp; vec_t trans_data; // y*x - std::vector > dropout_mask; + std::vector dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 15e7d88900..cc2d79dcfe 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -38,13 +38,13 @@ class layer : public node { level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), output_dims(out_dims) { add_edge(); } virtual ~layer() = default; - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) = 0; - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; virtual std::string layer_type() const = 0; - virtual void set_context(Context *ctx) { context = ctx; } virtual void set_netphase(net_phase phase) {} + virtual void set_context(Context *ctx) { context = ctx; } + virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; + virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } @@ -61,29 +61,22 @@ class layer : public node { count_ = sample_count; masks_ = masks; } - void set_in_data(tensor_t data) { - prev_ = std::make_shared(this, input_dims[1]); + void set_in_data(vec_t data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + // allocate memory for intermediate features prev_->get_data() = data; - prev_->get_gradient().resize(input_dims[0]); // allocate memory for intermediate gradients - //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - for (size_t i = 0; i < input_dims[0]; ++i) - prev_->get_gradient()[i].resize(input_dims[1]); + prev_->get_gradient().resize(input_dims[0]*output_dims[1]); } void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[1]); + next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate 
feature vectors - next_->get_data().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_data()[i].resize(output_dims[1]); + next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - next_->get_gradient().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_gradient()[i].resize(output_dims[1]); + next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); @@ -92,7 +85,6 @@ class layer : public node { back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { - //std::cout << "[debug] " << name_ << ": updating weight...\n"; // parallelize only when target size is big enough to mitigate thread spawning overhead. bool parallel = (W.size() >= 512); //vec_t diff; @@ -105,20 +97,16 @@ class layer : public node { prev()->clear_grads(); } inline acc_t get_masked_loss() { - //acc_t total_loss = acc_t(0); - //size_t valid_sample_count = 0; AccumF total_loss; AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - //for (size_t i = begin_; i < end_; i ++) { galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index cb698491fc..0b1e9af3b5 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,9 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); + virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); + virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 87d48fd92e..99a9494f08 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -23,23 +23,35 @@ void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); void copy2D1D(const tensor_t &in, vec_t &out); +void copy1D1D(const vec_t &in, vec_t &out); +void copy1D1D(size_t len, const float_t *in, float_t *out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, 
const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); +void transpose(size_t x, size_t y, const float_t *in, float_t *out); int argmax(const size_t n, const vec_t &x); // the arguments of the maxima +int argmax(const size_t n, const float_t *x); // the arguments of the maxima void clear(vec_t &in); +void clear(size_t n, float_t *in); void relu(const vec_t &in, vec_t &out); // ReLU +void relu(size_t n, const float_t *in, float_t *out); // ReLU void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff); void softmax(const vec_t &input, vec_t &output); +void softmax(size_t n, const float_t *input, float_t *output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); +void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp); float_t cross_entropy(const vec_t &y, const vec_t &p); +float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); +void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 80da9fe1ad..66f50a17b6 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -109,7 +109,7 @@ class Net { accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { - int preds = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index eec041e0e1..b74edec280 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -25,38 +25,31 @@ class node : public std::enable_shared_from_this { // edges manage the input/output data and gradients between nodes class edge { public: - edge(node *prev, size_t len) : - ft_dim_(len), - data_({vec_t(len)}), - grad_({vec_t(len)}), + edge(node *prev, size_t n, size_t len) : + num_samples_(n), ft_dim_(len), + data_(vec_t(n*len)), grad_(vec_t(n*len)), prev_(prev) {} void merge_grads(vec_t *dst) { assert(!grad_.empty()); - const auto &grad_head = grad_[0]; - size_t sz = grad_head.size(); - dst->resize(sz); + dst->resize(ft_dim_); float_t *pdst = &(*dst)[0]; - std::copy(grad_head.begin(), grad_head.end(), pdst); + 
std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) pdst[i] += grad_[sample][i]; - //vectorize::reduce(&grad_[sample][0], sz, pdst); + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; + //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); } } void clear_grads() { - for (size_t sample = 0; sample < grad_.size(); ++sample) { - auto &g = grad_[sample]; - std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize - //vectorize::fill(&g[0], g.size(), float_t{0}); - } + std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); } - tensor_t *get_data_ptr() { return &data_; } - tensor_t &get_data() { return data_; } - const tensor_t &get_data() const { return data_; } - tensor_t &get_gradient() { return grad_; } - const tensor_t &get_gradient() const { return grad_; } + vec_t &get_data() { return data_; } + const vec_t &get_data() const { return data_; } + vec_t &get_gradient() { return grad_; } + const vec_t &get_gradient() const { return grad_; } float_t *get_gpu_data() const { return gpu_data_; } float_t *get_gpu_gradient() { return gpu_grad_; } @@ -66,12 +59,13 @@ class edge { void add_next_node(node *next) { next_ = next; } private: + size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - tensor_t data_; // feature vectors on CPU - tensor_t grad_; // gradients on CPU + vec_t data_; // feature vectors on CPU + vec_t grad_; // gradients on CPU float_t *gpu_data_; // feature vectors on GPU float_t *gpu_grad_; // gradients on CPU - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor + node *prev_; // previous node, "producer" of data + node *next_; // next node, "consumer" of data }; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 723a36e9e9..30b2fc0a5e 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,28 +3,9 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(Graph &g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { +void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(out[src]); + clear(len, &out[src*len]); float_t a = 0.0, b = 0.0; if (norm) a = norm_factor[src]; // gather neighbors' embeddings @@ -33,9 +14,9 @@ void update_all(Graph &g, const vec_t &in, tensor_t &out, bool norm, const vec_t if (norm) { b = a * 
norm_factor[dst]; vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] + mul_scalar(len, b, &in[dst*len], &neighbor[0]); + vadd(len, &out[src*len], &neighbor[0], &out[src*len]); // out[src] += in[dst] + } else vadd(len, &out[src*len], &in[dst*len], &out[src*len]); // out[src] += in[dst] } }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 064a01da0e..04f9c1e8f8 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -4,8 +4,8 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { unsigned n = g.nnodes; - vadd_gpu(n, in, in, out); + vadd_gpu(len, in, in, out); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index d71baebc9c..44b12e4bb0 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -105,12 +105,12 @@ size_t Context::read_graph_gpu(std::string dataset_str) { return graph_gpu.nnodes; } -void copy_data_to_device() { +void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); - CUDA_SAFE_CALL(cudaMemcpy(d_feats, h_feats, n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -183,12 +183,7 @@ size_t Context::read_features(std::string dataset_str) { size_t m; // m = number of vertices in >> m >> feat_len >> std::ws; //assert(m == ); - h_feats.resize(m); - for (size_t i = 0; i < m; ++i) { - h_feats[i].resize(feat_len); - for (size_t j = 0; j < feat_len; ++j) - h_feats[i][j] = 0; - } + h_feats.resize(m*feat_len, 0); while (std::getline(in, line)) { std::istringstream edge_stream(line); unsigned u, v; @@ -196,7 +191,7 @@ size_t Context::read_features(std::string dataset_str) { edge_stream >> u; edge_stream >> v; edge_stream >> w; - h_feats[u][v] = w; + h_feats[u*feat_len+v] = w; } in.close(); t_read.Stop(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 4e27fdd9bb..ed2e000661 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,11 +1,11 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(Graph &g, const vec_t &in, tensor_t &out) { - update_all(g, in, out, true, context->norm_factor); +void graph_conv_layer::aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out) { + update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(CSRGraph &g, const float_t *in, float_t *out) { - update_all(g, in, out, true, NULL); +void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { + update_all(len, g, in, out, 
true, context->d_norm_factor); #endif } @@ -40,10 +40,7 @@ void graph_conv_layer::init() { //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } + if (dropout_) dropout_mask.resize(x*y); in_temp.resize(x*y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data.resize(y*x); // y*x @@ -53,54 +50,54 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { +void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(scale_, dropout_rate_, in_data[i], dropout_mask[i], &in_temp[i*y]); + dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(context->graph_cpu, out_temp, out_data); // aggregate + } else matmul1D1D(x, z, y, in_data, W, out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); + relu(z, &out_data[i*z], &out_data[i*z]); }, galois::loopname("relu")); } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { +void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? out_grad[i][j] : float_t(0); + out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? 
out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - } else copy2D1D(out_grad, out_temp); // TODO: avoid copying + } else copy1D1D(out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(scale_, in_grad[i], dropout_mask[i], in_grad[i]); + d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); } } // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x + transpose(x, y, in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z } void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -void graph_conv_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) {} -void graph_conv_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) {} +void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} +void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 3d8c22bf49..bc0cd5e953 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,27 +9,30 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { +void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { + size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax + softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // y is a one hot encoded vector for the labels std::vector y(output_dims[1], 0.0); // ground truth y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); + loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { } +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} -void softmax_loss_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { +void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t 
&out_data, vec_t &out_grad, vec_t &in_grad) { + size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth y[context->get_label(i)] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); + d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); + d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 1e3e0e1d79..f66fb8d8be 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -78,6 +78,13 @@ float_t dot(const vec_t &x, const vec_t &y) { return sum; } +float_t dot(size_t n, const float_t *x, const float_t *y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; +} + // matrix-vector multiply void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { size_t m = out_vector.size(); @@ -118,6 +125,14 @@ void copy2D1D(const tensor_t &in, vec_t &out) { } } +void copy1D1D(const vec_t &in, vec_t &out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t *in, float_t *out) { + std::copy(in, in+len, out); +} + void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -220,6 +235,15 @@ void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { } } } + +void transpose(size_t x, size_t y, const float_t *in, float_t *out) { + for (size_t i = 0; i < y; i ++) { + for (size_t j = 0; j < x; j ++) { + out[i*x+j] = in[j*y+i]; + } + } +} + int argmax(const size_t n, const vec_t &x) { float_t max = x[0]; int max_ind = 0; @@ -232,16 +256,37 @@ int argmax(const size_t n, const vec_t &x) { return max_ind; } +int argmax(const size_t n, const float_t *x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + void clear(vec_t &in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; } +void clear(size_t n, float_t *in) { + for (size_t i = 0; i < n; i++) in[i] = 0; +} + void relu(const vec_t &in, vec_t &out) { for (size_t i = 0; i < out.size(); ++i) { out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); } } +void relu(size_t n, const float_t *in, float_t *out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t{0}); +} + void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { for (size_t i = 0; i < out_diff.size(); ++i) { out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); @@ -283,11 +328,23 @@ void dropout(const float scale, const float dropout_rate, const vec_t &in, std:: out[i] = in[i] * mask[i] * scale; } +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out) { + for (size_t i = 0; i < n; ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * mask[i] * scale; +} + void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { for (size_t i = 0; i < 
in_diff.size(); ++i) out_diff[i] = in_diff[i] * mask[i] * scale; } +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; +} + float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } @@ -317,6 +374,17 @@ void softmax(const vec_t &input, vec_t &output) { output[i] /= denominator; } +void softmax(size_t n, const float_t *input, float_t *output) { + const float_t max = *std::max_element(input, input+n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + void log_softmax(const vec_t &input, vec_t &output) { const float_t max = *std::max_element(input.begin(), input.end()); float_t denominator(0); @@ -344,6 +412,16 @@ void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { } } +void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability @@ -361,6 +439,16 @@ float_t cross_entropy(const vec_t &y, const vec_t &p) { return loss; } +float_t cross_entropy(size_t n, const float_t *y, const float_t *p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) continue; + if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]); + } + return loss; +} + void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { auto n = y.size(); //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); @@ -370,3 +458,9 @@ void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { } } +void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9d1fe771fb..ac9f8c98de 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -31,7 +31,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY - copy_data_to_device(); // copy labels and input features to the device + context->copy_data_to_device(); // copy labels and input features to the device #endif } From 2ffb0e8fd5208e888c650558b83935bc0a76049a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 18:59:37 -0600 Subject: [PATCH 015/660] refine interfaces --- libdeepgalois/include/aggregator.h | 2 +- libdeepgalois/include/context.h | 2 +- .../include/layers/graph_conv_layer.h | 6 +-- libdeepgalois/include/layers/layer.h | 14 +++--- .../include/layers/softmax_loss_layer.h | 4 +- libdeepgalois/include/math_functions.hh | 2 +- libdeepgalois/include/node.h | 49 ++++++++++++++----- libdeepgalois/src/aggregator.cpp | 2 +- libdeepgalois/src/context.cpp | 4 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 33 +++++++------ .../src/layers/softmax_loss_layer.cpp | 15 +++--- libdeepgalois/src/math_functions.cpp | 11 ++--- 12 files changed, 87 insertions(+), 57 deletions(-) diff --git 
a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 5f818d6ce2..1ae8d062ae 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,7 +2,7 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor); +void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #else #include "graph_gpu.h" #define TB_SIZE 256 diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 1acc9d0b0e..68967aaeea 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -39,7 +39,7 @@ class Context { void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 - vec_t norm_factor; // normalization constant based on graph structure + float_t *norm_factor; // normalization constant based on graph structure std::vector degrees; vec_t h_feats; // input features: N x D size_t n; // number of samples: N diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index a74f45f5f0..b8b42ca1d0 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -24,13 +24,13 @@ class graph_conv_layer: public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out); + virtual void aggregate(size_t len, Graph &g, const float_t *in, float_t *out); #else virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); #endif diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index cc2d79dcfe..737d38fe55 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -41,8 +41,8 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context *ctx) { context = ctx; } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; @@ -62,21 +62,23 @@ class layer : public node { masks_ = masks; } void set_in_data(vec_t data) { + 
assert(data.size() == input_dims[0]*input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); // allocate memory for intermediate features - prev_->get_data() = data; + //prev_->get_data() = data; + std::copy(data.begin(), data.end(), prev_->get_data()); // allocate memory for intermediate gradients - prev_->get_gradient().resize(input_dims[0]*output_dims[1]); + //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors - next_->get_data().resize(output_dims[0]*output_dims[1]); + //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - next_->get_gradient().resize(output_dims[0]*output_dims[1]); + //next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 0b1e9af3b5..f4adb51bcd 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,9 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); + //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 99a9494f08..0cbb53bd66 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -26,7 +26,7 @@ void copy2D1D(const tensor_t &in, vec_t &out); void copy1D1D(const vec_t &in, vec_t &out); void copy1D1D(size_t len, const float_t *in, float_t *out); void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const vec_t &A, const vec_t &B, vec_t &C); // matrix multiply +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); void transpose2D(const tensor_t &in, tensor_t &out); void transpose2D1D(const tensor_t &in, vec_t &out); diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index b74edec280..4dec9a2080 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -27,31 +27,58 @@ class edge { public: edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), - data_(vec_t(n*len)), grad_(vec_t(n*len)), - prev_(prev) {} + //data_(vec_t(n*len)), grad_(vec_t(n*len)), + data_(NULL), grad_(NULL), prev_(prev) { +#ifdef CPU_ONLY + data_ = new float_t[n*len]; + grad_ = new float_t[n*len]; +#else + 
CUDA_CHECK(cudaMalloc((void **)&data_, n * len * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, n * len * sizeof(float_t))); +#endif + } void merge_grads(vec_t *dst) { - assert(!grad_.empty()); + //assert(!grad_.empty()); + assert(grad_ != NULL); dst->resize(ft_dim_); float_t *pdst = &(*dst)[0]; - std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); +#ifdef CPU_ONLY + //std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); + std::copy(grad_, grad_+ft_dim_, pdst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < num_samples_; ++sample) { for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); } +#else + CUDA_CHECK(cudaMemcpy(&pdst, grad, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); + //TODO +#endif } void clear_grads() { - std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize +#ifdef CPU_ONLY + //std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); +#else + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); +#endif } - +/* vec_t &get_data() { return data_; } const vec_t &get_data() const { return data_; } vec_t &get_gradient() { return grad_; } const vec_t &get_gradient() const { return grad_; } - float_t *get_gpu_data() const { return gpu_data_; } + float_t *get_gpu_data() { return gpu_data_; } + const float_t *get_gpu_data() const { return gpu_data_; } float_t *get_gpu_gradient() { return gpu_grad_; } + const float_t *get_gpu_gradient() const { return gpu_grad_; } +*/ + float_t *get_data() { return data_; } + const float_t *get_data() const { return data_; } + float_t *get_gradient() { return grad_; } + const float_t *get_gradient() const { return grad_; } const node *next() const { return next_; } node *prev() { return prev_; } @@ -61,10 +88,10 @@ class edge { private: size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - vec_t data_; // feature vectors on CPU - vec_t grad_; // gradients on CPU - float_t *gpu_data_; // feature vectors on GPU - float_t *gpu_grad_; // gradients on CPU + //vec_t data_; // feature vectors on CPU + //vec_t grad_; // gradients on CPU + float_t *data_; // feature vectors + float_t *grad_; // gradients node *prev_; // previous node, "producer" of data node *next_; // next node, "consumer" of data }; diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 30b2fc0a5e..45862b7516 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,7 +3,7 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(size_t len, Graph &g, const vec_t &in, vec_t &out, bool norm, const vec_t &norm_factor) { +void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { clear(len, &out[src*len]); float_t a = 0.0, b = 0.0; diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 44b12e4bb0..3058bac480 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -109,7 +109,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); 
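  // d_feats holds one feature vector of length feat_len per vertex, so the device
  // buffer must be n * feat_len floats (matching the cudaMemcpy of h_feats below);
  // an n-float allocation would be undersized.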
CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } #endif @@ -118,7 +118,7 @@ void Context::copy_data_to_device() { // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - norm_factor.resize(n); + norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { float_t temp = std::sqrt(float_t(degrees[v])); if (temp == 0.0) norm_factor[v] = 0.0; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index ed2e000661..2685629138 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,7 +1,7 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph &g, const vec_t &in, vec_t &out) { +void graph_conv_layer::aggregate(size_t len, Graph &g, const float_t *in, float_t *out) { update_all(len, g, in, out, true, context->norm_factor); #else void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { @@ -39,7 +39,7 @@ void graph_conv_layer::init() { rand_init_matrix(y, z, W); //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); - alloc_grad(); + //alloc_grad(); if (dropout_) dropout_mask.resize(x*y); in_temp.resize(x*y); out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py @@ -50,7 +50,8 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -58,9 +59,9 @@ void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, W, out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + matmul1D1D(x, z, y, &in_temp[0], &W[0], &out_temp[0]); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], &out_temp[0]); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, &out_temp[0], out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(z, &out_data[i*z], &out_data[i*z]); @@ -69,19 +70,21 @@ void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { +//void 
graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - } else copy1D1D(out_grad, out_temp); // TODO: avoid copying + //} else copy1D1D(out_grad, out_temp); // TODO: avoid copying + } else copy1D1D(x*z, out_grad, &out_temp[0]); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + matmul1D1D(x, y, z, &out_temp[0], &trans_W[0], &in_temp[0]); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, &in_temp[0], in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); @@ -89,15 +92,13 @@ void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_d } } // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z + transpose(x, y, in_data, &trans_data[0]); // y*x + matmul1D1D(y, z, x, &trans_data[0], &out_temp[0], &weight_grad[0]); // y*x; x*z; y*z } -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) {} -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) {} #else -void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} -void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} +//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} +//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index bc0cd5e953..7a9686e772 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,10 +6,11 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di loss.resize(in_dims[0]); // error for each sample name_ = layer_type() + "_" + std::to_string(level); } - +#ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +//void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked @@ -22,10 +23,8 @@ void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_da }, galois::chunk_size(), galois::steal(), 
galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -} - -void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +//void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { +void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { vec_t norm_grad(len); @@ -35,6 +34,10 @@ void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } +#else // GPU implementation +void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { +} void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { } +#endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index f66fb8d8be..98535d98bd 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -161,15 +161,12 @@ void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { } void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const vec_t &A, const vec_t &B, vec_t &C) { + const float_t *A, const float_t *B, float_t *C) { galois::StatTimer Tmatmul("MatMul"); Tmatmul.start(); - assert(A.size() == dim_x*dim_z); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, &A[0], &B[0], 0.0, &C[0]); + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); Tmatmul.stop(); } @@ -181,7 +178,7 @@ void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) assert(C.size() == dim_x*dim_y); vec_t A1D(dim_x*dim_z); copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); } void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { @@ -198,7 +195,7 @@ void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { std::copy(A[i].begin(), A[i].end(), ptr); ptr += dim_z; } - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); for (size_t i = 0; i < dim_x; i++) { for (size_t j = 0; j < dim_y; ++j) { C[i][j] = C1D[i*dim_y+j]; From e15823ee75be386c2e7681fc4baead010912fb02 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 20:19:15 -0600 Subject: [PATCH 016/660] add gpu data --- libdeepgalois/include/context.h | 1 + libdeepgalois/include/layers/layer.h | 13 +++++++++---- libdeepgalois/include/math_functions.hh | 1 + libdeepgalois/include/node.h | 13 ++++++++----- libdeepgalois/src/context.cpp | 2 ++ libdeepgalois/src/layers/softmax_loss_layer.cpp | 1 + libdeepgalois/src/math_functions.cu | 2 ++ libdeepgalois/src/net.cpp | 2 +- 8 files changed, 25 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 68967aaeea..5a362804cc 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -36,6 +36,7 @@ class Context { size_t 
read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } + float_t * get_in_ptr(); void degree_counting(); void norm_factor_counting(); std::vector labels; // labels for classification: N x 1 diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 737d38fe55..4e634f4934 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -61,19 +61,23 @@ class layer : public node { count_ = sample_count; masks_ = masks; } - void set_in_data(vec_t data) { + void set_in_data(float_t *data) { assert(data.size() == input_dims[0]*input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // // allocate memory for intermediate features //prev_->get_data() = data; - std::copy(data.begin(), data.end(), prev_->get_data()); + //std::copy(data.begin(), data.end(), prev_->get_data()); // allocate memory for intermediate gradients //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { @@ -96,7 +100,8 @@ class layer : public node { //for (size_t i = 0; i < diff.size(); ++i) // diff[i] *= rcp_batch_size; opt->update(weight_grad, W, parallel); // W += grad - prev()->clear_grads(); + //prev()->clear_grads(); + next()->clear_grads(); } inline acc_t get_masked_loss() { AccumF total_loss; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 0cbb53bd66..626ed9b4a6 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -60,5 +60,6 @@ void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima +void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data); #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 4dec9a2080..317a8e6f03 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -28,13 +28,15 @@ class edge { edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), //data_(vec_t(n*len)), grad_(vec_t(n*len)), - data_(NULL), grad_(NULL), prev_(prev) { + data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc() { #ifdef CPU_ONLY - data_ = new float_t[n*len]; - grad_ = new float_t[n*len]; + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; #else - CUDA_CHECK(cudaMalloc((void **)&data_, n * len * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, n * len * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); #endif } @@ -75,6 +77,7 @@ class edge { float_t 
*get_gpu_gradient() { return gpu_grad_; } const float_t *get_gpu_gradient() const { return gpu_grad_; } */ + void set_data(float_t *ptr) { data_ = ptr; } float_t *get_data() { return data_; } const float_t *get_data() const { return data_; } float_t *get_gradient() { return grad_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 3058bac480..a275cb3b4c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -98,6 +98,7 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +float_t * Context::get_in_ptr() { return &h_feats[0]; } #else size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; @@ -112,6 +113,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); } +float_t * Context::get_in_ptr() { return d_feats; } #endif // user-defined pre-computing function, called during initialization diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7a9686e772..cd1b517ccf 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -36,6 +36,7 @@ void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t } #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, out_data); } void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 064926eb58..b65d39972c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -106,3 +106,5 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } +void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data) { +} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ac9f8c98de..857b7691b5 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -83,7 +83,7 @@ void Net::construct_layers() { append_conv_layer(0, true); // first conv layer append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer - layers[0]->set_in_data(context->h_feats); // feed input data + layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } From 1bc8b8d5a2923d75c7ad02c752b52faaa99482ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 23 Feb 2020 21:20:56 -0600 Subject: [PATCH 017/660] add node.cu --- libdeepgalois/CMakeLists.txt | 9 ++- libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/math_functions.hh | 2 +- libdeepgalois/include/node.h | 58 +++---------------- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/node.cpp | 36 ++++++++++++ libdeepgalois/src/node.cu | 15 +++++ 8 files changed, 71 insertions(+), 55 deletions(-) create mode 100644 libdeepgalois/src/node.cpp create mode 100644 libdeepgalois/src/node.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 0fe04fab0d..73152f8792 100644 --- a/libdeepgalois/CMakeLists.txt +++ 
b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() @@ -27,7 +27,11 @@ else() #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) - set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu) + set(CUDA_SOURCES + src/math_functions.cu + src/aggregator.cu + src/node.cu + ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") @@ -43,6 +47,7 @@ set(sources src/math_functions.cpp src/aggregator.cpp src/context.cpp + src/node.cpp src/net.cpp ) add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 4e634f4934..874ce85d30 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,7 +132,7 @@ class layer : public node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters - vec_t loss; // error for each vertex: N x 1 + float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 626ed9b4a6..b4ce800bb7 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -60,6 +60,6 @@ void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data); +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 317a8e6f03..98b97b2c55 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -1,5 +1,7 @@ #pragma once #include +#include +#include #include "types.h" class node; class layer; @@ -7,7 +9,7 @@ class edge; typedef std::shared_ptr edgeptr_t; -// node data structure +// node data structure: each layer is a node, two layers are connected by an edge class node : public std::enable_shared_from_this { public: node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} @@ -30,53 +32,13 @@ class edge { //data_(vec_t(n*len)), grad_(vec_t(n*len)), data_(NULL), grad_(NULL), prev_(prev) {} - void alloc() { -#ifdef CPU_ONLY - data_ = new float_t[num_samples_ * ft_dim_]; - grad_ = new float_t[num_samples_ * ft_dim_]; -#else - CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); -#endif - } 
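  // alloc(), merge_grads() and clear_grads() are declared here only: their CPU bodies
  // live in src/node.cpp and the cudaMalloc/cudaMemset variants in src/node.cu, so this
  // header stays free of CUDA calls in CPU_ONLY builds.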
+ void alloc(); + void alloc_gpu(); + void merge_grads(vec_t *dst); + void merge_grads_gpu(float_t *dst); + void clear_grads(); + void clear_grads_gpu(); - void merge_grads(vec_t *dst) { - //assert(!grad_.empty()); - assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t *pdst = &(*dst)[0]; -#ifdef CPU_ONLY - //std::copy(grad_.begin(), grad_.begin()+ft_dim_, pdst); - std::copy(grad_, grad_+ft_dim_, pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < num_samples_; ++sample) { - for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; - //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); - } -#else - CUDA_CHECK(cudaMemcpy(&pdst, grad, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); - //TODO -#endif - } - void clear_grads() { -#ifdef CPU_ONLY - //std::fill(grad_.begin(), grad_.end(), float_t{0}); // TODO: need vectorize - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); -#else - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); -#endif - } -/* - vec_t &get_data() { return data_; } - const vec_t &get_data() const { return data_; } - vec_t &get_gradient() { return grad_; } - const vec_t &get_gradient() const { return grad_; } - float_t *get_gpu_data() { return gpu_data_; } - const float_t *get_gpu_data() const { return gpu_data_; } - float_t *get_gpu_gradient() { return gpu_grad_; } - const float_t *get_gpu_gradient() const { return gpu_grad_; } -*/ void set_data(float_t *ptr) { data_ = ptr; } float_t *get_data() { return data_; } const float_t *get_data() const { return data_; } @@ -91,8 +53,6 @@ class edge { private: size_t num_samples_;// number of samples size_t ft_dim_; // feature dimensions - //vec_t data_; // feature vectors on CPU - //vec_t grad_; // gradients on CPU float_t *data_; // feature vectors float_t *grad_; // gradients node *prev_; // previous node, "producer" of data diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index cd1b517ccf..d6969b7a95 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -3,7 +3,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; - loss.resize(in_dims[0]); // error for each sample + loss = new float_t[in_dims[0]]; // error for each sample name_ = layer_type() + "_" + std::to_string(level); } #ifdef CPU_ONLY diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index b65d39972c..9f319d1325 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -106,5 +106,5 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } -void softmax_cross_entropy_gpu(int x, int y, float_t *in_data, float_t *out_data) { +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data) { } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp new file mode 100644 index 0000000000..fd55aad0e5 --- /dev/null +++ b/libdeepgalois/src/node.cpp @@ -0,0 +1,36 @@ +#include "node.h" + +void edge::alloc() { +#ifdef CPU_ONLY + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; +#else + alloc_gpu(); +#endif +} + +void edge::merge_grads(vec_t *dst) { + assert(grad_ != NULL); + 
dst->resize(ft_dim_); + float_t *pdst = &(*dst)[0]; +#ifdef CPU_ONLY + std::copy(grad_, grad_+ft_dim_, pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; + //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + } +#else + merge_grads_gpu(pdst); +#endif +} + +void edge::clear_grads() { +#ifdef CPU_ONLY + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); +#else + clear_grads_gpu(); +#endif +} + diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu new file mode 100644 index 0000000000..87795390ff --- /dev/null +++ b/libdeepgalois/src/node.cu @@ -0,0 +1,15 @@ +#include "node.h" +#include "cutils.h" + +void edge::alloc_gpu() { + CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads_gpu(float_t *dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); +} + +void edge::clear_grads_gpu() { + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); +} From 159cd9bb730feb4fb83832407fa4969259d4cde2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 09:56:20 -0600 Subject: [PATCH 018/660] add gpu functions --- .../include/layers/graph_conv_layer.h | 8 +-- libdeepgalois/include/layers/layer.h | 2 + libdeepgalois/include/math_functions.hh | 14 ++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 53 +++++++++++-------- libdeepgalois/src/math_functions.cu | 48 +++++++++++++++-- 5 files changed, 94 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index b8b42ca1d0..f0f27687e7 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -48,10 +48,10 @@ class graph_conv_layer: public layer { size_t x; size_t y; size_t z; - vec_t out_temp; - vec_t in_temp; - vec_t trans_data; // y*x - std::vector dropout_mask; // x*y + float_t *out_temp; + float_t *in_temp; + float_t *trans_data; // y*x + unsigned * dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 874ce85d30..fea557a3ff 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,6 +132,8 @@ class layer : public node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x 16, layer1: 16 x E vec_t weight_grad; // weight gradient for updating parameters + float_t *d_W; + float_t *d_weight_grad; float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index b4ce800bb7..691db22a96 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -6,6 +6,11 @@ #include #include "types.h" +extern "C" { +#include +//#include +} + const float negative_slope = 0; void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add @@ -53,13 +58,20 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); +void copy_gpu(size_t len, const float_t *in, float_t *out); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); +void scal_gpu(const int N, const float alpha, float *X); +void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2685629138..11e3a6cadb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -35,15 +35,17 @@ void graph_conv_layer::init() { std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; Timer t_alloc; t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); +#ifdef CPU_ONLY + rand_init_matrix(y, z, W); // randomly initialize trainable parameters //rand_init_matrix(y, z, Q); zero_init_matrix(y, z, weight_grad); - //alloc_grad(); - if (dropout_) dropout_mask.resize(x*y); - in_temp.resize(x*y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data.resize(y*x); // y*x + if (dropout_) dropout_mask = new unsigned[x*y]; + in_temp = new float_t[x*y]; + out_temp = new float_t[x*z]; // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data = new float_t[y*x]; // y*x +#else + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); +#endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } @@ -51,7 +53,6 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -59,9 +60,9 @@ void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_ galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, &in_temp[0], &W[0], &out_temp[0]); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], &out_temp[0]); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, &out_temp[0], out_data); // aggregate + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate if (act_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { relu(z, &out_data[i*z], &out_data[i*z]); @@ -77,14 +78,13 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? 
out_grad[i*z+j] : float_t(0); }, galois::loopname("d_relu")); - //} else copy1D1D(out_grad, out_temp); // TODO: avoid copying - } else copy1D1D(x*z, out_grad, &out_temp[0]); // TODO: avoid copying + } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); + float_t *trans_W = new float[z*y]; transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, &out_temp[0], &trans_W[0], &in_temp[0]); // x*z; z*y -> x*y + matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, &in_temp[0], in_grad, true, context->norm_factor); // x*x; x*y -> x*y + update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); @@ -92,19 +92,30 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o } } // calculate weight gradients - transpose(x, y, in_data, &trans_data[0]); // y*x - matmul1D1D(y, z, x, &trans_data[0], &out_temp[0], &weight_grad[0]); // y*x; x*z; y*z + transpose(x, y, in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else -//void graph_conv_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) {} -//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) {} - // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x*z, out_data, out_data); } // GPU backward void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); + else copy_gpu(x*z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9f319d1325..9fc01278c9 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,9 +1,47 @@ #include "math_functions.hh" #include "context.h" -extern "C" { -#include -//#include +void gpu_rng_uniform(const int n, unsigned *r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +} + +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t{1}) scal_gpu(n, range, r); + if (a != float_t{0}) add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { + 
CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +} + + +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { + if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0/(y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y*z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +} + +void copy_gpu(size_t len, const float_t *in, float_t *out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + //masks[i] = bernoulli(dropout_rate); + out[i] = in[i] * masks[i] * scale; + } +} + +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -13,8 +51,8 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } } -void relu_gpu(const int n, const float_t *in_data, float_t* out_data) { - relu_kernel<<>>(n, in_data, out_data); +void relu_gpu(const int n, const float_t *in, float_t* out) { + relu_kernel<<>>(n, in, out); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { From 0d37df5dfaec00c21c68054d4d68212e888b5900 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 10:09:09 -0600 Subject: [PATCH 019/660] fix bug --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 73152f8792..47ace780b9 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 11e3a6cadb..d335d1be65 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -80,9 +80,10 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o }, galois::loopname("d_relu")); } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer - float_t *trans_W = new float[z*y]; + vec_t trans_W(z*y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y + matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + //sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; 
z*y -> x*y //NOTE: since graph is symmetric, the derivative is the same update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { From 1c1f9db80d01b82f1647c117666174ba521aed7e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 10:49:21 -0600 Subject: [PATCH 020/660] add agg kernel --- libdeepgalois/src/aggregator.cu | 24 ++++++++++++++++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 1 + 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 04f9c1e8f8..aa3b70b7b6 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,11 +1,33 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" +#include "cutils.h" #include "aggregator.h" #include "math_functions.hh" +// TODO: use warp +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) y[i] = alpha * a[i] + b[i]; +} + +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + assert(dst < n); + if (norm) b = a * norm_factor[dst]; + scale_add(len, b, in+dst*len, out+src*len, out+src*len); // out[src] += in[dst] + } + } +} + void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { unsigned n = g.nnodes; - vadd_gpu(len, in, in, out); + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d335d1be65..8c7ba7fc1f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -100,6 +100,7 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o #else // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + assert(y <= 128); // currently only support feature length <= 128 if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); From df3bd07abc2ed44acf469661941b7e5433884467 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 13:11:47 -0600 Subject: [PATCH 021/660] add softmax kernel --- libdeepgalois/include/layers/layer.h | 5 ++- .../include/layers/softmax_loss_layer.h | 2 - libdeepgalois/include/math_functions.hh | 4 +- libdeepgalois/include/net.h | 10 ++--- libdeepgalois/include/types.h | 2 - .../src/layers/softmax_loss_layer.cpp | 21 ++++++--- libdeepgalois/src/math_functions.cu | 43 ++++++++++++++++++- libdeepgalois/src/net.cpp | 6 +-- lonestargnn/gcn/gcn.cpp | 4 +- 9 files changed, 72 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index fea557a3ff..057bc58383 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -55,7 +55,7 @@ class layer : public node { << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << 
output_dims[1] << "]\n"; } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; @@ -123,7 +123,6 @@ class layer : public node { size_t begin_; // sample begin index size_t end_; // sample end index size_t count_; // number of samples - MaskList masks_; // masks to show which samples are valid size_t num_dims; // number of dimensions std::vector input_dims; // input dimensions std::vector output_dims; // output dimentions @@ -134,6 +133,8 @@ class layer : public node { vec_t weight_grad; // weight gradient for updating parameters float_t *d_W; float_t *d_weight_grad; + mask_t *masks_; // masks to show which samples are valid + mask_t *d_masks_; float_t *loss; // error for each vertex: N x 1 Context *context; }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index f4adb51bcd..f6d23f6c5a 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -6,9 +6,7 @@ class softmax_loss_layer: public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer() {} std::string layer_type() const override { return std::string("softmax_loss"); } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); virtual void forward_propagation(const float_t *in_data, float_t *out_data); - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 691db22a96..45a34c7fc6 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -58,6 +58,7 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); void copy_gpu(size_t len, const float_t *in, float_t *out); void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); @@ -70,7 +71,8 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data); +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); void scal_gpu(const int 
N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 66f50a17b6..c2bf8e997e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -59,9 +59,9 @@ class Net { } // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { + acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + layers[num_layers-1]->set_sample_mask(begin, end, count, &masks[0]); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) @@ -83,7 +83,7 @@ class Net { } // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { + double evaluate(size_t begin, size_t end, size_t count, mask_t *masks, acc_t &loss, acc_t &acc) { Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); @@ -99,12 +99,12 @@ class Net { size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs std::vector feature_dims; // feature dimnesions for each layer - MaskList train_mask, val_mask; // masks for traning and validation + std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { AccumF accuracy_all; accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 8d78e03d48..9b483e1d70 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -17,8 +17,6 @@ typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test -typedef std::vector LabelList; // label list to store label for each vertex -typedef std::vector MaskList; // mask list to store mask for each vertex #define CHUNK_SIZE 256 #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index d6969b7a95..7322e916d7 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -3,8 +3,12 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; - loss = new float_t[in_dims[0]]; // error for each sample name_ = layer_type() + "_" + std::to_string(level); +#ifdef CPU_ONLY + loss = new float_t[in_dims[0]]; // error for each sample +#else + out_malloc_device(in_dims[0], masks_, d_masks_, loss); +#endif } #ifdef CPU_ONLY // TODO: need kernel fusion optimization @@ -27,18 +31,21 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, 
float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); - d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); + d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); + } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, out_data); + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9fc01278c9..4097e0410f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -16,6 +16,11 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } +void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss) { + CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); @@ -144,5 +149,41 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, float_t *out_data) { +// TODO: use warp +__device__ void softmax(int n, const float_t *input, float_t *output) { + float_t max = input[0]; + for (size_t i = 1; i < n; i++) if (input[i] > max) max = input[i]; + float_t denominator = 0.0; + for (size_t i = 0; i < n; i++) { + output[i] = exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) output[i] /= denominator; +} + +__device__ void cross_entropy(int n, const label_t idx, float_t *p, float_t &loss) { + if (p[idx] == 0.0) loss -= log(float_t(1e-10)); + else loss -= log(p[idx]); } + +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, + const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len*i], loss[i]); + } + } +} + +void 
softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { + softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); +} + +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 857b7691b5..df775c9504 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -51,8 +51,8 @@ void Net::train(optimizer *opt, bool need_validate) { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict + train_loss = fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, &train_mask[0]); // predict Tfw.stop(); Tbw.start(); bprop(); // back propogation @@ -68,7 +68,7 @@ void Net::train(optimizer *opt, bool need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); + double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 7540a4b0e4..1ef0fa24f2 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -29,14 +29,14 @@ int main(int argc, char** argv) { size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; size_t test_begin = 0, test_end = n, test_count = n; - MaskList test_mask(n, 0); + std::vector test_mask(n, 0); if (dataset == "reddit") { test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); galois::StatTimer Ttest("Test"); Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); + double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; Ttest.stop(); } From a8dc221f6c113f44f92e5da2b603123f69b0d50e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 24 Feb 2020 14:35:06 -0600 Subject: [PATCH 022/660] cmake changes for cpu only compile --- libdeepgalois/CMakeLists.txt | 9 +++++++-- lonestargnn/CMakeLists.txt | 5 ++++- lonestargnn/gcn/CMakeLists.txt | 3 +-- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 47ace780b9..d3176699ab 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp @@ -52,7 +52,12 @@ set(sources ) 
add_library(dg_cpu STATIC ${sources}) -target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +if(USE_CPU) + target_link_libraries(dg_cpu galois_shmem gllvm) +else() + target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) +endif() + target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index e270f63011..00dac96a27 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -4,9 +4,12 @@ include_directories(BEFORE ) include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +if(NOT USE_CPU) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) +endif() SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 715a68d497..3d25bb3966 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -2,6 +2,5 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) if(NOT USE_CPU) target_link_libraries(gcn dg_gpu) + target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) endif() -target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) - From 55f18d88ebd3e1b440f1832c6a41f4efd4ef0c7c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 14:35:44 -0600 Subject: [PATCH 023/660] fix bug --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/utils.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 47ace780b9..50e05c53f8 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index caf27c56a3..3ca868f501 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -8,7 +8,7 @@ #include #include -const std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset +const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { From e13ba6ccd73e04123db170049a92ac48f324b9c6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 15:23:28 -0600 Subject: [PATCH 024/660] compile on faraday --- libdeepgalois/CMakeLists.txt | 6 +++--- libdeepgalois/src/math_functions.cpp | 9 +++++++++ lonestargnn/CMakeLists.txt | 4 ++-- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d3176699ab..625ab3b6a4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB 
/net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include) @@ -40,7 +40,7 @@ else() #cuda_compile(AGG_O src/aggregator.cu) endif() -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -mavx2") +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 98535d98bd..97f6c1198e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -9,6 +9,7 @@ extern "C" { } // vector add +#if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t &a, const vec_t &b, vec_t &out) { //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; size_t n = out.size(); @@ -26,6 +27,14 @@ void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } +#else +void vadd(const vec_t &a, const vec_t &b, vec_t &out) { + for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; +} +void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { + for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; +} +#endif // vector subtract void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 00dac96a27..e48887e261 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -11,8 +11,8 @@ if(NOT USE_CPU) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) if(USE_CPU) From da535bd5cb9ba6a174bb6f9d37212a6abab0da4a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 15:29:03 -0600 Subject: [PATCH 025/660] fix --- libdeepgalois/include/utils.h | 2 +- libdeepgalois/src/math_functions.cpp | 2 +- libdeepgalois/src/math_functions.cu | 35 ++++++++++++++++++++++++---- libdeepgalois/src/node.cpp | 4 ++-- 4 files changed, 35 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 3ca868f501..63d0f74ff7 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -84,7 +84,7 @@ uniform_rand(T min, T max) { } inline bool bernoulli(float_t p) { - return uniform_rand(float_t{0}, float_t{1}) <= p; + return uniform_rand(float_t(0), float_t(1)) <= p; } inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 97f6c1198e..7e0b805e05 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -290,7 +290,7 @@ void relu(const vec_t &in, vec_t &out) { void relu(size_t n, const float_t *in, float_t *out) { for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t{0}); + out[i] = std::max(in[i], float_t(0)); } void d_relu(const vec_t &in_diff, const vec_t &fv, 
vec_t &out_diff) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 4097e0410f..781e4a083a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -8,8 +8,8 @@ void gpu_rng_uniform(const int n, unsigned *r) { void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); const float range = b - a; - if (range != float_t{1}) scal_gpu(n, range, r); - if (a != float_t{0}) add_scalar_gpu(n, a, r); + if (range != float_t(1)) scal_gpu(n, range, r); + if (a != float_t(0)) add_scalar_gpu(n, a, r); } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { @@ -161,11 +161,28 @@ __device__ void softmax(int n, const float_t *input, float_t *output) { for (size_t i = 0; i < n; i++) output[i] /= denominator; } -__device__ void cross_entropy(int n, const label_t idx, float_t *p, float_t &loss) { +// TODO: use warp +__device__ void d_softmax(size_t n, const float_t *p, const float_t *dp, float_t *dy) { + for (size_t i = 0; i < n; i++) { + dy[i] = 0; + for (size_t j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void cross_entropy(int n, const label_t idx, const float_t *p, float_t &loss) { if (p[idx] == 0.0) loss -= log(float_t(1e-10)); else loss -= log(p[idx]); } +__device__ void d_cross_entropy(int n, const label_t idx, const float_t *p, float_t *d) { + for (int i = 0; i < n; i++) + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; +} + // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss @@ -184,6 +201,16 @@ void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t * softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); } -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, + const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; + d_cross_entropy(len, labels[i], out+len*i, out_grad); + d_softmax(len, out+len*i, out_grad, diff+len*i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index fd55aad0e5..e4cf43dd21 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -27,8 +27,8 @@ void edge::merge_grads(vec_t *dst) { void edge::clear_grads() { #ifdef CPU_ONLY - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t{0}); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t{0}); + std::fill(grad_, grad_+ft_dim_*num_samples_, float_t(0)); // TODO: need vectorize + //vectorize::fill(&grad_[0], grad_.size(), float_t(0)); #else clear_grads_gpu(); #endif From b1269ca62ad70f1eda99849a05a4559575354e57 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 19:32:52 -0600 Subject: [PATCH 026/660] add context.cu --- libdeepgalois/include/aggregator.h | 2 - libdeepgalois/include/context.h | 30 +++---- libdeepgalois/include/layers/layer.h | 17 +--- 
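One observation about the d_softmax_cross_entropy_kernel added above: it composes d_cross_entropy with the full O(len^2) d_softmax Jacobian and relies on the fixed float_t out_grad[41] scratch array, which silently assumes at most 41 classes. For a one-hot target the composition reduces analytically to p - y, so a fused variant needs neither the scratch buffer nor the inner Jacobian loop. A sketch of that equivalent formulation, keeping the patch's signature and adding the mask check that the forward pass uses (this is an alternative form, not the kernel this commit adds):

__global__ void d_softmax_cross_entropy_fused(int n, int len, const mask_t* masks,
                                              const label_t* labels, const float_t* out,
                                              float_t* diff) {
  CUDA_KERNEL_LOOP(i, n) {
    if (masks[i] == 1) {
      // gradient of cross_entropy(softmax(x)) w.r.t. x is p - y for a one-hot y
      for (int j = 0; j < len; j++)
        diff[len * i + j] = out[len * i + j] - (j == (int)labels[i] ? float_t(1) : float_t(0));
    }
  }
}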
.../include/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/math_functions.hh | 1 + libdeepgalois/include/types.h | 2 + libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/context.cpp | 79 ++---------------- libdeepgalois/src/context.cu | 80 +++++++++++++++++++ .../src/layers/softmax_loss_layer.cpp | 22 ++++- libdeepgalois/src/math_functions.cu | 24 ++++++ libdeepgalois/src/net.cpp | 2 - libdeepgalois/src/node.cu | 2 +- 13 files changed, 149 insertions(+), 115 deletions(-) create mode 100644 libdeepgalois/src/context.cu diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 1ae8d062ae..78749104cf 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -5,8 +5,6 @@ void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #else #include "graph_gpu.h" -#define TB_SIZE 256 -#define WARP_SIZE 32 void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 5a362804cc..884cf51c1a 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,25 +3,19 @@ #include #include "types.h" #include "utils.h" -#include "lgraph.h" #ifdef CPU_ONLY +#include "lgraph.h" #include "gtypes.h" #else #include "graph_gpu.h" -#endif #include "cutils.h" +#endif class Context { public: Context(); ~Context(); enum Brew { CPU, GPU }; - //static Context& Get(); -#ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } - //static void create_blas_handle(); -#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -46,21 +40,25 @@ class Context { size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph &lg, Graph &g); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); -#else - CSRGraph graph_gpu; // the input graph, |V| = N label_t *d_labels; // labels on device - float_t *d_norm_factor; // norm_factor on device float_t *d_feats; // input features on device + float_t *d_norm_factor; // norm_factor on device + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph &lg, Graph &g); +#else + CSRGraph graph_gpu; // the input graph, |V| = N + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } + void norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor); #endif protected: @@ -69,8 +67,6 @@ class Context { static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; - //shared_ptr random_generator_; - // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; diff --git 
a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 057bc58383..83547a7f1f 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -41,8 +41,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context *ctx) { context = ctx; } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = 0; - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual acc_t get_masked_loss() { return acc_t(0); } virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; @@ -103,20 +102,6 @@ class layer : public node { //prev()->clear_grads(); next()->clear_grads(); } - inline acc_t get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index f6d23f6c5a..78166b2fb5 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -8,5 +8,6 @@ class softmax_loss_layer: public layer { std::string layer_type() const override { return std::string("softmax_loss"); } virtual void forward_propagation(const float_t *in_data, float_t *out_data); virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); + virtual acc_t get_masked_loss(); }; diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 45a34c7fc6..810bb894b1 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -75,5 +75,6 @@ void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_ void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); #endif diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 9b483e1d70..720c2ae2b8 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -18,5 +18,7 @@ typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test #define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index aa3b70b7b6..f8d138ca76 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -30,4 +30,4 @@ void update_all(size_t len, CSRGraph &g, const float_t 
*in, float_t *out, bool n CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); } - + diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a275cb3b4c..50e954a19b 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,59 +1,10 @@ #include "context.h" #include "gtypes.h" -#include -#include - -// random seeding -int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { } Context::~Context() {} -#else -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; - -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { -//void Context::create_blas_handle() { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} - -Context::~Context() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -} - -void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} #endif size_t Context::read_graph(std::string dataset_str) { @@ -98,22 +49,8 @@ void Context::genGraph(LGraph &lg, Graph &g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -float_t * Context::get_in_ptr() { return &h_feats[0]; } -#else -size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); - return graph_gpu.nnodes; -} -void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); -} -float_t * Context::get_in_ptr() { return d_feats; } +float_t * Context::get_in_ptr() { return &h_feats[0]; } #endif // user-defined pre-computing function, called during initialization @@ -122,19 +59,13 @@ void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); + auto degree = 
std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); -#endif -} - -void Context::degree_counting() { -#ifdef CPU_ONLY - degrees.resize(n); - galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - degrees[v] = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); - }, galois::loopname("DegreeCounting")); +#else + norm_factor_counting_gpu(n, graph_gpu, d_norm_factor); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu new file mode 100644 index 0000000000..626c5abc0a --- /dev/null +++ b/libdeepgalois/src/context.cu @@ -0,0 +1,80 @@ +#include +#include +#include +#include +#include "context.h" + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} + +__global__ void norm_factor_counting_kernel(size_t n, CSRGraph graph, float_t *norm_factor) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_factor[i] = 0.0; + else norm_factor[i] = 1.0 / temp; + } +} + +void Context::norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor) { + CUDA_CHECK(cudaMalloc((void **)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>(n, graph, norm_factor); +} + +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +Context::Context() : mode_(Context::GPU), solver_count_(1), + solver_rank_(0), multiprocess_(false) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::~Context() { + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + return graph_gpu.nnodes; +} + +void Context::copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +float_t * Context::get_in_ptr() { 
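The factor computed here, norm_factor[v] = 1/sqrt(deg(v)), is one half of GCN's symmetric normalization: when an edge (v, u) is aggregated, the two endpoint factors are typically multiplied so each neighbor contribution is scaled by 1/sqrt(deg(v) * deg(u)), i.e. the renormalized adjacency D^-1/2 A D^-1/2. A minimal CPU sketch of such an aggregation (the actual implementation is update_all in src/aggregator.cpp; edge_begin/edge_end are used elsewhere in this patch, while getEdgeDst and the loop shape are assumed from the usual Galois CSR graph interface):

void aggregate_normalized(size_t len, Graph& g, const float_t* in, float_t* out,
                          const float_t* norm_factor) {
  galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto v) {
    for (size_t j = 0; j < len; j++) out[v * len + j] = 0;
    for (auto e = g.edge_begin(v); e != g.edge_end(v); e++) {
      auto u = g.getEdgeDst(e);
      float_t w = norm_factor[v] * norm_factor[u];  // 1/sqrt(deg(v) * deg(u))
      for (size_t j = 0; j < len; j++)
        out[v * len + j] += w * in[u * len + j];
    }
  }, galois::loopname("AggregateNorm"));
}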
return d_feats; } + diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7322e916d7..15e7009da6 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -14,7 +14,6 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di // TODO: need kernel fusion optimization // y[i] = e^x[i] / Σ e^x[k] void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { -//void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked @@ -27,7 +26,6 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); } -//void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -40,6 +38,22 @@ void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); @@ -48,4 +62,8 @@ void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *ou void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); } + +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, masks_, loss); +} #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 781e4a083a..98e91472aa 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,8 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" void gpu_rng_uniform(const int n, unsigned *r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -214,3 +217,24 @@ void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); } +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) + //total +=
loss[begin+i]; + total.reduce(loss[begin+i]); + } + total.thread_exit >(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()); +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index df775c9504..1d81ea1012 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,10 +2,8 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); - //Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 87795390ff..da79217231 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -11,5 +11,5 @@ void edge::merge_grads_gpu(float_t *dst) { } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_*num_samples_*sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From 2f43132241a715afc2fa93192d83a29547b9d25a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 24 Feb 2020 22:16:58 -0600 Subject: [PATCH 027/660] copy graph to gpu --- libdeepgalois/include/context.h | 26 +++++++++++++------------- libdeepgalois/include/cutils.h | 12 ++++++++++++ libdeepgalois/src/context.cpp | 2 +- libdeepgalois/src/context.cu | 20 +++++++++++++------- 4 files changed, 39 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 884cf51c1a..198b0cc9dc 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -31,18 +31,7 @@ class Context { label_t get_label(size_t i) { return labels[i]; } label_t *get_labels_ptr(size_t i) { return &(labels[0]); } float_t * get_in_ptr(); - void degree_counting(); - void norm_factor_counting(); - std::vector labels; // labels for classification: N x 1 - float_t *norm_factor; // normalization constant based on graph structure - std::vector degrees; - vec_t h_feats; // input features: N x D - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - label_t *d_labels; // labels on device - float_t *d_feats; // input features on device - float_t *d_norm_factor; // norm_factor on device + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features @@ -50,6 +39,18 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); + void norm_factor_counting_gpu(); + + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + label_t *d_labels; // labels on device + vec_t h_feats; // input features: N x D + float_t *d_feats; // input features on device + float_t *norm_factor; // normalization constant based on graph structure + float_t 
*d_norm_factor; // norm_factor on device #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N @@ -58,7 +59,6 @@ class Context { CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - void norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor); #endif protected: diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 924dfd06e7..7d9eef3bb1 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,18 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +static unsigned CudaTest(const char *msg) { + cudaError_t e; + //cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 50e954a19b..a500c02125 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -65,7 +65,7 @@ void Context::norm_factor_counting() { else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); #else - norm_factor_counting_gpu(n, graph_gpu, d_norm_factor); + norm_factor_counting_gpu(); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 626c5abc0a..182deeaed0 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -21,17 +21,20 @@ int64_t cluster_seedgen(void) { return seed; } -__global__ void norm_factor_counting_kernel(size_t n, CSRGraph graph, float_t *norm_factor) { +__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, float_t *norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_factor[i] = 0.0; - else norm_factor[i] = 1.0 / temp; + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; } } -void Context::norm_factor_counting_gpu(size_t n, CSRGraph graph, float_t *norm_factor) { - CUDA_CHECK(cudaMalloc((void **)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>(n, graph, norm_factor); +void Context::norm_factor_counting_gpu() { + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; + assert(graph_gpu.nnodes == n); + CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>(n, graph_gpu, d_norm_factor); + CudaTest("solving norm_factor_counting kernel failed"); } cublasHandle_t Context::cublas_handle_ = 0; @@ -65,11 +68,14 @@ void Context::SetDevice(const int device_id) { size_t Context::read_graph_gpu(std::string dataset_str) { std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); + CSRGraph g; + g.read(filename.c_str(), false); + g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } void Context::copy_data_to_device() { + assert(labels.size() == n); CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); From 04a4e3b13f0cff7383ffdfdfdc946e9a4f8203ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 
08:39:41 -0600 Subject: [PATCH 028/660] add optimizer.cu --- libdeepgalois/include/cutils.h | 2 +- libdeepgalois/include/layers/layer.h | 17 +-- libdeepgalois/include/math_functions.hh | 8 +- libdeepgalois/include/node.h | 1 - libdeepgalois/include/optimizer.h | 112 ++++-------------- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +- .../src/layers/softmax_loss_layer.cpp | 4 +- libdeepgalois/src/math_functions.cu | 8 +- libdeepgalois/src/node.cpp | 2 + libdeepgalois/src/optimizer.cpp | 76 ++++++++++++ libdeepgalois/src/optimizer.cu | 4 + 11 files changed, 133 insertions(+), 108 deletions(-) create mode 100644 libdeepgalois/src/optimizer.cpp create mode 100644 libdeepgalois/src/optimizer.cu diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 7d9eef3bb1..3710b50ec9 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,7 +13,7 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } -static unsigned CudaTest(const char *msg) { +inline unsigned CudaTest(const char *msg) { cudaError_t e; //cudaThreadSynchronize(); cudaDeviceSynchronize(); diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 83547a7f1f..c022b1be46 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -59,6 +59,9 @@ class layer : public node { end_ = sample_end; count_ = sample_count; masks_ = masks; +#ifndef CPU_ONLY + copy_masks_device(input_dims[0], masks_, d_masks_); +#endif } void set_in_data(float_t *data) { assert(data.size() == input_dims[0]*input_dims[1]); @@ -77,11 +80,9 @@ class layer : public node { next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); - //next_->get_data().resize(output_dims[0]*output_dims[1]); } void alloc_grad() { // allocate memory for intermediate gradients - //next_->get_gradient().resize(output_dims[0]*output_dims[1]); } void forward() { forward_propagation(prev()->get_data(), next()->get_data()); @@ -90,15 +91,15 @@ class layer : public node { back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); //vec_t diff; //prev()->merge_grads(&diff); - //auto in_data = prev()->get_data(); - //float_t rcp_batch_size = float_t(1.0) / in_data.size(); - //for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; +#ifdef CPU_ONLY + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
+ bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad +#else + opt->update_gpu(d_weight_grad, d_W); // W += grad +#endif //prev()->clear_grads(); next()->clear_grads(); } diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 810bb894b1..bf2dafbc5d 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -58,10 +58,7 @@ float_t cross_entropy(size_t n, const float_t *y, const float_t *p); void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); -void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); void copy_gpu(size_t len, const float_t *in, float_t *out); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative @@ -77,4 +74,9 @@ void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); +void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); +void loss_malloc_device(int n, float_t *loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); + #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 98b97b2c55..5a3cf3f83f 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -29,7 +29,6 @@ class edge { public: edge(node *prev, size_t n, size_t len) : num_samples_(n), ft_dim_(len), - //data_(vec_t(n*len)), grad_(vec_t(n*len)), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index d0f35eac11..2af75a4966 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -14,6 +14,7 @@ struct optimizer { optimizer &operator=(optimizer &&) = default; virtual ~optimizer() = default; virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; + virtual void update_gpu(const float_t *dW, float_t *W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -40,20 +41,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t 
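A caveat on the allocation helpers declared in math_functions.hh above (copy_masks_device, loss_malloc_device, and gconv_malloc_device): each takes the destination pointer by value, and in the .cu definitions cudaMalloc writes the new device address into that local copy, so the caller's d_masks_, loss, or weight buffers are left unchanged by the call as declared. Passing the pointer by reference (or as a double pointer) lets the allocation propagate back; a minimal sketch of the adjusted shape, assuming cutils.h for CUDA_CHECK (this is a possible fix, not what this commit declares):

#include <cassert>
#include "cutils.h"

// take the destination pointer by reference so the caller sees the new device address
void loss_malloc_device(int n, float_t*& loss) {
  CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t)));
}

void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) {
  assert(h_masks != NULL);
  CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t)));
  CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice));
}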
*dW, float_t *W) {} float_t alpha; // learning rate private: float_t eps; @@ -67,13 +56,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -83,23 +67,10 @@ struct RMSprop : public stateful_optimizer<1> { // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(0.01), b1(float_t(0.9)), - b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } + adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W); float_t alpha; // learning rate float_t b1; // decay term @@ -118,24 +89,11 @@ struct adam : public stateful_optimizer<2> { * */ struct adamax : public stateful_optimizer<2> { - adamax() - : alpha(float_t(0.002)), - b1(float_t(0.9)), - b2(float_t(0.999)), - b1_t(b1), - eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; - } + adamax() : alpha(float_t(0.002)), + b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(b1), eps(float_t(1e-8)) {} + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -146,18 +104,12 @@ struct adamax : public stateful_optimizer<2> { float_t eps; // constant value to avoid zero-division }; -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ +// SGD without momentum +// slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); - } + void update(const vec_t &dW, vec_t &W, bool 
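Among the optimizers, only adam declares update_gpu without an inline empty body, yet the src/optimizer.cu added by this commit still leaves that body empty. Since the CPU update() fully specifies the rule (exponential moving averages of the gradient and its square, then a bias-corrected step), the device version is essentially a transliteration; the kernel below is a sketch of it, where n, mt, and vt stand for the parameter count and device-resident moment buffers that the optimizer does not yet allocate:

__global__ void adam_update_kernel(int n, float_t alpha, float_t b1, float_t b2,
                                   float_t b1_t, float_t b2_t, float_t eps,
                                   const float_t* dW, float_t* mt, float_t* vt, float_t* W) {
  CUDA_KERNEL_LOOP(i, n) {
    mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i];          // first moment estimate
    vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i];  // second moment estimate
    W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) /
            sqrt((vt[i] / (float_t(1) - b2_t)) + eps);       // bias-corrected L2-norm update
  }
}

A wrapper implementing adam::update_gpu would launch this with CUDA_GET_BLOCKS(n) blocks of CUDA_NUM_THREADS threads and then advance b1_t and b2_t on the host, mirroring the end of the CPU update().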
parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -172,18 +124,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, galois::loopname("momentum_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -201,18 +143,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, galois::loopname("nesterov_momentum_update")); - } + void update(const vec_t &dW, vec_t &W, bool parallelize); + void update_gpu(const float_t *dW, float_t *W) {} float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8c7ba7fc1f..b81589b741 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -32,9 +32,9 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bia } void graph_conv_layer::init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); + //std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters //rand_init_matrix(y, z, Q); @@ -47,7 +47,7 @@ void graph_conv_layer::init() { gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); #endif t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + //std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY @@ -101,6 +101,9 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o // GPU forward void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { assert(y <= 128); // currently only support feature length <= 128 + assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 15e7009da6..430e1f253b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -4,11 +4,13 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_di std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); + std::cout << name_ << ": allocating memory for intermediate data... "; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif + std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 98e91472aa..34b426386a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -19,10 +19,14 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void out_malloc_device(int n, mask_t *h_masks, mask_t *d_masks, float_t *loss) { +void loss_malloc_device(int n, float_t *loss) { + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} + +void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks) { + assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index e4cf43dd21..b08cf3c51c 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,8 @@ #include "node.h" +#include void edge::alloc() { + std::cout << "Allocating memory for tensors (intermediate features and gradients... 
"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp new file mode 100644 index 0000000000..3372378de1 --- /dev/null +++ b/libdeepgalois/src/optimizer.cpp @@ -0,0 +1,76 @@ +#include "optimizer.h" +#include "galois/Galois.h" + +void adagrad::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } +} + +void RMSprop::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); +} + +void adam::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; +} + +void adamax::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &mt = get<0>(W); + vec_t &ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); + b1_t *= b1; +} + +void gradient_descent::update(const vec_t &dW, vec_t &W, bool parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); +} + +void momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); +} + +void nesterov_momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { + vec_t &dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); +} + diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu new file mode 100644 index 0000000000..832da51cbf --- /dev/null +++ b/libdeepgalois/src/optimizer.cu @@ -0,0 +1,4 @@ +#include "optimizer.h" + +void adam::update_gpu(const float_t *dW, float_t *W) { +} From 3fd5da6761d18ff3b639683997930ed3f97a1c5a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 08:44:40 -0600 Subject: [PATCH 029/660] fix bug --- libdeepgalois/include/optimizer.h | 4 ++++ 1 file changed, 4 
insertions(+) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 2af75a4966..072eb7d2bc 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -70,7 +70,11 @@ struct adam : public stateful_optimizer<2> { adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t &dW, vec_t &W, bool parallelize); +#ifdef CPU_ONLY + void update_gpu(const float_t *dW, float_t *W) {} +#else void update_gpu(const float_t *dW, float_t *W); +#endif float_t alpha; // learning rate float_t b1; // decay term From 63cfd0f876c31f0bee17825a0d641afaabeb8b8b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 08:51:26 -0600 Subject: [PATCH 030/660] update CMakeLists.txt --- libdeepgalois/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 625ab3b6a4..1ce41abc73 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -30,6 +30,8 @@ else() set(CUDA_SOURCES src/math_functions.cu src/aggregator.cu + src/optimizer.cu + src/context.cu src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) @@ -46,6 +48,7 @@ set(sources src/layers/softmax_loss_layer.cpp src/math_functions.cpp src/aggregator.cpp + src/optimizer.cpp src/context.cpp src/node.cpp src/net.cpp From 06964ac4e5f687c53d48d20009b9972a08a06861 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 11:58:04 -0600 Subject: [PATCH 031/660] fix gpu memory alloc --- libdeepgalois/include/layers/layer.h | 3 ++ libdeepgalois/include/math_functions.hh | 14 ++--- libdeepgalois/src/aggregator.cu | 2 + libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 30 +++++------ libdeepgalois/src/math_functions.cu | 52 +++++++++++++++++-- libdeepgalois/src/node.cpp | 2 +- 7 files changed, 77 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index c022b1be46..11f82b1486 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -85,12 +85,15 @@ class layer : public node { // allocate memory for intermediate gradients } void forward() { + std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } void backward() { + std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer *opt) { + std::cout << name_ << ": weight updating ... 
"; //vec_t diff; //prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index bf2dafbc5d..2e435d60e2 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -44,11 +44,11 @@ void clear(size_t n, float_t *in); void relu(const vec_t &in, vec_t &out); // ReLU void relu(size_t n, const float_t *in, float_t *out); // ReLU void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out); -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff); +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out); +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); +void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff); void softmax(const vec_t &input, vec_t &output); void softmax(size_t n, const float_t *input, float_t *output); void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); @@ -62,8 +62,8 @@ void copy_gpu(size_t len, const float_t *in, float_t *out); void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out); // dropout -void d_dropout_gpu(const float scale, const float_t *in_diff, const unsigned *mask, float_t *out_diff); // dropout derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index f8d138ca76..a6b61ce914 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -26,8 +26,10 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const floa } void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + std::cout << "[debug]: update_all on GPU\n"; 
unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b81589b741..86f39ade20 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,7 +119,7 @@ void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *o if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + if (dropout_) d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 7e0b805e05..6b41afb020 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -318,37 +318,37 @@ float reduce_mean(const vec_t &x) { return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, vec_t &out) { - assert(mask.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out) { + assert(masks.size() == out.size()); + //rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &mask, float_t *out) { +void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out) { for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *mask, float_t *out) { +void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { for (size_t i = 0; i < n; ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } -void d_dropout(const float scale, const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { +void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *mask, float_t *out_diff) { +void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff) { for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } float_t sigmoid_func(float_t x) { diff --git 
a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 34b426386a..415e141ec9 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -3,6 +3,8 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" +#include + void gpu_rng_uniform(const int n, unsigned *r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -45,15 +47,47 @@ void copy_gpu(size_t len, const float_t *in, float_t *out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, float_t* out) { +__global__ void setup_curand_kernel(const int n, curandState *state) { + CUDA_KERNEL_LOOP(i, n) { + curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } +} + +__device__ bool bernoulli_gpu(int tid, curandState *state, float_t p) { + curandState local_state = state[tid]; + return curand_uniform(&local_state) <= p; +} + +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - //masks[i] = bernoulli(dropout_rate); + masks[i] = bernoulli_gpu(i, state, dropout_rate); out[i] = in[i] * masks[i] * scale; } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, out); + curandState *devStates; + CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); + std::cout << "[debug]: setup curand, n = " << n << "\n"; + setup_curand_kernel<<>>(n, devStates); + CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + std::cout << "[debug]: dropout_gpu done\n"; +} + +__global__ void d_dropout_kernel(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { + CUDA_KERNEL_LOOP(i, n) { + out[i] = in[i] * masks[i] * scale; + } +} + +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { + d_dropout_kernel<<>>(n, scale, in, masks, out); + CudaTest("solving dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -64,7 +98,9 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t *in, float_t* out) { + std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { @@ -75,6 +111,7 @@ __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -89,6 +126,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + 
std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); @@ -134,6 +172,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { @@ -144,6 +183,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { @@ -154,6 +194,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, flo void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -206,12 +247,13 @@ __global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_d void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); } __global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; + float_t out_grad[41]; // TODO d_cross_entropy(len, labels[i], out+len*i, out_grad); d_softmax(len, out+len*i, out_grad, diff+len*i); } @@ -219,6 +261,7 @@ __global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); } __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { @@ -238,6 +281,7 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, flo *(total_loss.cpu_wr_ptr()) = 0; loss_accum.rv = total_loss.gpu_wr_ptr(); masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); cudaDeviceSynchronize(); return *(total_loss.cpu_rd_ptr()); } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index b08cf3c51c..5b60a9f22a 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -2,7 +2,7 @@ #include void edge::alloc() { - std::cout << "Allocating memory for tensors (intermediate features and gradients... 
"; + //std::cout << "Allocating memory for tensors (intermediate features and gradients) ...\n"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; From e559ed12a69e1be1b93ea7b911cd6dac5b6429bd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 13:25:55 -0600 Subject: [PATCH 032/660] ran clang-format on lonestargnn --- lonestargnn/gcn/gcn.cpp | 75 +++++++++++--------- lonestargnn/graphsage/gs-mean.cpp | 56 ++++++++------- lonestargnn/lonestargnn.h | 114 ++++++++++++++++++++---------- 3 files changed, 147 insertions(+), 98 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 1ef0fa24f2..9bfe231181 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -7,40 +7,47 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarGnnStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(dataset, epochs, hidden1); - network.construct_layers(); // default setting for now; can be customized by the user - network.print_layers_info(); - ResourceManager rm; + galois::SharedMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(dataset, epochs, hidden1); + network.construct_layers(); // default setting for now; can be customized by + // the user + network.print_layers_info(); + ResourceManager rm; - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt, do_validate); // do training using training samples - Ttrain.stop(); + // the optimizer used to update parameters, see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + optimizer* opt = new adam(); + galois::StatTimer Ttrain("TrainAndVal"); + Ttrain.start(); + network.train(opt, do_validate); // do training using training samples + Ttrain.stop(); - if (do_test) { - // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - std::vector test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; - for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; + if (do_test) { + // test using test samples + size_t n = network.get_nnodes(); + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 0, test_end = n, test_count = n; + std::vector test_mask(n, 0); + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; + for (size_t i = test_begin; i < test_end; i++) + test_mask[i] = 1; + } else + test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + 
galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = network.evaluate(test_begin, test_end, test_count, + &test_mask[0], test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss + << " test_acc = " << test_acc << " test_time = " << test_time + << "\n"; + Ttest.stop(); + } + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; } - diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestargnn/graphsage/gs-mean.cpp index b70cdc183c..4bd80e6203 100644 --- a/lonestargnn/graphsage/gs-mean.cpp +++ b/lonestargnn/graphsage/gs-mean.cpp @@ -6,36 +6,40 @@ const char* name = "GraphSage"; const char* desc = "A graph neural network variant: GraphSAGE"; const char* url = 0; -class GraphSageMean: public graph_conv_layer { - // user-defined combine function +class GraphSageMean : public graph_conv_layer { + // user-defined combine function }; int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how to customize it by the user - ResourceManager rm; + galois::SharedMemSys G; + LonestarStart(argc, argv, name, desc, url); + Net network; // the neural network to train + network.init(); // default setting for now; see its implementation to find how + // to customize it by the user + ResourceManager rm; - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); + // the optimizer used to update parameters, see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + optimizer* opt = new adam(); + galois::StatTimer Ttrain("Train"); + Ttrain.start(); + network.train(opt); // do training using training samples + Ttrain.stop(); - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); + // test using test samples + acc_t test_loss = 0.0, test_acc = 0.0; + size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 + // TODO: replace ad-hoc settings + galois::StatTimer Ttest("Test"); + Ttest.start(); + double test_time = + network.evaluate(test_begin, test_end, test_loss, test_acc); + std::cout << "\nTesting: test_loss = " << test_loss + << " test_acc = " << test_acc << " test_time = " << test_time + << "\n"; + Ttest.stop(); - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; + std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + return 0; } - diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index cbf3c1ae2a..7ecbe32d7a 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -12,17 +12,41 @@ #include namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the 
input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt + dataset(cll::Positional, cll::desc(""), + cll::Required); // 'cora', 'citeseer', 'pubmed' +static cll::opt + filetype(cll::Positional, cll::desc(""), + cll::init("gr")); // file format of the input graph +static cll::opt + model("m", cll::desc("Model string"), + cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt + learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), + cll::init(0.01)); +static cll::opt + epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), + cll::init(1)); +static cll::opt + hidden1("h", + cll::desc("Number of units in hidden layer 1 (default value 16)"), + cll::init(16)); +static cll::opt dropout_rate( + "d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), + cll::init(0.5)); +static cll::opt weight_decay( + "wd", + cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), + cll::init(5e-4)); +static cll::opt early_stopping( + "es", + cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), + cll::init(10)); +static cll::opt max_degree( + "md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), + cll::init(3)); +static cll::opt do_validate("dv", cll::desc("enable validation"), + cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); //! standard global options to the benchmarks @@ -31,40 +55,54 @@ extern llvm::cl::opt numThreads; extern llvm::cl::opt statFile; //! 
standard global options to the benchmarks -llvm::cl::opt skipVerify("noverify", llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); -llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); -llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); +llvm::cl::opt + skipVerify("noverify", + llvm::cl::desc("Skip verification step (default value false)"), + llvm::cl::init(false)); +llvm::cl::opt + numThreads("t", llvm::cl::desc("Number of threads (default value 1)"), + llvm::cl::init(1)); +llvm::cl::opt statFile( + "statFile", + llvm::cl::desc("ouput file to print stats to (default value empty)"), + llvm::cl::init("")); static void LonestarGnnPrintVersion() { - std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" << galois::getRevision() << ")\n"; + std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; } //! initialize lonestargnn benchmark -void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { - llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); - llvm::cl::ParseCommandLineOptions(argc, argv); - numThreads = galois::setActiveThreads(numThreads); - galois::runtime::setStatFile(statFile); - LonestarGnnPrintVersion(); - std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? app : "unspecified") << "\n"; - if (desc) std::cout << desc << "\n"; - if (url) std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); - char name[256]; - gethostname(name, 256); - galois::runtime::reportParam("(NULL)", "Hostname", name); +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + numThreads = galois::setActiveThreads(numThreads); + galois::runtime::setStatFile(statFile); + LonestarGnnPrintVersion(); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); } #include "types.h" #include "utils.h" #include "net.h" - From 172e69316f85c63e109674a3a37d2d36f23a363c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 13:32:38 -0600 Subject: [PATCH 033/660] clangformat on libdeepgalois --- libdeepgalois/include/aggregator.h | 9 +- libdeepgalois/include/context.h | 116 +-- libdeepgalois/include/cutils.h | 71 +- libdeepgalois/include/gtypes.h | 7 +- .../include/layers/arithmetic_layer.h | 34 +- .../include/layers/graph_conv_layer.h | 127 +-- libdeepgalois/include/layers/layer.h | 235 +++--- libdeepgalois/include/layers/linear_layer.h | 44 +- libdeepgalois/include/layers/relu_layer.h | 24 +- .../include/layers/softmax_loss_layer.h | 18 +- libdeepgalois/include/lgraph.h | 305 +++---- libdeepgalois/include/math_functions.hh | 150 ++-- libdeepgalois/include/net.h | 194 ++--- libdeepgalois/include/node.h | 77 +- libdeepgalois/include/optimizer.h | 238 ++++-- libdeepgalois/include/random.h | 81 +- libdeepgalois/include/timer.h | 33 +- libdeepgalois/include/types.h | 14 +- libdeepgalois/include/utils.h | 164 ++-- libdeepgalois/src/aggregator.cpp | 41 +- libdeepgalois/src/aggregator.cu | 50 +- libdeepgalois/src/context.cpp | 304 ++++--- libdeepgalois/src/context.cu | 125 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 228 +++--- libdeepgalois/src/layers/relu_layer.cpp | 46 +- .../src/layers/softmax_loss_layer.cpp | 109 +-- libdeepgalois/src/math_functions.cpp | 762 +++++++++--------- libdeepgalois/src/math_functions.cu | 395 +++++---- libdeepgalois/src/net.cpp | 161 ++-- libdeepgalois/src/node.cpp | 39 +- libdeepgalois/src/node.cu | 13 +- libdeepgalois/src/optimizer.cpp | 133 +-- libdeepgalois/src/optimizer.cu | 3 +- 33 files changed, 2372 insertions(+), 1978 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 78749104cf..01b1a1e8c8 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -2,9 +2,12 @@ #include "types.h" #ifdef CPU_ONLY #include "gtypes.h" -void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); #else #include "graph_gpu.h" -void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor); +#define TB_SIZE 256 +#define WARP_SIZE 32 +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); #endif - diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 198b0cc9dc..688ed9a2a5 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,76 +3,82 @@ #include #include "types.h" #include "utils.h" -#ifdef CPU_ONLY #include "lgraph.h" +#ifdef CPU_ONLY #include "gtypes.h" #else #include "graph_gpu.h" -#include "cutils.h" #endif +#include "cutils.h" class Context { 
public: - Context(); - ~Context(); - enum Brew { CPU, GPU }; - Brew mode() { return mode_; } - void set_mode(Brew mode) { mode_ = mode; } - int solver_count() { return solver_count_; } - void set_solver_count(int val) { solver_count_ = val; } - int solver_rank() { return solver_rank_; } - void set_solver_rank(int val) { solver_rank_ = val; } - bool multiprocess() { return multiprocess_; } - void set_multiprocess(bool val) { multiprocess_ = val; } - bool root_solver() { return solver_rank_ == 0; } - size_t read_graph(std::string dataset_str); - size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str); - label_t get_label(size_t i) { return labels[i]; } - label_t *get_labels_ptr(size_t i) { return &(labels[0]); } - float_t * get_in_ptr(); - - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); - void copy_data_to_device(); // copy labels and input features - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } - void norm_factor_counting(); - void norm_factor_counting_gpu(); - - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - std::vector labels; // labels for classification: N x 1 - label_t *d_labels; // labels on device - vec_t h_feats; // input features: N x D - float_t *d_feats; // input features on device - float_t *norm_factor; // normalization constant based on graph structure - float_t *d_norm_factor; // norm_factor on device - + Context(); + ~Context(); + enum Brew { CPU, GPU }; + // static Context& Get(); +#ifndef CPU_ONLY + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } + // static void create_blas_handle(); +#endif + Brew mode() { return mode_; } + void set_mode(Brew mode) { mode_ = mode; } + int solver_count() { return solver_count_; } + void set_solver_count(int val) { solver_count_ = val; } + int solver_rank() { return solver_rank_; } + void set_solver_rank(int val) { solver_rank_ = val; } + bool multiprocess() { return multiprocess_; } + void set_multiprocess(bool val) { multiprocess_ = val; } + bool root_solver() { return solver_rank_ == 0; } + size_t read_graph(std::string dataset_str); + size_t read_labels(std::string dataset_str); + size_t read_features(std::string dataset_str); + label_t get_label(size_t i) { return labels[i]; } + label_t* get_labels_ptr(size_t i) { return &(labels[0]); } + float_t* get_in_ptr(); + void degree_counting(); + void norm_factor_counting(); + std::vector labels; // labels for classification: N x 1 + float_t* norm_factor; // normalization constant based on graph structure + std::vector degrees; + vec_t h_feats; // input features: N x D + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D #ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph &lg, Graph &g); + Graph graph_cpu; // the input graph, |V| = N + void genGraph(LGraph& lg, Graph& g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); #else - CSRGraph graph_gpu; // the input graph, |V| = N - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; 
} + CSRGraph graph_gpu; // the input graph, |V| = N + label_t* d_labels; // labels on device + float_t* d_norm_factor; // norm_factor on device + float_t* d_feats; // input features on device + size_t read_graph_gpu(std::string dataset_str); + void copy_data_to_device(); // copy labels and input features + void SetDevice(const int device_id); + void DeviceQuery() {} + bool CheckDevice(const int device_id) { return true; } + int FindDevice(const int start_id = 0) { return 0; } #endif protected: #ifndef CPU_ONLY - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU #endif - Brew mode_; - int solver_count_; - int solver_rank_; - bool multiprocess_; + Brew mode_; + // shared_ptr random_generator_; + // Parallel training + int solver_count_; + int solver_rank_; + bool multiprocess_; private: - // The private constructor to avoid duplicate instantiation. - //Context(); + // The private constructor to avoid duplicate instantiation. + // Context(); }; - diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 3710b50ec9..830a4bbd08 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -10,19 +10,7 @@ const int CUDA_NUM_THREADS = 256; // CUDA: number of blocks for threads. inline int CUDA_GET_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -inline unsigned CudaTest(const char *msg) { - cudaError_t e; - //cudaThreadSynchronize(); - cudaDeviceSynchronize(); - if (cudaSuccess != (e = cudaGetLastError())) { - fprintf(stderr, "%s: %d\n", msg, e); - fprintf(stderr, "%s\n", cudaGetErrorString(e)); - exit(-1); - } - return 0; + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } inline const char* cublasGetErrorString(cublasStatus_t error) { @@ -88,41 +76,42 @@ inline const char* curandGetErrorString(curandStatus_t error) { } // CUDA: various checks for different function calls. 
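// Usage sketch for the check wrappers below (illustrative calls): every CUDA /
// cuRAND call is wrapped so that a non-success status aborts with the file and
// line of the failing call, e.g.
//   float_t* d_buf;
//   CUDA_CHECK(cudaMalloc((void**)&d_buf, n * sizeof(float_t)));
//   CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), d_buf, n));
//   CUDA_CHECK(cudaFree(d_buf));
// CUBLAS_CHECK wraps cublasStatus_t-returning cuBLAS calls in the same way.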
-#define CUDA_CHECK(condition) \ - do { \ - cudaError_t error = condition; \ - if (error != cudaSuccess) { \ - fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ - error, __FILE__, __LINE__, cudaGetErrorString(error) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + fprintf(stderr, "error %d: Cuda error in file '%s' in line %i : %s.\n", \ + error, __FILE__, __LINE__, cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) -#define CUBLAS_CHECK(condition) \ - do { \ - cublasStatus_t status = condition; \ - if (status != CUBLAS_STATUS_SUCCESS) { \ - fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ - status, __FILE__, __LINE__, cublasGetErrorString(status) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cublasGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) -#define CURAND_CHECK(condition) \ - do { \ - curandStatus_t status = condition; \ - if (status != CURAND_STATUS_SUCCESS) { \ - fprintf(stderr, "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ - status, __FILE__, __LINE__, curandGetErrorString(status) ); \ - exit(EXIT_FAILURE); \ - } \ +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuBLAS error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, curandGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ } while (0) // CUDA: grid stride looping -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; \ - i < (n); i += blockDim.x * gridDim.x) +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) // CUDA: check for error after kernel execution and exit loudly if there is one. #define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) - diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/gtypes.h index a30468b0f9..e11c1058cc 100644 --- a/libdeepgalois/include/gtypes.h +++ b/libdeepgalois/include/gtypes.h @@ -6,10 +6,11 @@ typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; #ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< + true>::type ::with_no_lockable::type Graph; #else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< + true>::type ::with_no_lockable::type Graph; #endif typedef Graph::GraphNode GNode; - diff --git a/libdeepgalois/include/layers/arithmetic_layer.h b/libdeepgalois/include/layers/arithmetic_layer.h index aed91e0379..63dc66f780 100644 --- a/libdeepgalois/include/layers/arithmetic_layer.h +++ b/libdeepgalois/include/layers/arithmetic_layer.h @@ -4,19 +4,23 @@ // element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` class elementwise_add_layer : public layer { public: - elementwise_add_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : layer(level, in_dim, out_dim) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("elementwise_add"); } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < in_data.size(); ++sample) { - for (size_t j = 0; j < in_data[0].size(); j++) - out_data[sample][j] = in_data[sample][j]; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - in_grad = out_grad; - } + elementwise_add_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : layer(level, in_dim, out_dim) { + trainable_ = false; + } + std::string layer_type() const override { + return std::string("elementwise_add"); + } + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < in_data.size(); ++sample) { + for (size_t j = 0; j < in_data[0].size(); j++) + out_data[sample][j] = in_data[sample][j]; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + in_grad = out_grad; + } }; diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index f0f27687e7..7dfc8c2154 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -3,72 +3,79 @@ #include "aggregator.h" /* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before other operations. - norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. - bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. - activation: callable activation function/layer or None, optional - If not None, applies an activation function to the updated node features. Default: ``None``. + Parameters + ---------- + x: int, number of samples. + y: int, Input feature size. + z: int, Output feature size. + dropout: bool, optional, if True, a dropout operation is applied before + other operations. norm : bool, optional, if True, the normalizer + :math:`c_{ij}` is applied. Default: ``True``. bias : bool, optional, if True, + adds a learnable bias to the output. Default: ``False``. activation: callable + activation function/layer or None, optional If not None, applies an + activation function to the updated node features. Default: ``None``. 
*/ -class graph_conv_layer: public layer { +class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, std::vector out_dims); - graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : - graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} - ~graph_conv_layer() {} - void init(); - std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(net_phase ctx) override { phase_ = ctx; } - //virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - //virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - // user-defined aggregate function + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float dropout_rate, std::vector in_dims, + std::vector out_dims); + graph_conv_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} + ~graph_conv_layer() {} + void init(); + std::string layer_type() const override { return std::string("graph_conv"); } + void set_netphase(net_phase ctx) override { phase_ = ctx; } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); + // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, + // vec_t &out_grad, vec_t &in_grad); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + // user-defined aggregate function #ifdef CPU_ONLY - virtual void aggregate(size_t len, Graph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out); + virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, + float_t* out); #endif - // user-defined combine function - virtual void combine(const vec_t &self, const vec_t &neighbors, vec_t &out); + // user-defined combine function + virtual void combine(const vec_t& self, const vec_t& neighbors, vec_t& out); private: - bool act_; // whether to use activation function at the end - bool norm_; // whether to normalize data - bool bias_; // whether to add bias afterwards - bool dropout_; // whether to use dropout at first - const float dropout_rate_; - float scale_; - net_phase phase_; - size_t x; - size_t y; - size_t z; - float_t *out_temp; - float_t *in_temp; - float_t *trans_data; // y*x - unsigned * dropout_mask; // x*y + bool act_; // whether to use activation function at the end + bool norm_; // whether to normalize data + bool bias_; // whether to add bias afterwards + bool dropout_; // whether to use dropout at first + const float dropout_rate_; + float scale_; + net_phase phase_; + size_t x; + size_t y; + size_t z; + float_t* out_temp; + float_t* in_temp; + float_t* trans_data; // y*x + unsigned* dropout_mask; // x*y - // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - auto init_range = sqrt(6.0/(dim_x + dim_y)); - std::default_random_engine rng; - 
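    // Xavier/Glorot uniform initialization (the AISTATS 2010 scheme cited
    // above): with fan_in = dim_x and fan_out = dim_y, each weight is drawn
    // from U(-r, r) where r = sqrt(6 / (fan_in + fan_out)), which keeps the
    // variance of activations and gradients roughly constant across layers.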
std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = 0; - } - } + // Glorot & Bengio (AISTATS 2010) + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng; + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } + } + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } + } }; diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 11f82b1486..c0e694d21c 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -33,116 +33,139 @@ class layer : public node { public: - layer(unsigned level, std::vector in_dims, std::vector out_dims) : - node(in_dims.size(), out_dims.size()), - level_(level), begin_(0), end_(0), num_dims(in_dims.size()), - input_dims(in_dims), output_dims(out_dims) { add_edge(); } - virtual ~layer() = default; - virtual std::string layer_type() const = 0; - virtual void set_netphase(net_phase phase) {} - virtual void set_context(Context *ctx) { context = ctx; } - virtual acc_t get_masked_loss() { return acc_t(0); } - virtual void forward_propagation(const float_t *in_data, float_t *out_data) = 0; - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) = 0; + layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), + end_(0), num_dims(in_dims.size()), input_dims(in_dims), + output_dims(out_dims) { + add_edge(); + } + virtual ~layer() = default; + virtual std::string layer_type() const = 0; + virtual void set_netphase(net_phase phase) {} + virtual void set_context(Context* ctx) { context = ctx; } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = + // 0; virtual void back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; -#ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); 
-#endif - } - void set_in_data(float_t *data) { - assert(data.size() == input_dims[0]*input_dims[1]); - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - //prev_->get_data() = data; - //std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - //prev_->get_gradient().resize(input_dims[0]*input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - } - void alloc_grad() { - // allocate memory for intermediate gradients - } - void forward() { - std::cout << name_ << ": forwarding ... "; - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - std::cout << name_ << ": backwarding ... "; - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - std::cout << name_ << ": weight updating ... "; - //vec_t diff; - //prev()->merge_grads(&diff); -#ifdef CPU_ONLY - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad -#else - opt->update_gpu(d_weight_grad, d_W); // W += grad -#endif - //prev()->clear_grads(); - next()->clear_grads(); - } + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; + } + void set_in_data(float_t* data) { + assert(data.size() == input_dims[0] * input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // + // allocate memory for intermediate features + // prev_->get_data() = data; + // std::copy(data.begin(), data.end(), prev_->get_data()); + // allocate memory for intermediate gradients + // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + // next_->get_data().resize(output_dims[0]*output_dims[1]); + } + void alloc_grad() { + // allocate memory for intermediate gradients + // next_->get_gradient().resize(output_dims[0]*output_dims[1]); + } + void forward() { + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer* opt) { + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ bool parallel = (W.size() >= 512); + // vec_t diff; + // prev()->merge_grads(&diff); + // auto in_data = prev()->get_data(); + // float_t rcp_batch_size = float_t(1.0) / in_data.size(); + // for (size_t i = 0; i < diff.size(); ++i) + // diff[i] *= rcp_batch_size; + opt->update(weight_grad, W, parallel); // W += grad + // prev()->clear_grads(); + next()->clear_grads(); + } + inline acc_t get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; + } protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - size_t num_dims; // number of dimensions - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - float_t *d_W; - float_t *d_weight_grad; - mask_t *masks_; // masks to show which samples are valid - mask_t *d_masks_; - float_t *loss; // error for each vertex: N x 1 - Context *context; + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x + // 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; + float_t* d_weight_grad; + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; + float_t* loss; // error for each vertex: N x 1 + Context* context; }; // head: layer i+1, tail: layer i -inline void connect(layer *head, layer *tail, - size_t head_index = 0, size_t tail_index = 0) { - //auto out_shape = head->out_shape()[head_index]; - //auto in_shape = tail->in_shape()[tail_index]; - //head->setup(false); - //if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - //if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - //if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); - tail->prev_ = head->next_; - tail->prev_->add_next_node(tail); +inline void connect(layer* head, layer* tail, size_t head_index = 0, + size_t tail_index = 0) { + // auto out_shape = head->out_shape()[head_index]; + // auto in_shape = tail->in_shape()[tail_index]; + // head->setup(false); + // if (in_shape.size() == 0) { + // tail->set_in_shape(out_shape); + // in_shape = out_shape; + //} + // if (out_shape.size() != in_shape.size()) + // connection_mismatch(*head, *tail); + // if (!head->next_[head_index]) + // throw nn_error("output edge must not be null"); + tail->prev_ = head->next_; + tail->prev_->add_next_node(tail); } - diff --git a/libdeepgalois/include/layers/linear_layer.h b/libdeepgalois/include/layers/linear_layer.h index e4ff524f3f..55d5d245d8 100644 --- a/libdeepgalois/include/layers/linear_layer.h +++ b/libdeepgalois/include/layers/linear_layer.h @@ -3,26 +3,30 @@ class linear_layer : public layer { public: - linear_layer(unsigned level, float_t scale, float_t bias, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { - trainable_ = false; } - linear_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } - std::string layer_type() const override { return "linear"; } + linear_layer(unsigned level, float_t scale, float_t bias, + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { + trainable_ = false; + } + linear_layer(unsigned level, std::vector in_dim, + std::vector out_dim) + : linear_layer(level, 1.0, 0.0, in_dim, out_dim) {} + std::string layer_type() const override { return "linear"; } + + void forward_propagation(const tensor_t& in_data, + tensor_t& out_data) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) { + for (size_t i = 0; i < input_dims[1]; i++) + out_data[sample][i] = scale_ * in_data[sample][i] + bias_; + } + } + void back_propagation(const tensor_t& in_data, const tensor_t& out_data, + tensor_t& out_grad, tensor_t& in_grad) override { + for (size_t sample = 0; sample < input_dims[0]; ++sample) + for (size_t i = 0; i < input_dims[1]; i++) + in_grad[sample][i] = out_grad[sample][i] * scale_; + } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) { - for (size_t i = 0; i < input_dims[1]; i ++) - out_data[sample][i] = scale_ * in_data[sample][i] + bias_; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) - for 
(size_t i = 0; i < input_dims[1]; i++) - in_grad[sample][i] = out_grad[sample][i] * scale_; - } protected: - float_t scale_, bias_; + float_t scale_, bias_; }; diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/layers/relu_layer.h index 285e09b472..8a7b447038 100644 --- a/libdeepgalois/include/layers/relu_layer.h +++ b/libdeepgalois/include/layers/relu_layer.h @@ -4,14 +4,18 @@ // ReLU Layer class relu_layer : public layer { public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - ~relu_layer() {} - std::string layer_type() const override { return std::string("relu"); } - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data); - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); + relu_layer(unsigned level, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + } + ~relu_layer() {} + std::string layer_type() const override { return std::string("relu"); } + virtual void forward_propagation(const tensor_t& in_data, tensor_t& out_data); + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const tensor_t& in_data, + const tensor_t& out_data, tensor_t& out_grad, + tensor_t& in_grad); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); }; diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 78166b2fb5..0a680a3209 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -1,13 +1,15 @@ #pragma once #include "layer.h" -class softmax_loss_layer: public layer { +class softmax_loss_layer : public layer { public: - softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - virtual void forward_propagation(const float_t *in_data, float_t *out_data); - virtual void back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad); - virtual acc_t get_masked_loss(); + softmax_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~softmax_loss_layer() {} + std::string layer_type() const override { + return std::string("softmax_loss"); + } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); }; - diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index 78f6f76aec..65cd004c82 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -1,7 +1,7 @@ #ifndef __LGRAPH_HPP__ #define __LGRAPH_HPP__ -//defines the Learning Graph (LGraph) data structure +// defines the Learning Graph (LGraph) data structure #include #include #include @@ -12,168 +12,173 @@ typedef unsigned IndexT; typedef float ValueT; struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, 
ValueT el) : - src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } + IndexT src; + IndexT dst; + ValueT elabel; + Edge() : src(0), dst(0), elabel(0) {} + Edge(IndexT from, IndexT to, ValueT el) : src(from), dst(to), elabel(el) {} + std::string to_string() const { + std::stringstream ss; + ss << "e(" << src << "," << dst << "," << elabel << ")"; + return ss.str(); + } }; typedef std::vector EdgeList; class LGraph { public: - LGraph() : symmetrize_(false), directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - //labels_.clear(); - //vertices.clear(); - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT * out_rowptr() const { return rowptr_; } - IndexT * out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - //ValueT * labels() { return labels_.data(); } - //ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char *filename, bool symmetrize = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - IndexT max_vid = 0; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) el.push_back(Edge(v, u, 1)); - if (u > max_vid) max_vid = u; - if (v > max_vid) max_vid = v; - } - in.close(); - directed_ = true; - num_vertices_ = max_vid+1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; - MakeGraphFromEL(); - } + LGraph() : symmetrize_(false), directed_(false) {} + void clean() { + delete[] rowptr_; + delete[] colidx_; + delete[] weight_; + degrees.clear(); + el.clear(); + // labels_.clear(); + // vertices.clear(); + } + bool directed() const { return directed_; } + size_t num_vertices() const { return num_vertices_; } + size_t num_edges() const { return num_edges_; } + IndexT* out_rowptr() const { return rowptr_; } + IndexT* out_colidx() const { return colidx_; } + unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } + IndexT get_offset(IndexT n) { return rowptr_[n]; } + IndexT get_dest(IndexT n) { return colidx_[n]; } + ValueT get_weight(IndexT n) { return weight_[n]; } + unsigned get_max_degree() { return max_degree; } + // ValueT * labels() { return labels_.data(); } + // ValueT get_label(IndexT n) { return labels_[n]; } + void read_edgelist(const char* filename, bool symmetrize = false) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + IndexT max_vid = 0; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + IndexT u, v; + edge_stream >> u; + edge_stream >> v; + el.push_back(Edge(u, v, 1)); + if (symmetrize) + el.push_back(Edge(v, u, 1)); + if (u > max_vid) + max_vid = u; + if (v > max_vid) + max_vid = v; + } + in.close(); + directed_ = true; + num_vertices_ = max_vid + 1; + num_edges_ = el.size(); + std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " + << num_edges_ << "\n"; + MakeGraphFromEL(); + } 
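  // Worked example of the CSR build below: read_edgelist(f, true) on a file
  // containing the two lines "0 1" and "1 2" stores
  //   el = {(0,1), (1,0), (1,2), (2,1)}   (num_vertices_ = 3, num_edges_ = 4);
  // SquishGraph() finds no self loops or duplicate edges, and MakeCSR() then
  // yields rowptr_ = [0, 1, 3, 4], colidx_ = [1, 0, 2, 1], max_degree = 2.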
private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT *rowptr_; - IndexT *colidx_; - ValueT *weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector > vertices; + EdgeList el; + bool symmetrize_; // whether to symmetrize a directed graph + bool directed_; + size_t num_vertices_; + size_t num_edges_; + IndexT* rowptr_; + IndexT* colidx_; + ValueT* weight_; + unsigned max_degree; + std::vector degrees; + std::vector labels_; + std::vector> vertices; - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } + static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } + void MakeGraphFromEL() { + SquishGraph(); + MakeCSR(false); + } - void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i ++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i ++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - //remove self loops - int num_selfloops = 0; - if(remove_selfloops) { - printf("Removing self loops..."); - for(size_t i = 0; i < num_vertices_; i ++) { - for(unsigned j = 0; j < vertices[i].size(); j ++) { - if(i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_selfloops ++; - j --; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int num_redundents = 0; - if(remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i ++) { - for (unsigned j = 1; j < vertices[i].size(); j ++) { - if (vertices[i][j].dst == vertices[i][j-1].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_redundents ++; - j --; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } + void SquishGraph(bool remove_selfloops = true, + bool remove_redundents = true) { + std::vector neighbors; + for (size_t i = 0; i < num_vertices_; i++) + vertices.push_back(neighbors); + for (size_t i = 0; i < num_edges_; i++) + vertices[el[i].src].push_back(el[i]); + el.clear(); + printf("Sorting the neighbor lists..."); + for (size_t i = 0; i < num_vertices_; i++) + std::sort(vertices[i].begin(), vertices[i].end(), compare_id); + printf(" Done\n"); + // remove self loops + int num_selfloops = 0; + if (remove_selfloops) { + printf("Removing self loops..."); + for (size_t i = 0; i < num_vertices_; i++) { + for (unsigned j = 0; j < vertices[i].size(); j++) { + if (i == vertices[i][j].dst) { + vertices[i].erase(vertices[i].begin() + j); + num_selfloops++; + j--; + } + } + } + printf(" %d selfloops are removed\n", num_selfloops); + num_edges_ -= num_selfloops; + } + // remove redundent + int num_redundents = 0; + if (remove_redundents) { + printf("Removing redundent edges..."); + for (size_t i = 0; i < num_vertices_; i++) { + for (unsigned j = 1; j < vertices[i].size(); j++) { + if (vertices[i][j].dst == vertices[i][j - 1].dst) { + vertices[i].erase(vertices[i].begin() + j); + num_redundents++; + j--; + } + } + } + printf(" %d redundent edges are removed\n", num_redundents); + num_edges_ -= num_redundents; + } + } - void MakeCSR(bool 
transpose) { - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i ++) - degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); + void MakeCSR(bool transpose) { + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i++) + degrees[i] = vertices[i].size(); + max_degree = *(std::max_element(degrees.begin(), degrees.end())); - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; + std::vector offsets(degrees.size() + 1); + IndexT total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; - assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_+1]; - for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i ++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } - } - } + assert(num_edges_ == offsets[num_vertices_]); + weight_ = new ValueT[num_edges_]; + colidx_ = new IndexT[num_edges_]; + rowptr_ = new IndexT[num_vertices_ + 1]; + for (size_t i = 0; i < num_vertices_ + 1; i++) + rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i++) { + for (auto it = vertices[i].begin(); it < vertices[i].end(); it++) { + Edge e = *it; + assert(i == e.src); + if (symmetrize_ || (!symmetrize_ && !transpose)) { + weight_[offsets[e.src]] = e.elabel; + colidx_[offsets[e.src]++] = e.dst; + } + if (symmetrize_ || (!symmetrize_ && transpose)) { + weight_[offsets[e.dst]] = e.elabel; + colidx_[offsets[e.dst]++] = e.src; + } + } + } + } }; #endif diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 2e435d60e2..6f4348ff34 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -13,70 +13,94 @@ extern "C" { const float negative_slope = 0; -void vadd(const vec_t &a, const vec_t &b, vec_t &out); // vector add -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out); -void vsub(const vec_t &a, const vec_t &b, vec_t &out); -void vmul(const vec_t &a, const vec_t &b, vec_t &out); -void vdiv(const vec_t &a, const vec_t &b, vec_t &out); -void add_scalar(const float_t alpha, vec_t &Y); -void sub_scalar(const float_t alpha, vec_t &Y); -void mul_scalar(const float_t alpha, vec_t &Y); -void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out); -void div_scalar(const float_t alpha, vec_t &Y); -float_t dot(const vec_t &x, const vec_t &y); -void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector); -void vvmul(const vec_t &a, const vec_t &b, tensor_t &out); -void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C); -void copy2D1D(const tensor_t &in, vec_t &out); -void copy1D1D(const vec_t &in, vec_t &out); -void copy1D1D(size_t len, const float_t *in, float_t *out); -void 
matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C); -void transpose2D(const tensor_t &in, tensor_t &out); -void transpose2D1D(const tensor_t &in, vec_t &out); -void transpose(size_t x, size_t y, const vec_t &in, vec_t &out); -void transpose(size_t x, size_t y, const float_t *in, float_t *out); -int argmax(const size_t n, const vec_t &x); // the arguments of the maxima -int argmax(const size_t n, const float_t *x); // the arguments of the maxima -void clear(vec_t &in); -void clear(size_t n, float_t *in); -void relu(const vec_t &in, vec_t &out); // ReLU -void relu(size_t n, const float_t *in, float_t *out); // ReLU -void d_relu(const vec_t &in_diff, const vec_t &data, vec_t &out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out); -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); -void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff); -void softmax(const vec_t &input, vec_t &output); -void softmax(size_t n, const float_t *input, float_t *output); -void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp); -void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp); -float_t cross_entropy(const vec_t &y, const vec_t &p); -float_t cross_entropy(size_t n, const float_t *y, const float_t *p); -void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d); -void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d); +void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); +void vsub(const vec_t& a, const vec_t& b, vec_t& out); +void vmul(const vec_t& a, const vec_t& b, vec_t& out); +void vdiv(const vec_t& a, const vec_t& b, vec_t& out); +void add_scalar(const float_t alpha, vec_t& Y); +void sub_scalar(const float_t alpha, vec_t& Y); +void mul_scalar(const float_t alpha, vec_t& Y); +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +void div_scalar(const float_t alpha, vec_t& Y); +float_t dot(const vec_t& x, const vec_t& y); +void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); +void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, + tensor_t& C); +void copy2D1D(const tensor_t& in, vec_t& out); +void copy1D1D(const vec_t& in, vec_t& out); +void copy1D1D(size_t len, const float_t* in, float_t* out); +void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, + vec_t& C); +void transpose2D(const tensor_t& in, tensor_t& out); +void transpose2D1D(const tensor_t& in, vec_t& out); +void transpose(size_t x, size_t y, const vec_t& in, 
vec_t& out); +void transpose(size_t x, size_t y, const float_t* in, float_t* out); +int argmax(const size_t n, const vec_t& x); // the arguments of the maxima +int argmax(const size_t n, const float_t* x); // the arguments of the maxima +void clear(vec_t& in); +void clear(size_t n, float_t* in); +void relu(const vec_t& in, vec_t& out); // ReLU +void relu(size_t n, const float_t* in, float_t* out); // ReLU +void d_relu(const vec_t& in_diff, const vec_t& data, + vec_t& out_diff); // ReLU derivative +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out); +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, + vec_t& out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff); +void softmax(const vec_t& input, vec_t& output); +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); +float_t cross_entropy(const vec_t& y, const vec_t& p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t *in, float_t *out); -void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -int argmax_gpu(const size_t n, const float_t *x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); -void scal_gpu(const int N, const float alpha, float *X); +void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad); +void copy_gpu(size_t len, const float_t* in, float_t* out); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned* masks, + float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void 
relu_gpu(const int n, const float_t* in, float_t* out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); // dropout +void d_dropout_gpu(const float scale, const float_t* in_diff, + const unsigned* mask, + float_t* out_diff); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +int argmax_gpu(const size_t n, const float_t* x); // the arguments of the maxima +void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); - -void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out); -void loss_malloc_device(int n, float_t *loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index c2bf8e997e..87a0e3b72b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -11,110 +11,124 @@ #define NUM_CONV_LAYERS 2 -// N: number of vertices, D: feature vector dimentions, +// N: number of vertices, D: feature vector dimentions, // E: number of distinct labels, i.e. 
number of vertex classes // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() {} - void init(std::string dataset_str, unsigned epochs, unsigned hidden1); - size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } - size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return num_samples; } - void train(optimizer *opt, bool need_validate); // training - void construct_layers(); - void set_contexts() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(context); - } - void set_netphases(net_phase phase) { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_netphase(phase); - } - void print_layers_info() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->print_layer_info(); - } + Net() {} + void init(std::string dataset_str, unsigned epochs, unsigned hidden1); + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } + size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } + size_t get_nnodes() { return num_samples; } + void train(optimizer* opt, bool need_validate); // training + void construct_layers(); + void set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(context); + } + void set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); + } + void print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); + } - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float dropout_rate = 0.5) { - assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); - } + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true, + float dropout_rate = 0.5) { + assert(dropout_rate < 1.0); + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + if (layer_id > 0) + connect(layers[layer_id - 1], layers[layer_id]); + } - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - connect(layers[layer_id-1], layers[layer_id]); - } + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + connect(layers[layer_id - 1], layers[layer_id]); + } - // forward propagation: [begin, end) is the range of samples used. 
- acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, &masks[0]); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) + layers[i]->forward(); + return layers[num_layers - 1]->get_masked_loss(); + } - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i --) - layers[i-1]->backward(); - } + // back propogation + void bprop() { + for (size_t i = num_layers; i != 0; i--) + layers[i - 1]->backward(); + } - // update trainable weights after back-propagation - void update_weights(optimizer *opt) { - for (size_t i = 0; i < num_layers; i ++) - if (layers[i]->trainable()) layers[i]->update_weight(opt); - } + // update trainable weights after back-propagation + void update_weights(optimizer* opt) { + for (size_t i = 0; i < num_layers; i++) + if (layers[i]->trainable()) + layers[i]->update_weight(opt); + } - // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, mask_t *masks, acc_t &loss, acc_t &acc) { - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); - t_eval.Stop(); - return t_eval.Millisecs(); - } + // evaluate, i.e. 
inference or predict + double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, + acc_t& loss, acc_t& acc) { + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks); + t_eval.Stop(); + return t_eval.Millisecs(); + } protected: - Context *context; - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - unsigned num_epochs; // number of epochs - std::vector feature_dims; // feature dimnesions for each layer - std::vector train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network + Context* context; + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs + std::vector feature_dims; // feature dimnesions for each layer + std::vector train_mask, val_mask; // masks for traning and validation + size_t train_begin, train_end, train_count, val_begin, val_end, val_count; + std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); - if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + // comparing outputs with the ground truth (labels) + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; + } }; #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 5a3cf3f83f..918b91b86c 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -9,52 +9,55 @@ class edge; typedef std::shared_ptr edgeptr_t; -// node data structure: each layer is a node, two layers are connected by an edge +// node data structure: each layer is a node, two layers are connected by an +// edge class node : public std::enable_shared_from_this { public: - node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} - virtual ~node() {} - const edgeptr_t prev() const { return prev_; } - const edgeptr_t next() const { return next_; } + node(size_t in_size, size_t out_size) { + } //: prev_(in_size), next_(out_size) {} + virtual ~node() {} + const edgeptr_t prev() const { return prev_; } + const edgeptr_t next() const { return next_; } protected: - node() = delete; - friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - mutable edgeptr_t prev_; - mutable 
edgeptr_t next_; + node() = delete; + friend void connect(layer* head, layer* tail, size_t head_index, + size_t tail_index); + mutable edgeptr_t prev_; + mutable edgeptr_t next_; }; // edges manage the input/output data and gradients between nodes class edge { public: - edge(node *prev, size_t n, size_t len) : - num_samples_(n), ft_dim_(len), - data_(NULL), grad_(NULL), prev_(prev) {} - - void alloc(); - void alloc_gpu(); - void merge_grads(vec_t *dst); - void merge_grads_gpu(float_t *dst); - void clear_grads(); - void clear_grads_gpu(); - - void set_data(float_t *ptr) { data_ = ptr; } - float_t *get_data() { return data_; } - const float_t *get_data() const { return data_; } - float_t *get_gradient() { return grad_; } - const float_t *get_gradient() const { return grad_; } - - const node *next() const { return next_; } - node *prev() { return prev_; } - const node *prev() const { return prev_; } - void add_next_node(node *next) { next_ = next; } + edge(node* prev, size_t n, size_t len) + : num_samples_(n), ft_dim_(len), + // data_(vec_t(n*len)), grad_(vec_t(n*len)), + data_(NULL), grad_(NULL), prev_(prev) {} + + void alloc(); + void alloc_gpu(); + void merge_grads(vec_t* dst); + void merge_grads_gpu(float_t* dst); + void clear_grads(); + void clear_grads_gpu(); + + void set_data(float_t* ptr) { data_ = ptr; } + float_t* get_data() { return data_; } + const float_t* get_data() const { return data_; } + float_t* get_gradient() { return grad_; } + const float_t* get_gradient() const { return grad_; } + + const node* next() const { return next_; } + node* prev() { return prev_; } + const node* prev() const { return prev_; } + void add_next_node(node* next) { next_ = next; } private: - size_t num_samples_;// number of samples - size_t ft_dim_; // feature dimensions - float_t *data_; // feature vectors - float_t *grad_; // gradients - node *prev_; // previous node, "producer" of data - node *next_; // next node, "consumer" of data + size_t num_samples_; // number of samples + size_t ft_dim_; // feature dimensions + float_t* data_; // feature vectors + float_t* grad_; // gradients + node* prev_; // previous node, "producer" of data + node* next_; // next node, "consumer" of data }; - diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 072eb7d2bc..f1822adc7d 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -5,31 +5,36 @@ #include "types.h" // base class of optimizer -// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) +// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss +// function) struct optimizer { - optimizer() = default; - optimizer(const optimizer &) = default; - optimizer(optimizer &&) = default; - optimizer &operator=(const optimizer &) = default; - optimizer &operator=(optimizer &&) = default; - virtual ~optimizer() = default; - virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; - virtual void update_gpu(const float_t *dW, float_t *W) = 0; - virtual void reset() {} // override to implement pre-learning action + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; + virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void reset() {} // override to implement pre-learning action }; // helper class to hold N values for each 
weight template struct stateful_optimizer : public optimizer { - void reset() override { for (auto &e : E_) e.clear(); } + void reset() override { + for (auto& e : E_) + e.clear(); + } + protected: - template - vec_t &get(const vec_t &key) { - static_assert(Index < N, "index out of range"); - if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); - return E_[Index][&key]; - } - std::unordered_map E_[N]; + template + vec_t& get(const vec_t& key) { + static_assert(Index < N, "index out of range"); + if (E_[Index][&key].empty()) + E_[Index][&key].resize(key.size(), float_t()); + return E_[Index][&key]; + } + std::unordered_map E_[N]; }; /** @@ -40,12 +45,26 @@ struct stateful_optimizer : public optimizer { * The Journal of Machine Learning Research, pages 2121-2159, 2011. **/ struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - private: - float_t eps; + adagrad() : alpha(0.01), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } + } + float_t alpha; // learning rate +private: + float_t eps; }; /** @@ -55,35 +74,54 @@ struct adagrad : public stateful_optimizer<1> { * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) **/ struct RMSprop : public stateful_optimizer<1> { - RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t mu; // decay term + RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); + } + float_t alpha; // learning rate + float_t mu; // decay term private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; // Adam: A Method for Stochastic Optimization // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { - adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), - b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); -#ifdef CPU_ONLY - void update_gpu(const float_t *dW, float_t *W) {} -#else - void update_gpu(const float_t *dW, float_t *W); -#endif - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - float_t b2_t; // decay term power t + adam() + : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), + b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + 
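    // Per-element Adam step (see http://arxiv.org/abs/1412.6980, cited above):
    //   mt <- b1*mt + (1-b1)*dW            first-moment estimate
    //   vt <- b2*vt + (1-b2)*dW*dW         second-moment estimate
    //   W  <- W - alpha * (mt/(1-b1_t)) / sqrt(vt/(1-b2_t) + eps)
    // b1_t and b2_t hold b1^t and b2^t; they are decayed after the loop, so
    // the bias correction fades as training proceeds.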
galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t + float_t b2_t; // decay term power t private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; /** @@ -93,29 +131,48 @@ struct adam : public stateful_optimizer<2> { * */ struct adamax : public stateful_optimizer<2> { - adamax() : alpha(float_t(0.002)), - b1(float_t(0.9)), b2(float_t(0.999)), - b1_t(b1), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t + adamax() + : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), + eps(float_t(1e-8)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; + } + + float_t alpha; // learning rate + float_t b1; // decay term + float_t b2; // decay term + float_t b1_t; // decay term power t private: - float_t eps; // constant value to avoid zero-division + float_t eps; // constant value to avoid zero-division }; -// SGD without momentum -// slightly faster than tiny_dnn::momentum +/** + * SGD without momentum + * + * slightly faster than tiny_dnn::momentum + **/ struct gradient_descent : public optimizer { - gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t lambda; // weight decay + gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); + } + float_t alpha; // learning rate + float_t lambda; // weight decay }; /** @@ -126,14 +183,27 @@ struct gradient_descent : public optimizer { * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. 
**/ struct momentum : public stateful_optimizer<1> { - public: +public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + + // for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = + mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + //}); + }, + galois::loopname("momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum }; /** @@ -144,14 +214,26 @@ struct momentum : public stateful_optimizer<1> { * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. **/ struct nesterov_momentum : public stateful_optimizer<1> { - public: +public: nesterov_momentum() - : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize); - void update_gpu(const float_t *dW, float_t *W) {} - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum + : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} + + void update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + + // for_i(parallelize, W.size(), [&](size_t i) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = + mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + //}); + }, + galois::loopname("nesterov_momentum_update")); + } + + float_t alpha; // learning rate + float_t lambda; // weight decay + float_t mu; // momentum }; - diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/random.h index 9236e9c391..8560a24de1 100644 --- a/libdeepgalois/include/random.h +++ b/libdeepgalois/include/random.h @@ -4,60 +4,65 @@ typedef boost::mt19937 rng_t; // random seeding int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, using fallback algorithm " + "to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; } // This random number generator facade hides boost and CUDA rng // implementation from one another (for cross-platform compatibility). 
class RNG { public: - RNG() : generator_(new Generator()) { } - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } - void* generator() { return static_cast(generator_->rng()); } + RNG() : generator_(new Generator()) {} + explicit RNG(unsigned int seed) : generator_(new Generator(seed)) {} + explicit RNG(const RNG&); + RNG& operator=(const RNG& other) { + generator_ = other.generator_; + return *this; + } + void* generator() { return static_cast(generator_->rng()); } + private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; + class Generator { + public: + Generator() : rng_(new rng_t(seedgen())) {} + explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} + rng_t* rng() { return rng_.get(); } + + private: + std::shared_ptr rng_; + }; + + std::shared_ptr generator_; }; std::shared_ptr random_generator_; inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; + random_generator_.reset(new RNG()); + return *random_generator_; } -inline rng_t* rng() { - return static_cast(rng_stream().generator()); -} +inline rng_t* rng() { return static_cast(rng_stream().generator()); } #include template -void rng_bernoulli(const DataTy p, std::vector &r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); +void rng_bernoulli(const DataTy p, std::vector& r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator> + variate_generator(rng(), random_distribution); + for (size_t i = 0; i < r.size(); ++i) + r[i] = static_cast(variate_generator()); } #endif diff --git a/libdeepgalois/include/timer.h b/libdeepgalois/include/timer.h index e6c838c37b..af01412463 100644 --- a/libdeepgalois/include/timer.h +++ b/libdeepgalois/include/timer.h @@ -4,18 +4,25 @@ class Timer { public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + private: - struct timeval start_time_; - struct timeval elapsed_time_; + struct timeval start_time_; + struct timeval elapsed_time_; }; -#endif // TIMER_H_ +#endif // TIMER_H_ diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h 
index 720c2ae2b8..5890ed307c 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -11,14 +11,14 @@ typedef float float_t; typedef float feature_t; // feature type #endif typedef std::vector vec_t; // feature vector (1D) -typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim +typedef std::vector + tensor_t; // feature vectors (2D): num_samples x feature_dim typedef std::vector FV; // feature vector -typedef std::vector FV2D; // feature vectors: num_samples x feature_dim -typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test +typedef std::vector FV2D; // feature vectors: num_samples x feature_dim +typedef float acc_t; // Accuracy type +typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test #define CHUNK_SIZE 256 -#define TB_SIZE 256 -#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 63d0f74ff7..1c330daa5b 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -8,113 +8,121 @@ #include #include -const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset enum class net_phase { train, test }; class ResourceManager { public: - ResourceManager() {} - ~ResourceManager(){} - //peak memory usage - std::string get_peak_memory() { - double kbm; - struct rusage CurUsage; - getrusage(RUSAGE_SELF, &CurUsage); - kbm = (double)CurUsage.ru_maxrss; - double mbm = kbm / 1024.0; - double gbm = mbm / 1024.0; - return - "Peak memory: " + - to_string_with_precision(mbm, 3) + " MB; " + - to_string_with_precision(gbm, 3) + " GB"; - } + ResourceManager() {} + ~ResourceManager() {} + // peak memory usage + std::string get_peak_memory() { + double kbm; + struct rusage CurUsage; + getrusage(RUSAGE_SELF, &CurUsage); + kbm = (double)CurUsage.ru_maxrss; + double mbm = kbm / 1024.0; + double gbm = mbm / 1024.0; + return "Peak memory: " + to_string_with_precision(mbm, 3) + " MB; " + + to_string_with_precision(gbm, 3) + " GB"; + } + private: - template - std::string to_string_with_precision(const T a_value, const int& n) { - std::ostringstream out; - out << std::fixed; - out << std::setprecision(n) << a_value; - return out.str(); - } + template + std::string to_string_with_precision(const T a_value, const int& n) { + std::ostringstream out; + out << std::fixed; + out << std::setprecision(n) << a_value; + return out.str(); + } }; class Timer { public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } + Timer() {} + void Start() { gettimeofday(&start_time_, NULL); } + void Stop() { + gettimeofday(&elapsed_time_, NULL); + elapsed_time_.tv_sec -= start_time_.tv_sec; + elapsed_time_.tv_usec -= start_time_.tv_usec; + } + double Seconds() 
const { + return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; + } + double Millisecs() const { + return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; + } + double Microsecs() const { + return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; + } + private: - struct timeval start_time_; - struct timeval elapsed_time_; + struct timeval start_time_; + struct timeval elapsed_time_; }; class random_generator { public: - static random_generator &get_instance() { - static random_generator instance; - return instance; - } - std::mt19937 &operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } private: - random_generator() : gen_(1) {} - std::mt19937 gen_; + random_generator() : gen_(1) {} + std::mt19937 gen_; }; template inline typename std::enable_if::value, T>::type uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); } template inline typename std::enable_if::value, T>::type uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); } inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) <= p; + return uniform_rand(float_t(0), float_t(1)) <= p; } -inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, std::vector &masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - //std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count ++; - } - } - i ++; - } - //std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; +inline size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks) { + if (dataset_str != "citeseer" && dataset_str != "cora") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + // std::cout << mask_type + "_mask range: [" << begin << ", " << end + // << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; } - 
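For reference, the mask files parsed by read_masks above store the begin/end vertex range of the split on their first line, followed by one 0/1 flag per vertex line. A minimal, hypothetical sketch of how a caller would drive read_masks for the training split follows; the dataset name and vertex count are illustrative assumptions, not values taken from this patch:

    #include <cstdint>
    #include <vector>
    #include "utils.h"   // the header introduced in the hunk above

    int main() {
      // mask_t is uint8_t (see types.h); one flag per vertex.
      size_t n = 3327;                        // assumed vertex count, e.g. citeseer
      size_t begin = 0, end = 0;
      std::vector<mask_t> train_masks(n, 0);
      size_t count = read_masks("citeseer", "train", begin, end, train_masks);
      // After the call: train_masks[v] == 1 for the `count` labeled vertices
      // inside [begin, end); all other entries remain 0.
      return count > 0 ? 0 : 1;
    }

The same pattern applies to the "val" and "test" splits; only "citeseer" and "cora" are accepted by the current implementation.
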
diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 45862b7516..6bb301b0be 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -3,21 +3,28 @@ #include "aggregator.h" #include "math_functions.hh" -void update_all(size_t len, Graph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { - clear(len, &out[src*len]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], &neighbor[0]); - vadd(len, &out[src*len], &neighbor[0], &out[src*len]); // out[src] += in[dst] - } else vadd(len, &out[src*len], &in[dst*len], &out[src*len]); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); +void update_all(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + galois::do_all(galois::iterate(g.begin(), g.end()), + [&](const auto& src) { + clear(len, &out[src * len]); + float_t a = 0.0, b = 0.0; + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (norm) { + b = a * norm_factor[dst]; + vec_t neighbor(len); + mul_scalar(len, b, &in[dst * len], &neighbor[0]); + vadd(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + vadd(len, &out[src * len], &in[dst * len], + &out[src * len]); // out[src] += in[dst] + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("update_all")); } - diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index a6b61ce914..ea41fd3dcb 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -6,30 +6,36 @@ #include "math_functions.hh" // TODO: use warp -__device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { - for (int i = 0; i < n; i++) y[i] = alpha * a[i] + b[i]; +__device__ void scale_add(const int n, const float_t alpha, const float_t* a, + const float_t* b, float_t* y) { + for (int i = 0; i < n; i++) + y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - CUDA_KERNEL_LOOP(src, n) { - float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; - index_type begin = g.edge_begin(src); - index_type end = g.edge_end(src); - for (index_type e = begin; e != end; e++) { - index_type dst = g.getEdgeDst(e); - assert(dst < n); - if (norm) b = a * norm_factor[dst]; - scale_add(len, b, in+dst*len, out+src*len, out+src*len); // out[src] += in[dst] - } - } +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + CUDA_KERNEL_LOOP(src, n) { + float_t a = 0.0, b = 1.0; + if (norm) + a = norm_factor[src]; + index_type begin = g.edge_begin(src); + index_type end = g.edge_end(src); + for (index_type e = begin; e != end; e++) { + index_type dst = g.getEdgeDst(e); + assert(dst < n); + if (norm) + b = a * norm_factor[dst]; + scale_add(len, b, in + dst * len, out + src * len, + out + src * len); // out[src] += in[dst] + } + } } -void update_all(size_t len, CSRGraph &g, 
const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - std::cout << "[debug]: update_all on GPU\n"; - unsigned n = g.nnodes; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); - CudaTest("solving update_all kernel failed"); +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>( + n, len, g, in, out, norm, norm_factor); } - diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a500c02125..04d7c14476 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,143 +1,237 @@ #include "context.h" #include "gtypes.h" +#include +#include + +// random seeding +int64_t cluster_seedgen(void) { + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; +} #ifdef CPU_ONLY -Context::Context() : mode_(Context::CPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { } +Context::Context() + : mode_(Context::CPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) {} Context::~Context() {} +#else +cublasHandle_t Context::cublas_handle_ = 0; +curandGenerator_t Context::curand_generator_ = 0; + +Context::Context() + : mode_(Context::GPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) { + // void Context::create_blas_handle() { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} + +Context::~Context() { + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } +} + +void Context::SetDevice(const int device_id) { + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) + return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +} #endif size_t Context::read_graph(std::string dataset_str) { #ifdef CPU_ONLY - n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr"); #else - n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str); #endif - return n; + return n; } #ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph_cpu); - lgraph.clean(); - } else if (filetype == "gr") { - 
std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph_cpu, filename); - } else { printf("Unkown file format\n"); exit(1); } - Tread.stop(); - std::cout << "num_vertices " << graph_cpu.size() << " num_edges " << graph_cpu.sizeEdges() << "\n"; - return graph_cpu.size(); + galois::StatTimer Tread("GraphReadingTime"); + Tread.start(); + LGraph lgraph; + if (filetype == "el") { + std::string filename = path + dataset_str + ".el"; + printf("Reading .el file: %s\n", filename.c_str()); + lgraph.read_edgelist(filename.c_str(), true); // symmetrize + genGraph(lgraph, graph_cpu); + lgraph.clean(); + } else if (filetype == "gr") { + std::string filename = path + dataset_str + ".csgr"; + printf("Reading .gr file: %s\n", filename.c_str()); + galois::graphs::readGraph(graph_cpu, filename); + } else { + printf("Unkown file format\n"); + exit(1); + } + Tread.stop(); + std::cout << "num_vertices " << graph_cpu.size() << " num_edges " + << graph_cpu.sizeEdges() << "\n"; + return graph_cpu.size(); } -void Context::genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } +void Context::genGraph(LGraph& lg, Graph& g) { + g.allocateFrom(lg.num_vertices(), lg.num_edges()); + g.constructNodes(); + for (size_t i = 0; i < lg.num_vertices(); i++) { + g.getData(i) = 1; + auto row_begin = lg.get_offset(i); + auto row_end = lg.get_offset(i + 1); + g.fixEndEdge(i, row_end); + for (auto offset = row_begin; offset < row_end; offset++) + g.constructEdge(offset, lg.get_dest(offset), 0); + } +} +float_t* Context::get_in_ptr() { return &h_feats[0]; } +#else +size_t Context::read_graph_gpu(std::string dataset_str) { + std::string filename = path + dataset_str + ".csgr"; + graph_gpu.read(filename.c_str(), false); + return graph_gpu.nnodes; } -float_t * Context::get_in_ptr() { return &h_feats[0]; } +void Context::copy_data_to_device() { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), + cudaMemcpyHostToDevice)); +} +float_t* Context::get_in_ptr() { return d_feats; } #endif // user-defined pre-computing function, called during initialization // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), [&] (auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -#else - norm_factor_counting_gpu(); + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + float_t temp = std::sqrt(float_t(degrees[v])); + if (temp == 0.0) + norm_factor[v] = 0.0; + else + norm_factor[v] = 1.0 / temp; + }, + 
galois::loopname("NormCounting")); #endif } -// labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). -// Note that labels is not one-hot encoded vector and it can be computed -// as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. +void Context::degree_counting() { +#ifdef CPU_ONLY + degrees.resize(n); + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + degrees[v] = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + }, + galois::loopname("DegreeCounting")); +#endif +} + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. size_t Context::read_labels(std::string dataset_str) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m: number of samples - in >> m >> num_classes >> std::ws; - assert(m == n); - labels.resize(m, 0); // label for each vertex: N x 1 - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < num_classes; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; - return num_classes; + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m: number of samples + in >> m >> num_classes >> std::ws; + assert(m == n); + labels.resize(m, 0); // label for each vertex: N x 1 + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (x != 0) { + labels[v] = idx; + break; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + return num_classes; } size_t Context::read_features(std::string dataset_str) { - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m = number of vertices - in >> m >> feat_len >> std::ws; - //assert(m == ); - h_feats.resize(m*feat_len, 0); - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u*feat_len+v] = w; - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; - return feat_len; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; // m = number of vertices + in >> m >> feat_len >> std::ws; + // assert(m == ); + h_feats.resize(m * feat_len, 0); + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u * feat_len + v] = w; + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + return feat_len; } /* inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); + std::default_random_engine rng; + std::uniform_real_distribution dist(0, 0.1); + for (size_t i = 0; i < dim; ++i) + x[i] = dist(rng); } //*/ - diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 182deeaed0..b68f07ab98 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -6,81 +6,96 @@ // random seeding int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; + int64_t s, seed, pid; + FILE* f = fopen("/dev/urandom", "rb"); + if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { + fclose(f); + return seed; + } + std::cout << "System entropy source not available, " + "using fallback algorithm to generate seed instead."; + if (f) + fclose(f); + pid = getpid(); + s = time(NULL); + seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); + return seed; } -__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, float_t *norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; - } +__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) + norm_fac[i] = 0.0; + else + norm_fac[i] = 1.0 / temp; + } } void Context::norm_factor_counting_gpu() { - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; - assert(graph_gpu.nnodes == n); - CUDA_CHECK(cudaMalloc((void **)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>(n, graph_gpu, d_norm_factor); - CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; + assert(graph_gpu.nnodes == n); + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); + norm_factor_counting_kernel<<>>( + n, graph_gpu, d_norm_factor); + CudaTest("solving norm_factor_counting kernel failed"); } -cublasHandle_t Context::cublas_handle_ = 0; +cublasHandle_t Context::cublas_handle_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() : mode_(Context::GPU), solver_count_(1), - solver_rank_(0), multiprocess_(false) { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, 
CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); +Context::Context() + : mode_(Context::GPU), solver_count_(1), solver_rank_(0), + multiprocess_(false) { + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) { + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + } } void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + int current_device; + CUDA_CHECK(cudaGetDevice(¤t_device)); + if (current_device == device_id) + return; + CUDA_CHECK(cudaSetDevice(device_id)); + if (cublas_handle_) + CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) + CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + CUBLAS_CHECK(cublasCreate(&cublas_handle_)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - CSRGraph g; - g.read(filename.c_str(), false); - g.copy_to_gpu(graph_gpu); - return graph_gpu.nnodes; + std::string filename = path + dataset_str + ".csgr"; + CSRGraph g; + g.read(filename.c_str(), false); + g.copy_to_gpu(graph_gpu); + return graph_gpu.nnodes; } void Context::copy_data_to_device() { - assert(labels.size() == n); - CUDA_CHECK(cudaMalloc((void **)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void **)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + assert(labels.size() == n); + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), + cudaMemcpyHostToDevice)); } -float_t * Context::get_in_ptr() { return d_feats; } - +float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 86f39ade20..06ec53b2db 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,126 +1,164 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph &g, const float_t *in, 
float_t *out) { - update_all(len, g, in, out, true, context->norm_factor); +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(size_t len, CSRGraph &g, const float_t *in, float_t *out) { - update_all(len, g, in, out, true, context->d_norm_factor); +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, + float_t* out) { + update_all(len, g, in, out, true, context->d_norm_factor); #endif } -void graph_conv_layer::combine(const vec_t &self, const vec_t &neighbors, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(Q, self, a); - mvmul(W, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors +void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, + vec_t& out) { + vec_t a(out.size(), 0); + vec_t b(out.size(), 0); + mvmul(Q, self, a); + mvmul(W, neighbors, b); + vadd(a, b, out); // out = W*self + Q*neighbors } -graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, - bool dropout, float dropout_rate, std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), act_(act), norm_(norm), - bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - init(); - scale_ = 1. / (1. - dropout_rate_); +graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, + bool bias, bool dropout, float dropout_rate, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + x = input_dims[0]; + y = input_dims[1]; + z = output_dims[1]; + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + init(); + scale_ = 1. / (1. - dropout_rate_); } void graph_conv_layer::init() { - Timer t_alloc; - t_alloc.Start(); - //std::cout << name_ << ": allocating memory for parameters and intermediate data... "; + std::cout << name_ + << ": allocating memory for parameters and intermediate data... 
"; + Timer t_alloc; + t_alloc.Start(); #ifdef CPU_ONLY - rand_init_matrix(y, z, W); // randomly initialize trainable parameters - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - if (dropout_) dropout_mask = new unsigned[x*y]; - in_temp = new float_t[x*y]; - out_temp = new float_t[x*z]; // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - trans_data = new float_t[y*x]; // y*x + rand_init_matrix(y, z, W); // randomly initialize trainable parameters + // rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, weight_grad); + if (dropout_) + dropout_mask = new unsigned[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t + [x * z]; // same as pre_sup in original GCN code: + // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, + d_weight_grad); #endif - t_alloc.Stop(); - //std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; + t_alloc.Stop(); + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(z, &out_data[i*z], &out_data[i*z]); - }, galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, + galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + if (act_) { + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { -//void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t &out_data, vec_t &out_grad, vec_t &in_grad) { - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) //TODO: use in_data or out_data? 
- out_temp[i*z+j] = out_data[i*z+j] > float_t(0) ? out_grad[i*z+j] : float_t(0); - }, galois::loopname("d_relu")); - } else copy1D1D(x*z, out_grad, out_temp); // TODO: avoid copying - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y - //sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> x*y - //NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i*y], &dropout_mask[i*y], &in_grad[i*y]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + // void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) { + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + for (size_t j = 0; j < z; + ++j) // TODO: use in_data or out_data? + out_temp[i * z + j] = out_data[i * z + j] > float_t(0) + ? out_grad[i * z + j] + : float_t(0); + }, + galois::loopname("d_relu")); + } else + copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + if (level_ != 0) { // no need to calculate in_grad for the first layer + vec_t trans_W(z * y); + transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> + // x*y NOTE: since graph is symmetric, the derivative is the same + update_all(y, context->graph_cpu, in_temp, in_grad, true, + context->norm_factor); // x*x; x*y -> x*y + if (dropout_) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + d_dropout(y, scale_, &in_grad[i * y], + &dropout_mask[i * y], &in_grad[i * y]); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("d_dropout")); + } + } + // calculate weight gradients + transpose(x, y, in_data, trans_data); // y*x + matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x*z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + assert(y <= 128); // currently only support feature length <= 128 + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + 
} else + matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); - else copy_gpu(x*z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_temp); + else + copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, + context->d_norm_factor); + if (dropout_) + d_dropout(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index eb02f66d50..0c52d0eb25 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,31 +1,45 @@ #include "layers/relu_layer.h" // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const tensor_t &in_data, tensor_t &out_data) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); +void relu_layer::forward_propagation(const tensor_t& in_data, + tensor_t& out_data) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), + [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + out_data[i][j] = std::max(in_data[i][j], (float_t)0); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("relu_layer-fw")); } // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - const size_t count = input_dims[0] * input_dims[1]; - relu_gpu(count, in_data, out_data); +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-bw")); +void relu_layer::back_propagation(const tensor_t& in_data, + const tensor_t& out_data, tensor_t& out_grad, + tensor_t& in_grad) { + galois::do_all(galois::iterate((size_t)0, input_dims[0]), + [&](const auto& i) { + for (size_t j = 0; j < input_dims[1]; ++j) + in_grad[i][j] = out_data[i][j] > float_t(0) + ? out_grad[i][j] + : float_t(0); + }, + galois::chunk_size(), galois::steal(), + galois::loopname("relu_layer-bw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - const size_t count = input_dims[0] * input_dims[1]; - d_relu_gpu(count, out_grad, in_data, in_grad); +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 430e1f253b..579de65667 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,71 +1,72 @@ #include "layers/softmax_loss_layer.h" -softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : layer(level, in_dims, out_dims) { - trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); - std::cout << name_ << ": allocating memory for intermediate data... "; +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); #ifdef CPU_ONLY - loss = new float_t[in_dims[0]]; // error for each sample + loss = new float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + out_malloc_device(in_dims[0], masks_, d_masks_, loss); #endif - std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t + // &out_data) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(len, &in_data[len * i], + &out_data[len * i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[context->get_label(i)] = 1.0; // one-hot + loss[i] = cross_entropy(len, &y[0], &out_data[len * i]); + } + }, + 
galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-fw")); } -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len*i], &norm_grad[0]); - d_softmax(len, &in_data[len*i], &out_data[len*i], &in_grad[len*i], &norm_grad[0]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); +// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t +// &out_data, vec_t &out_grad, vec_t &in_grad) { +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len * i], + &norm_grad[0]); + d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-bw")); } - -acc_t softmax_loss_layer::get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; -} - #else // GPU implementation -void softmax_loss_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, loss, out_data); -} - -void softmax_loss_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, context->d_labels, out_data, in_grad); +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, loss, out_data); } -acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, masks_, loss); +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6b41afb020..2e2d68f05d 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,463 +10,497 @@ extern "C" { // vector add #if defined(__AVX__) || defined(__AVX2__) -void vadd(const vec_t &a, const vec_t &b, vec_t &out) { - //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - 
_mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + // for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; + size_t n = out.size(); + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + out[i] = a[i] + b[i]; +} + +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { + size_t vec_len = 8; + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps( + &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + out[i] = a[i] + b[i]; } #else -void vadd(const vec_t &a, const vec_t &b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t *a, const float_t *b, float_t *out) { - for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; +void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = a[i] + b[i]; } #endif // vector subtract -void vsub(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; +void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = in_a[i] - in_b[i]; } // vector multiply -void vmul(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; +void vmul(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) + out[i] = in_a[i] * in_b[i]; } // vector divide -void vdiv(const vec_t &in_a, const vec_t &in_b, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } +void vdiv(const vec_t& in_a, const vec_t& in_b, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + assert(in_b[i] != 0); + out[i] = in_a[i] / in_b[i]; + } } // vector add scalar -void add_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; +void add_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] += alpha; } // vector subtract scalar -void sub_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; +void sub_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] -= alpha; } // vector multiply scalar -void mul_scalar(const float_t alpha, vec_t &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; +void mul_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; } -void mul_scalar(size_t n, const float_t alpha, const float_t *in, float_t *out) { - for (size_t i = 0; i < n; ++i) 
out[i] = alpha *in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; } // vector divide scalar -void div_scalar(const float_t alpha, vec_t &Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; +void div_scalar(const float_t alpha, vec_t& Y) { + assert(alpha != 0); + for (size_t i = 0; i < Y.size(); ++i) + Y[i] /= alpha; } // dot product -float_t dot(const vec_t &x, const vec_t &y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; +float_t dot(const vec_t& x, const vec_t& y) { + float_t sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; } -float_t dot(size_t n, const float_t *x, const float_t *y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; +float_t dot(size_t n, const float_t* x, const float_t* y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; } // matrix-vector multiply -void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i*n+j] * in_vector[j]; - } - } +void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector) { + size_t m = out_vector.size(); + size_t n = in_vector.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out_vector[i] += matrix[i * n + j] * in_vector[j]; + } + } } // vector-vector multiply -void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } +void vvmul(const vec_t& a, const vec_t& b, tensor_t& out) { + size_t m = a.size(); + size_t n = b.size(); + for (size_t i = 0; i < m; ++i) { + for (size_t j = 0; j < n; ++j) { + out[i][j] += a[i] * b[j]; + } + } } // matrix addition -void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; +void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, + tensor_t& C) { + for (size_t i = 0; i < x; ++i) + for (size_t j = 0; j < y; ++j) + C[i][j] = A[i][j] + B[i][j]; } // TODO: vectorize -void copy2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -} - -void copy1D1D(const vec_t &in, vec_t &out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy1D1D(size_t len, const float_t *in, float_t *out) { - std::copy(in, in+len, out); -} - -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); -} - -void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t *A, const float_t *B, float_t *C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); -} - -void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - vec_t A1D(dim_x*dim_z); - copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); -} - -void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y*dim_z); - vec_t A1D(dim_x*dim_z); - vec_t C1D(dim_x*dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i*dim_y+j]; - } - } -} - -void transpose2D(const tensor_t &in, tensor_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i][j] = in[j][i]; - } - } +void copy2D1D(const tensor_t& in, vec_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + auto ptr = &out[0]; + for (size_t i = 0; i < x; i++) { + std::copy(in[i].begin(), in[i].end(), ptr); + ptr += y; + } +} + +void copy1D1D(const vec_t& in, vec_t& out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t* in, float_t* out) { + std::copy(in, in + len, out); +} + +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? 
N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); +} + +void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(C.size() == dim_x); + assert(B.size() == dim_z); + assert(B[0].size() == dim_y); + + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = 0; + for (size_t k = 0; k < dim_z; ++k) { + C[i][j] += A[i][k] * B[k][j]; + } + } + } +} + +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); + Tmatmul.stop(); +} + +void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, + vec_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = A.size(); + size_t dim_z = A[0].size(); + assert(B.size() == dim_z * dim_y); + assert(C.size() == dim_x * dim_y); + vec_t A1D(dim_x * dim_z); + copy2D1D(A, A1D); + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); +} + +void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { + // A: x*z; B: z*y; C: x*y + size_t dim_x = C.size(); + size_t dim_y = C[0].size(); + size_t dim_z = A[0].size(); + assert(A.size() == dim_x); + assert(B.size() == dim_y * dim_z); + vec_t A1D(dim_x * dim_z); + vec_t C1D(dim_x * dim_y, 0); + auto ptr = &A1D[0]; + for (size_t i = 0; i < dim_x; i++) { + std::copy(A[i].begin(), A[i].end(), ptr); + ptr += dim_z; + } + matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); + for (size_t i = 0; i < dim_x; i++) { + for (size_t j = 0; j < dim_y; ++j) { + C[i][j] = C1D[i * dim_y + j]; + } + } +} + +void transpose2D(const tensor_t& in, tensor_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i][j] = in[j][i]; + } + } } // TODO: vectorize -void transpose2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x*y); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j][i]; - } - } +void transpose2D1D(const tensor_t& in, vec_t& out) { + size_t x = in.size(); + size_t y = in[0].size(); + assert(out.size() == x * y); + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j][i]; + } + } } -void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } } -void transpose(size_t x, size_t y, const float_t *in, float_t *out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } } -int argmax(const size_t n, const vec_t &x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; +int argmax(const size_t n, 
const vec_t& x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; } -int argmax(const size_t n, const float_t *x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; } -void clear(vec_t &in) { - for (size_t i = 0; i < in.size(); i++) in[i] = 0; +void clear(vec_t& in) { + for (size_t i = 0; i < in.size(); i++) + in[i] = 0; } -void clear(size_t n, float_t *in) { - for (size_t i = 0; i < n; i++) in[i] = 0; +void clear(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) + in[i] = 0; } -void relu(const vec_t &in, vec_t &out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + negative_slope * std::min(in[i], (float_t)0); - } +void relu(const vec_t& in, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (float_t)0) + + negative_slope * std::min(in[i], (float_t)0); + } } -void relu(size_t n, const float_t *in, float_t *out) { - for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t(0)); +void relu(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t(0)); } -void d_relu(const vec_t &in_diff, const vec_t &fv, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + negative_slope * (fv[i] <= (float_t)0)); - } +void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + + negative_slope * (fv[i] <= (float_t)0)); + } } -void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff +void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { + vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } -void d_vadd(vec_t &in_diff, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; +void d_vadd(vec_t& in_diff, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) + out_diff[i] = in_diff[i]; } -float reduce_mean(const vec_t &x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; +float reduce_mean(const vec_t& x) { + size_t n = x.size(); + assert(n > 0); + float sum = (float)x[0]; + for (size_t i = 1; i < n; i++) { + sum += (float)x[i]; + } + return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, vec_t &out) { - assert(masks.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out) { + assert(mask.size() == out.size()); + // rng_bernoulli(1. 
- dropout_rate, mask); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t &in, std::vector &masks, float_t *out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out) { + for (size_t i = 0; i < in.size(); ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * mask[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - for (size_t i = 0; i < n; ++i) - masks[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out) { + for (size_t i = 0; i < n; ++i) + mask[i] = bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * mask[i] * scale; } -void d_dropout(const float scale, const vec_t &in_diff, std::vector &masks, vec_t &out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, vec_t& out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t *in_diff, unsigned *masks, float_t *out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * mask[i] * scale; } -float_t sigmoid_func(float_t x) { - return 0.5 * tanh(0.5 * x) + 0.5; -} +float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } // Sigmoid -void sigmoid(vec_t &fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } +void sigmoid(vec_t& fv) { + size_t count = fv.size(); + for (size_t i = 0; i < count; ++i) { + fv[i] = sigmoid_func(fv[i]); + } } // Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add upto 1. -// To make softmax func numerically stable, we simply normalize the values in the vector, -// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) +// and transforms it into a vector of real number in range (0,1) which add +// upto 1. 
To make softmax func numerically stable, we simply normalize the +// values in the vector, by multiplying the numerator and denominator with a +// constant C, where log(C)=-max(X) // exps = np.exp(X - np.max(X)) // exps / np.sum(exps) -void softmax(const vec_t &input, vec_t &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -void softmax(size_t n, const float_t *input, float_t *output) { - const float_t max = *std::max_element(input, input+n); - float_t denominator(0); - for (size_t i = 0; i < n; i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) - output[i] /= denominator; -} - -void log_softmax(const vec_t &input, vec_t &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability distribution, -// we often use it as the final layer in neural networks. -// For this we need to calculate the derivative or gradient, -// and pass it back to the previous layer during backpropagation. -void d_softmax(const vec_t &y, const vec_t &p, vec_t &dy, const vec_t &dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - //float_t delta_ij = i == j? 1 : 0; - //df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - -void d_softmax(size_t n, const float_t *y, const float_t *p, float_t *dy, const float_t *dp) { - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - dy[i] = dot(n, dp, &df[0]); - } +void softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void log_softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) + denominator += std::exp(input[i] - max); + for (size_t i = 0; i < input.size(); i++) + output[i] = input[i] - max - denominator; +} + +// Due to the desirable property of softmax function outputting a probability +// distribution, we often use it as the final layer in neural networks. For this +// we need to calculate the derivative or gradient, and pass it back to the +// previous layer during backpropagation. 
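For reference, the two d_softmax overloads below compute a softmax Jacobian-vector product. Writing p = softmax(x) for the predicted probabilities and dp for the gradient arriving from the loss (the same names used in the code), a compact way to state the relation is

    \frac{\partial p_i}{\partial x_j} = p_i(\delta_{ij} - p_j), \qquad
    dy_i = \sum_j \frac{\partial p_i}{\partial x_j}\, dp_j
         = p_i\Big(dp_i - \sum_j p_j\, dp_j\Big),

where the inner loop over j materializes one row of the Jacobian in df before taking its dot product with dp.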
+void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + // float_t delta_ij = i == j? 1 : 0; + // df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +} + +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } } // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -float_t cross_entropy(const vec_t &y, const vec_t &p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - //loss -= y[i] * std::log(p[i]); - } - return loss; -} - -float_t cross_entropy(size_t n, const float_t *y, const float_t *p) { - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - -void d_cross_entropy(const vec_t &y, const vec_t &p, vec_t &d) { - auto n = y.size(); - //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - //d[i] = p[i] - y[i]; - } -} - -void d_cross_entropy(size_t n, const float_t *y, const float_t *p, float_t *d) { - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } +float_t cross_entropy(const vec_t& y, const vec_t& p) { + auto n = y.size(); + assert(n > 0); + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * + // std::log(float_t(1e-10)); + else + loss -= + y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) + // - p[i]); loss -= y[i] * std::log(p[i]); + } + return loss; +} + +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { + auto n = y.size(); + // for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - + // p[i])); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + // d[i] = p[i] - y[i]; + } +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } } - diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 415e141ec9..8dbe141c96 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,288 +1,253 @@ #include "math_functions.hh" 
#include "context.h" -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" -#include - -void gpu_rng_uniform(const int n, unsigned *r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); -} - -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) scal_gpu(n, range, r); - if (a != float_t(0)) add_scalar_gpu(n, a, r); -} - -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); -} - -void loss_malloc_device(int n, float_t *loss) { - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); -} - -void copy_masks_device(int n, mask_t *h_masks, mask_t *d_masks) { - assert(h_masks != NULL); - CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +void gpu_rng_uniform(const int n, unsigned* r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *masks, float_t *in, float_t *out, float_t *matrix, float_t *grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0/(y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y*z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, + float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); } -void copy_gpu(size_t len, const float_t *in, float_t *out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -__global__ void setup_curand_kernel(const int n, curandState *state) { - CUDA_KERNEL_LOOP(i, n) { - curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed - } +void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss) { + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -__device__ bool bernoulli_gpu(int tid, curandState *state, float_t p) { - curandState local_state = state[tid]; - return curand_uniform(&local_state) <= p; +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad) { + if (dropout) + CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + 
CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y * z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - masks[i] = bernoulli_gpu(i, state, dropout_rate); - out[i] = in[i] * masks[i] * scale; - } +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - curandState *devStates; - CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); - std::cout << "[debug]: setup curand, n = " << n << "\n"; - setup_curand_kernel<<>>(n, devStates); - CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); - CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; +__global__ void dropout_kernel(const int n, const float scale, + const float dropout_rate, const float_t* in, + unsigned* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + // masks[i] = bernoulli(dropout_rate); + out[i] = in[i] * masks[i] * scale; + } } -__global__ void d_dropout_kernel(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { - CUDA_KERNEL_LOOP(i, n) { - out[i] = in[i] * masks[i] * scale; - } -} - -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out) { - d_dropout_kernel<<>>(n, scale, in, masks, out); - CudaTest("solving dropout kernel failed"); +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { - out[index] = in[index] > 0 ? in[index] : 0; - } + CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -void relu_gpu(const int n, const float_t *in, float_t* out) { - std::cout << "[debug]: relu_gpu\n"; - relu_kernel<<>>(n, in, out); - CudaTest("solving relu kernel failed"); +void relu_gpu(const int n, const float_t* in, float_t* out) { + relu_kernel<<>>(n, in, out); } -__global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = data[index] > 0 ? in_diff[index] : 0; - } +__global__ void d_relu_kernel(const int n, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(index, n) { + out_diff[index] = data[index] > 0 ? 
in_diff[index] : 0; + } } -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff) { - d_relu_kernel<<>>(n, in_diff, data, out_diff); - CudaTest("solving d_relu kernel failed"); +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff) { + d_relu_kernel<<>>(n, in_diff, data, + out_diff); } -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - // Note that cublas follows fortran order. - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + // Note that cublas follows fortran order. + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + cublasOperation_t cuTransB = + (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, + K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { - std::cout << "[debug]: matmul1D1D_gpu\n"; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } // the arguments of the maxima -int argmax_gpu(const size_t n, const float_t *x) { - return 0; -} +int argmax_gpu(const size_t n, const float_t* x) { return 0; } -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + N, x, 1, &beta, y, 1)); } -void scal_gpu(const int N, const float alpha, float *X) { - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); +void scal_gpu(const int N, const float alpha, float* X) { + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); } -void scale_gpu(const int n, const float alpha, const float *x, float* y) { - CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); +void scale_gpu(const int n, const float alpha, const float* x, float* y) { + CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = alpha; - } + CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } } void set_gpu(const int N, const float_t alpha, float_t* Y) { - if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); - return; - } - set_kernel<<>>(N, alpha, Y); - CudaTest("solving set kernel failed"); + if (alpha == 0) { + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + return; + } + set_kernel<<>>(N, alpha, Y); } -__global__ void add_scalar_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] += alpha; - } +__global__ void add_scalar_kernel(const int n, const float_t alpha, + float_t* y) { + CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } } void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { - add_scalar_kernel<<>>(N, alpha, Y); - CudaTest("solving add_scalar kernel failed"); + add_scalar_kernel<<>>(N, alpha, Y); } -__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { - y[index] = a[index] + b[index]; - } +__global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, + float_t* y) { + CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { - vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); + vadd_kernel<<>>(N, a, b, y); } // TODO: use warp -__device__ void softmax(int n, const float_t *input, float_t *output) { - float_t max = input[0]; - for (size_t i = 1; i < n; i++) if (input[i] > max) max = input[i]; - float_t denominator = 0.0; - for (size_t i = 0; i < n; i++) { - output[i] = exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) output[i] /= denominator; +__device__ void softmax(int n, const float_t* input, float_t* output) { + float_t max = input[0]; + for (size_t i = 1; i < n; i++) + if (input[i] > max) + max = input[i]; + float_t denominator = 0.0; + for (size_t i = 0; i < n; i++) { + output[i] = exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; } // TODO: use warp -__device__ void d_softmax(size_t n, const 
float_t *p, const float_t *dp, float_t *dy) { - for (size_t i = 0; i < n; i++) { - dy[i] = 0; - for (size_t j = 0; j < n; j++) { - float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; - dy[i] += df * dp[j]; - } - } -} - -__device__ void cross_entropy(int n, const label_t idx, const float_t *p, float_t &loss) { - if (p[idx] == 0.0) loss -= log(float_t(1e-10)); - else loss -= log(p[idx]); -} - -__device__ void d_cross_entropy(int n, const label_t idx, const float_t *p, float_t *d) { - for (int i = 0; i < n; i++) - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, + float_t* dy) { + for (size_t i = 0; i < n; i++) { + dy[i] = 0; + for (size_t j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void cross_entropy(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= log(float_t(1e-10)); + else + loss -= log(p[idx]); +} + +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, + float_t* d) { + for (int i = 0; i < n; i++) + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; } // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, - const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len*i], loss[i]); - } - } -} - -void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { - softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); - CudaTest("solving softmax_cross_entropy kernel failed"); -} - -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, - const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out+len*i, out_grad); - d_softmax(len, out+len*i, out_grad, diff+len*i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); -} - -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) - //total += loss[begin+i]; - total.reduce(loss[begin+i]); - } - total.thread_exit >(local_loss); -} - -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()); +__global__ void softmax_cross_entropy_kernel(int n, int len, + const float_t* in_data, + 
const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data + len * i, + out_data + len * i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + } + } +} + +void softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, loss, out); +} + +__global__ void +d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; + d_cross_entropy(len, labels[i], out + len * i, out_grad); + d_softmax(len, out + len * i, out_grad, diff + len * i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, out, diff); } - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1d81ea1012..6625f283b3 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,87 +1,102 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); - num_samples = context->read_graph(dataset_str); - num_classes = context->read_labels(dataset_str); - context->norm_factor_counting(); // pre-compute normalizing factor - num_epochs = epochs; + context = new Context(); + // Context::create_blas_handle(); + num_samples = context->read_graph(dataset_str); + num_classes = context->read_labels(dataset_str); + context->degree_counting(); + context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; - std::cout << "Reading label masks ... "; - train_mask.resize(num_samples, 0); - val_mask.resize(num_samples, 0); - if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; + std::cout << "Reading label masks ... 
"; + train_mask.resize(num_samples, 0); + val_mask.resize(num_samples, 0); + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + for (size_t i = train_begin; i < train_end; i++) + train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) + val_mask[i] = 1; + } else { + train_count = + read_masks(dataset_str, "train", train_begin, train_end, train_mask); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + } + std::cout << "Done\n"; - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); + num_layers = NUM_CONV_LAYERS + 1; + feature_dims.resize(num_layers + 1); + feature_dims[0] = + context->read_features(dataset_str); // input feature dimension: D + feature_dims[1] = hidden1; // hidden1 level embedding: 16 + feature_dims[2] = num_classes; // output embedding: E + feature_dims[3] = num_classes; // normalized output embedding: E + layers.resize(num_layers); #ifndef CPU_ONLY - context->copy_data_to_device(); // copy labels and input features to the device + context + ->copy_data_to_device(); // copy labels and input features to the device #endif } -void Net::train(optimizer *opt, bool need_validate) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (unsigned i = 0; i < num_epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); +void Net::train(optimizer* opt, bool need_validate) { + std::cout << "\nStart training...\n"; + galois::StatTimer Tupdate("Train-WeightUpdate"); + galois::StatTimer Tfw("Train-Forward"); + galois::StatTimer Tbw("Train-Backward"); + galois::StatTimer Tval("Validation"); + Timer t_epoch; + // run epoches + for (unsigned i = 0; i < num_epochs; i++) { + std::cout << "Epoch " << std::setw(2) << i << std::fixed + << std::setprecision(3) << ":"; + t_epoch.Start(); - // training steps - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, &train_mask[0]); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, &train_mask[0]); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphases(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - if (need_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time 
<< ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } + // training steps + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + Tfw.start(); + train_loss = + fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_acc = masked_accuracy(train_begin, train_end, train_count, + &train_mask[0]); // predict + Tfw.stop(); + Tbw.start(); + bprop(); // back propogation + Tbw.stop(); + Tupdate.start(); + update_weights(opt); // update parameters + Tupdate.stop(); + set_netphases(net_phase::test); + std::cout << " train_loss = " << std::setw(5) << train_loss + << " train_acc = " << std::setw(5) << train_acc; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + if (need_validate) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + Tval.start(); + double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], + val_loss, val_acc); + Tval.stop(); + std::cout << " val_loss = " << std::setw(5) << val_loss + << " val_acc = " << std::setw(5) << val_acc; + std::cout << " time = " << epoch_time + val_time + << " ms (train_time = " << epoch_time + << " val_time = " << val_time << ")\n"; + } else { + std::cout << " train_time = " << epoch_time << " ms\n"; + } + } } void Net::construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(context->get_in_ptr()); // feed input data - set_contexts(); + std::cout << "\nConstructing layers...\n"; + append_conv_layer(0, true); // first conv layer + append_conv_layer(1); // hidden1 layer + append_out_layer(2); // output layer + layers[0]->set_in_data(context->get_in_ptr()); // feed input data + set_contexts(); } - diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 5b60a9f22a..f4278688d1 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,38 +1,37 @@ #include "node.h" -#include void edge::alloc() { - //std::cout << "Allocating memory for tensors (intermediate features and gradients) ...\n"; #ifdef CPU_ONLY - data_ = new float_t[num_samples_ * ft_dim_]; - grad_ = new float_t[num_samples_ * ft_dim_]; + data_ = new float_t[num_samples_ * ft_dim_]; + grad_ = new float_t[num_samples_ * ft_dim_]; #else - alloc_gpu(); + alloc_gpu(); #endif } -void edge::merge_grads(vec_t *dst) { - assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t *pdst = &(*dst)[0]; +void edge::merge_grads(vec_t* dst) { + assert(grad_ != NULL); + dst->resize(ft_dim_); + float_t* pdst = &(*dst)[0]; #ifdef CPU_ONLY - std::copy(grad_, grad_+ft_dim_, pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < num_samples_; ++sample) { - for (size_t i = 0; i < ft_dim_; i++) pdst[i] += grad_[sample*ft_dim_+i]; - //vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); - } + std::copy(grad_, grad_ + ft_dim_, pdst); + // @todo consider adding parallelism and vectorization + for (size_t sample = 1; sample < num_samples_; ++sample) { + for (size_t i = 0; i < ft_dim_; i++) + pdst[i] += grad_[sample * ft_dim_ + i]; + // vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + } #else - merge_grads_gpu(pdst); + merge_grads_gpu(pdst); #endif } void edge::clear_grads() { #ifdef CPU_ONLY - std::fill(grad_, grad_+ft_dim_*num_samples_, float_t(0)); // TODO: need vectorize - //vectorize::fill(&grad_[0], grad_.size(), float_t(0)); + std::fill(grad_, grad_ + ft_dim_ * 
num_samples_, + float_t(0)); // TODO: need vectorize + // vectorize::fill(&grad_[0], grad_.size(), float_t(0)); #else - clear_grads_gpu(); + clear_grads_gpu(); #endif } - diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index da79217231..2443e9ed7c 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -2,14 +2,17 @@ #include "cutils.h" void edge::alloc_gpu() { - CUDA_CHECK(cudaMalloc((void **)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t *dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); +void edge::merge_grads_gpu(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_ * num_samples_ * sizeof(float_t))); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 3372378de1..fb10221f19 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,76 +1,89 @@ #include "optimizer.h" #include "galois/Galois.h" -void adagrad::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } +void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + if (parallelize) { + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + } else { + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + } } -void RMSprop::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); +void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& g = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); } -void adam::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; +void adam::update(const vec_t& dW, 
vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& vt = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); + b1_t *= b1; + b2_t *= b2; } -void adamax::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; +void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& mt = get<0>(W); + vec_t& ut = get<1>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); + b1_t *= b1; } -void gradient_descent::update(const vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); +void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); } -void momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); +void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); } -void nesterov_momentum::update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); +void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { + vec_t& dWprev = get<0>(W); + galois::do_all(galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); } - diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 832da51cbf..908ce4f32a 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,3 @@ #include "optimizer.h" -void adam::update_gpu(const float_t *dW, float_t *W) { 
-} +void adam::update_gpu(const float_t* dW, float_t* W) {} From 51efbcb099bf2452dd73d83d9b6cc9b95b012077 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 16:58:39 -0600 Subject: [PATCH 034/660] fix gpu bugs --- libdeepgalois/include/layers/layer.h | 132 ++++------ libdeepgalois/include/math_functions.hh | 50 ++-- libdeepgalois/include/net.h | 45 +--- libdeepgalois/src/aggregator.cu | 12 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 87 +++---- .../src/layers/softmax_loss_layer.cpp | 7 +- libdeepgalois/src/math_functions.cu | 246 +++++++++++------- libdeepgalois/src/net.cpp | 17 ++ libgpu/include/graph_gpu.h | 8 +- 9 files changed, 306 insertions(+), 298 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index c0e694d21c..7b8bbc55a4 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -52,83 +52,61 @@ class layer : public node { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, - size_t sample_count, mask_t* masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; - } - void set_in_data(float_t* data) { - assert(data.size() == input_dims[0] * input_dims[1]); - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - // prev_->get_data() = data; - // std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - // next_->get_data().resize(output_dims[0]*output_dims[1]); - } - void alloc_grad() { - // allocate memory for intermediate gradients - // next_->get_gradient().resize(output_dims[0]*output_dims[1]); - } - void forward() { - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - back_propagation(prev()->get_data(), next()->get_data(), - next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer* opt) { - // parallelize only when target size is big enough to mitigate thread - // spawning overhead. 
- bool parallel = (W.size() >= 512); - // vec_t diff; - // prev()->merge_grads(&diff); - // auto in_data = prev()->get_data(); - // float_t rcp_batch_size = float_t(1.0) / in_data.size(); - // for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; - opt->update(weight_grad, W, parallel); // W += grad - // prev()->clear_grads(); - next()->clear_grads(); - } - inline acc_t get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + mask_t *get_device_masks() { return d_masks_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() + << " input[" << input_dims[0] << "," << input_dims[1] + << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; +#ifndef CPU_ONLY + copy_masks_device(input_dims[0], masks_, d_masks_); +#endif + } + void set_in_data(float_t *data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + void alloc_grad() { + // allocate memory for intermediate gradients + } + void forward() { + //std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + //std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer *opt) { + //std::cout << name_ << ": weight updating ... "; + //vec_t diff; + //prev()->merge_grads(&diff); +#ifdef CPU_ONLY + // parallelize only when target size is big enough to mitigate thread spawning overhead. 
+ bool parallel = (W.size() >= 512); + opt->update(weight_grad, W, parallel); // W += grad +#else + opt->update_gpu(d_weight_grad, d_W); // W += grad +#endif + //prev()->clear_grads(); + next()->clear_grads(); + } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 6f4348ff34..02afab2c49 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -70,37 +70,25 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad); -void copy_gpu(size_t len, const float_t* in, float_t* out); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned* masks, - float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, - float_t* out); // vector add -void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU -void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, - float_t* out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); // dropout -void d_dropout_gpu(const float scale, const float_t* in_diff, - const unsigned* mask, - float_t* out_diff); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply -int argmax_gpu(const size_t n, const float_t* x); // the arguments of the maxima -void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, - const mask_t* masks, const label_t* labels, - const float_t* out_data, float_t* diff); -void scal_gpu(const int N, const float alpha, float* X); +void copy_gpu(size_t len, const float_t *in, float_t *out); +void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add +void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU +void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply +void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t 
*labels, float_t *loss, float_t *out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); +void scal_gpu(const int N, const float alpha, float *X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels); + +void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out); +void loss_malloc_device(int n, float_t *&loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 87a0e3b72b..9d3e1c1184 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -62,17 +62,17 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) - layers[i]->forward(); - return layers[num_layers - 1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { + // set mask for the last layer + layers[num_layers-1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i ++) + layers[i]->forward(); + return layers[num_layers-1]->get_masked_loss(); + } // back propogation void bprop() { @@ -108,27 +108,8 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks); }; #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index ea41fd3dcb..c1f578caa1 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -32,10 +32,10 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, } } -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, 
- bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>( - n, len, g, in, out, norm, norm_factor); +void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { + unsigned n = g.nnodes; + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 06ec53b2db..1ef9be19c1 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -62,28 +62,22 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, - galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate - if (act_) { - galois::do_all( - galois::iterate((size_t)0, x), - [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, - galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); + }, galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); + if (act_) { + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + relu(z, &out_data[i*z], &out_data[i*z]); + }, galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -129,36 +123,29 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - assert(y <= 128); // currently only support feature length <= 128 - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else - matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) - relu_gpu(x * z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { + assert(y <= 128); // currently only support feature length <= 128 + 
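The CPU forward_propagation above computes h[l] = σ(W · Σ(h[l-1])): features are first multiplied by W (to shrink the feature length before aggregation when y > z), then summed over graph neighbors, then passed through ReLU. A serial, self-contained sketch of those steps over a CSR graph (illustrative only; the symmetric norm[v]*norm[u] scaling is one plausible choice for the normalization applied by the aggregate step):

// Serial sketch of the conv-layer forward pass: transform, aggregate, ReLU.
#include <algorithm>
#include <cstddef>
#include <vector>

// C (x by z) = A (x by y) * B (y by z), all row-major.
static void matmul(size_t x, size_t y, size_t z, const std::vector<float>& A,
                   const std::vector<float>& B, std::vector<float>& C) {
  C.assign(x * z, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t k = 0; k < y; k++)
      for (size_t j = 0; j < z; j++)
        C[i * z + j] += A[i * y + k] * B[k * z + j];
}

// out[v] = sum over neighbors u of norm[v]*norm[u]*in[u]; row_start/edge_dst is CSR.
static void aggregate_sketch(size_t n, size_t len,
                             const std::vector<size_t>& row_start,
                             const std::vector<size_t>& edge_dst,
                             const std::vector<float>& norm,
                             const std::vector<float>& in,
                             std::vector<float>& out) {
  out.assign(n * len, 0.0f);
  for (size_t v = 0; v < n; v++)
    for (size_t e = row_start[v]; e < row_start[v + 1]; e++) {
      size_t u = edge_dst[e];
      float s = norm[v] * norm[u];
      for (size_t f = 0; f < len; f++)
        out[v * len + f] += s * in[u * len + f];
    }
}

static void relu_inplace(std::vector<float>& v) {
  for (float& x : v) x = std::max(x, 0.0f);
}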
assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); + //std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" << out_data << "\n"; + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + //aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x*z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - if (act_) - d_relu_gpu(x * z, out_grad, out_data, out_temp); - else - copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, - in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, - context->d_norm_factor); - if (dropout_) - d_dropout(y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, - d_weight_grad); +void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { + if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); + else copy_gpu(x*z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout_gpu(x*y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 579de65667..6c29dc9a14 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -63,10 +63,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, out_data, in_grad); +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8dbe141c96..70ddd8826d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,18 +1,41 @@ #include "math_functions.hh" #include "context.h" -void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +void gpu_rng_uniform(const int n, unsigned *r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, - float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) - scal_gpu(n, range, r); - if (a != float_t(0)) - add_scalar_gpu(n, a, r); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - 
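The GPU back_propagation above realizes ∂E/∂y[l-1] = ∂E/∂y[l] · Wᵀ (followed by the same neighbor aggregation) and ∂E/∂W[l] = (y[l-1])ᵀ · ∂E/∂y[l], which is what the two sgemm calls with CblasTrans express. A serial sketch of those two transposed products (illustrative names, row-major storage as in the layer code):

// Serial sketch of the backward-pass products:
// in_grad = out_grad * W^T and weight_grad = in_data^T * out_grad.
#include <cstddef>
#include <vector>

// in_grad (x by y) = out_grad (x by z) * W^T, with W stored row-major as y by z.
static void grad_wrt_input(size_t x, size_t y, size_t z,
                           const std::vector<float>& out_grad,
                           const std::vector<float>& W,
                           std::vector<float>& in_grad) {
  in_grad.assign(x * y, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t j = 0; j < y; j++)
      for (size_t k = 0; k < z; k++)
        in_grad[i * y + j] += out_grad[i * z + k] * W[j * z + k];
}

// weight_grad (y by z) = in_data^T (y by x) * out_grad (x by z).
static void grad_wrt_weight(size_t x, size_t y, size_t z,
                            const std::vector<float>& in_data,
                            const std::vector<float>& out_grad,
                            std::vector<float>& weight_grad) {
  weight_grad.assign(y * z, 0.0f);
  for (size_t i = 0; i < x; i++)
    for (size_t j = 0; j < y; j++)
      for (size_t k = 0; k < z; k++)
        weight_grad[j * z + k] += in_data[i * y + j] * out_grad[i * z + k];
}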
a; + if (range != float_t(1)) scal_gpu(n, range, r); + if (a != float_t(0)) add_scalar_gpu(n, a, r); +} + +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { + CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +} + +void loss_malloc_device(int n, float_t *&loss) { + CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +} + +void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +} + +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad) { + if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0/(y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y*z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, @@ -21,40 +44,33 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void out_malloc_device(int n, mask_t* h_masks, mask_t* d_masks, float_t* loss) { - CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); +__global__ void setup_curand_kernel(const int n, curandState *state) { + CUDA_KERNEL_LOOP(i, n) { + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad) { - if (dropout) - CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0 / (y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y * z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
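gconv_malloc_device above initializes the y×z weight matrix from U(-r, r) with r = sqrt(6/(y+z)), the Glorot & Bengio (AISTATS 2010) rule. A host-side sketch of the same initialization using the standard <random> facilities (illustrative only):

// Glorot/Xavier uniform initialization: r = sqrt(6 / (fan_in + fan_out)), W ~ U(-r, r).
#include <cmath>
#include <cstddef>
#include <random>
#include <vector>

static std::vector<float> glorot_uniform(size_t fan_in, size_t fan_out,
                                         unsigned seed = 1) {
  float r = std::sqrt(6.0f / (float)(fan_in + fan_out));
  std::mt19937 gen(seed);
  std::uniform_real_distribution<float> dist(-r, r);
  std::vector<float> W(fan_in * fan_out);
  for (float& w : W) w = dist(gen); // one independent draw per weight
  return W;
}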
1 : 0; + masks[i] = 1.0 - dropout_rate; + out[i] = in[i] * masks[i] * scale; + } } -void copy_gpu(size_t len, const float_t* in, float_t* out) { - CUDA_CHECK( - cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); -} - -__global__ void dropout_kernel(const int n, const float scale, - const float dropout_rate, const float_t* in, - unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - // masks[i] = bernoulli(dropout_rate); - out[i] = in[i] * masks[i] * scale; - } +void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { + curandState *devStates; + CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); + //std::cout << "[debug]: setup curand, n = " << n << "\n"; + //setup_curand_kernel<<>>(n, devStates); + //CudaTest("solving setup_curand kernel failed"); + //std::cout << "[debug]: dropout_gpu\n"; + dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + //std::cout << "[debug]: dropout_gpu done\n"; } void dropout_gpu(const int n, const float scale, const float dropout_rate, @@ -68,8 +84,10 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } } -void relu_gpu(const int n, const float_t* in, float_t* out) { - relu_kernel<<>>(n, in, out); +void relu_gpu(const int n, const float_t *in, float_t* out) { + //std::cout << "[debug]: relu_gpu\n"; + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, @@ -99,23 +117,17 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, float_t* C) { - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { + //std::cout << "[debug]: matmul1D1D_gpu\n"; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// the arguments of the maxima -int argmax_gpu(const size_t n, const float_t* x) { return 0; } - -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, - const float beta, float* y) { - cublasOperation_t cuTransA = - (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, - N, x, 1, &beta, y, 1)); +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, float* y) { + cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
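The dropout_kernel added here only stores the keep probability into masks[i] as a placeholder; the intended behaviour, which the CPU path in math_functions.cpp implements with bernoulli(), is to draw a per-element keep mask and rescale the survivors. A self-contained sketch of that standard inverted-dropout scheme (illustrative only; the scale = 1/(1-rate) convention and P(keep) = 1-rate are assumptions, not taken from the patch):

// Standard inverted dropout: sample a keep mask, rescale kept activations.
#include <cstddef>
#include <random>
#include <vector>

static void dropout_sketch(float dropout_rate, const std::vector<float>& in,
                           std::vector<unsigned>& masks,
                           std::vector<float>& out, unsigned seed = 1) {
  const float scale = 1.0f / (1.0f - dropout_rate); // keeps the expected activation unchanged
  std::mt19937 gen(seed);
  std::bernoulli_distribution keep(1.0 - dropout_rate); // P(keep) = 1 - rate (assumed)
  masks.resize(in.size());
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); i++) {
    masks[i] = keep(gen) ? 1u : 0u;
    out[i] = in[i] * masks[i] * scale;
  }
}

// The backward pass (d_dropout in the patch) reuses the same mask:
// out_diff[i] = in_diff[i] * masks[i] * scale.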
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { @@ -212,42 +224,90 @@ __device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, - const float_t* in_data, - const mask_t* masks, - const label_t* labels, - float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data + len * i, - out_data + len * i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len * i], loss[i]); - } - } +__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, + const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { // masked + softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len*i], loss[i]); + } + } +} + +void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { + softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, + const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[i], out+len*i, out_grad); + d_softmax(len, out+len*i, out_grad, diff+len*i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { + d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); +} + +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) + //total += loss[begin+i]; + total.reduce(loss[begin+i]); + } + total.thread_exit >(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; } -void softmax_cross_entropy_gpu(int n, int len, const float_t* in, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, loss, out); -} - -__global__ void -d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; - d_cross_entropy(len, labels[i], out + len * i, out_grad); - d_softmax(len, out + len * i, out_grad, diff + len * i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const 
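masked_avg_loss above reduces the per-sample losses of the masked range with an HGAccumulator backed by cub::BlockReduce. The same reduction can be illustrated with a single atomicAdd accumulator; the sketch below is self-contained and deliberately untuned, and all names are illustrative rather than part of the patch:

// CUDA sketch of the masked average-loss reduction with a plain atomicAdd.
#include <cstdint>
#include <cuda_runtime.h>

__global__ void masked_loss_sum(size_t begin, size_t end, const uint8_t* masks,
                                const float* loss, float* total) {
  size_t i = begin + blockIdx.x * blockDim.x + threadIdx.x;
  if (i < end && masks[i] == 1)
    atomicAdd(total, loss[i]); // one add per masked sample
}

float masked_avg_loss_sketch(size_t begin, size_t end, size_t count,
                             const uint8_t* d_masks, const float* d_loss) {
  float* d_total;
  cudaMalloc(&d_total, sizeof(float));
  cudaMemset(d_total, 0, sizeof(float));
  size_t n = end - begin;
  masked_loss_sum<<<(n + 255) / 256, 256>>>(begin, end, d_masks, d_loss, d_total);
  float total = 0.0f;
  cudaMemcpy(&total, d_total, sizeof(float), cudaMemcpyDeviceToHost);
  cudaFree(d_total);
  return total / (float)count; // averaged over the masked sample count
}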
float_t* in, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, out, diff); +// the arguments of the maxima +__device__ size_t argmax_device(const size_t n, const float_t *x) { + float_t max = x[0]; + size_t max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, size_t end, mask_t *masks, float_t *preds, label_t *labels, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_accuracy; + CUDA_KERNEL_LOOP(i, end-begin) { + if (masks[begin+i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, preds+(begin+i)*num_classes); + if (pred == labels[begin+i]) total.reduce(1.0); + } + } + total.thread_exit >(local_accuracy); +} + +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels) { + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>(num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6625f283b3..f76ccaeb8a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -100,3 +100,20 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { +#ifdef CPU_ONLY + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); + if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; +#else + return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS-1]->next()->get_data(), context->d_labels); +#endif +} + diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 2458ad8632..050d7bbc69 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -110,18 +110,18 @@ struct CSRGraph { //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); } - inline __device__ __host__ index_type getEdgeDst(unsigned edge) { + __device__ __host__ index_type getEdgeDst(unsigned edge) { assert(edge < nedges); return edge_dst[edge]; }; - inline __device__ __host__ node_data_type getData(unsigned vid) { + __device__ __host__ node_data_type getData(unsigned vid) { return node_data[vid]; } - inline __device__ __host__ index_type edge_begin(unsigned src) { + __device__ __host__ index_type edge_begin(unsigned src) { assert(src <= nnodes); return row_start[src]; }; - inline __device__ __host__ index_type edge_end(unsigned src) { + __device__ __host__ index_type edge_end(unsigned src) { assert(src <= nnodes); return row_start[src+1]; }; From 859862435478f5361815fb9c1e2da1ad5d6dab1f Mon Sep 17 00:00:00 2001 From: Loc 
Hoang Date: Tue, 25 Feb 2020 17:42:43 -0600 Subject: [PATCH 035/660] got back optimizer changes --- libdeepgalois/include/optimizer.h | 122 ++++++------------------------ 1 file changed, 23 insertions(+), 99 deletions(-) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index f1822adc7d..ed8e7654d9 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -15,6 +15,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void update_gpu(const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -46,22 +47,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, - galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate private: float_t eps; @@ -75,15 +62,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, - galois::loopname("rms_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -94,25 +74,14 @@ struct RMSprop : public stateful_optimizer<1> { // http://arxiv.org/abs/1412.6980 struct adam : public stateful_optimizer<2> { adam() - : alpha(0.01), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& mt = get<0>(W); - vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } + : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), + b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} + void update(const vec_t& dW, vec_t& W, bool parallelize); +#ifdef CPU_ONLY + void update_gpu(const float_t* dW, float_t* W) {} +#else + void update_gpu(const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t b1; // decay term @@ -134,20 +103,8 @@ struct adamax : public stateful_optimizer<2> { adamax() : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), 
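The adam::update body that this commit moves out of the header applies the usual Adam rule with bias-corrected first and second moments. A serial sketch of that rule, matching the hyper-parameter names used here (illustrative only):

// Adam update:
//   m <- b1*m + (1-b1)*dW,  v <- b2*v + (1-b2)*dW^2
//   W <- W - alpha * (m / (1-b1_t)) / sqrt(v / (1-b2_t) + eps)
#include <cmath>
#include <cstddef>
#include <vector>

struct adam_state {
  float alpha = 0.01f, b1 = 0.9f, b2 = 0.999f, eps = 1e-8f;
  float b1_t = 0.9f, b2_t = 0.999f; // running powers of b1 and b2
  std::vector<float> m, v;          // first and second moment estimates
};

static void adam_update(adam_state& s, const std::vector<float>& dW,
                        std::vector<float>& W) {
  if (s.m.empty()) { s.m.assign(W.size(), 0.0f); s.v.assign(W.size(), 0.0f); }
  for (size_t i = 0; i < W.size(); i++) {
    s.m[i] = s.b1 * s.m[i] + (1.0f - s.b1) * dW[i];
    s.v[i] = s.b2 * s.v[i] + (1.0f - s.b2) * dW[i] * dW[i];
    W[i] -= s.alpha * (s.m[i] / (1.0f - s.b1_t)) /
            std::sqrt(s.v[i] / (1.0f - s.b2_t) + s.eps);
  }
  s.b1_t *= s.b1; // advance the bias-correction terms once per update
  s.b2_t *= s.b2;
}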
eps(float_t(1e-8)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& mt = get<0>(W); - vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, - galois::loopname("adamax_update")); - b1_t *= b1; - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -158,19 +115,12 @@ struct adamax : public stateful_optimizer<2> { float_t eps; // constant value to avoid zero-division }; -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ +// SGD without momentum +// slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all( - galois::iterate((size_t)0, W.size()), - [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - galois::loopname("gradient_descent_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -185,21 +135,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& dWprev = get<0>(W); - - // for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = - mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, - galois::loopname("momentum_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -217,21 +154,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t& dW, vec_t& W, bool parallelize) { - vec_t& dWprev = get<0>(W); - - // for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = - mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, - galois::loopname("nesterov_momentum_update")); - } + void update(const vec_t& dW, vec_t& W, bool parallelize); + void update_gpu(const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay From e4bb47cfb1b2275e22e36857ca6faa8a5d86abe7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 17:54:01 -0600 Subject: [PATCH 036/660] getting back changes erased by clangformat rebase --- libdeepgalois/include/aggregator.h | 2 - libdeepgalois/include/context.h | 54 ++- libdeepgalois/include/cutils.h | 12 + libdeepgalois/include/layers/layer.h | 113 ++++--- .../include/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/net.h | 45 ++- libdeepgalois/include/node.h | 4 +- libdeepgalois/include/types.h | 2 + libdeepgalois/src/aggregator.cu | 14 +- 
libdeepgalois/src/context.cpp | 95 +----- libdeepgalois/src/layers/graph_conv_layer.cpp | 96 +++--- .../src/layers/softmax_loss_layer.cpp | 36 +- libdeepgalois/src/math_functions.cpp | 37 +- libdeepgalois/src/math_functions.cu | 318 +++++++++--------- libdeepgalois/src/net.cpp | 21 +- libdeepgalois/src/node.cpp | 3 + libdeepgalois/src/node.cu | 2 +- 17 files changed, 425 insertions(+), 430 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 01b1a1e8c8..552925c1bf 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -6,8 +6,6 @@ void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #else #include "graph_gpu.h" -#define TB_SIZE 256 -#define WARP_SIZE 32 void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 688ed9a2a5..47b32d023e 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -3,27 +3,19 @@ #include #include "types.h" #include "utils.h" -#include "lgraph.h" #ifdef CPU_ONLY +#include "lgraph.h" #include "gtypes.h" #else #include "graph_gpu.h" -#endif #include "cutils.h" +#endif class Context { public: Context(); ~Context(); enum Brew { CPU, GPU }; - // static Context& Get(); -#ifndef CPU_ONLY - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } - // static void create_blas_handle(); -#endif Brew mode() { return mode_; } void set_mode(Brew mode) { mode_ = mode; } int solver_count() { return solver_count_; } @@ -39,30 +31,36 @@ class Context { label_t get_label(size_t i) { return labels[i]; } label_t* get_labels_ptr(size_t i) { return &(labels[0]); } float_t* get_in_ptr(); - void degree_counting(); - void norm_factor_counting(); - std::vector labels; // labels for classification: N x 1 - float_t* norm_factor; // normalization constant based on graph structure - std::vector degrees; - vec_t h_feats; // input features: N x D - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D -#ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N - void genGraph(LGraph& lg, Graph& g); + size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); -#else - CSRGraph graph_gpu; // the input graph, |V| = N - label_t* d_labels; // labels on device - float_t* d_norm_factor; // norm_factor on device - float_t* d_feats; // input features on device size_t read_graph_gpu(std::string dataset_str); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); + void norm_factor_counting_gpu(); + + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + label_t* d_labels; // labels on device + vec_t h_feats; // input features: N x D + float_t* d_feats; // input features on device + float_t* norm_factor; // normalization constant based on graph structure + float_t* d_norm_factor; // norm_factor on device + +#ifdef CPU_ONLY + Graph graph_cpu; // the input graph, |V| = N 
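norm_factor declared above is filled by norm_factor_counting with 1/sqrt(deg(v)) per vertex, and 0 for isolated vertices. A serial sketch of that computation from CSR row offsets (illustrative only):

// Per-vertex normalization constants from CSR row offsets: 1/sqrt(degree).
#include <cmath>
#include <cstddef>
#include <vector>

static std::vector<float> norm_factors(const std::vector<size_t>& row_start) {
  size_t n = row_start.size() - 1; // number of vertices
  std::vector<float> norm(n);
  for (size_t v = 0; v < n; v++) {
    float d = std::sqrt((float)(row_start[v + 1] - row_start[v]));
    norm[v] = (d == 0.0f) ? 0.0f : 1.0f / d; // isolated vertices get 0
  }
  return norm;
}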
+ void genGraph(LGraph& lg, Graph& g); +#else + CSRGraph graph_gpu; // the input graph, |V| = N + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #endif protected: @@ -72,8 +70,6 @@ class Context { curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; - // shared_ptr random_generator_; - // Parallel training int solver_count_; int solver_rank_; bool multiprocess_; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index 830a4bbd08..fac2cfaa64 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -13,6 +13,18 @@ inline int CUDA_GET_BLOCKS(const int N) { return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; } +inline unsigned CudaTest(const char* msg) { + cudaError_t e; + // cudaThreadSynchronize(); + cudaDeviceSynchronize(); + if (cudaSuccess != (e = cudaGetLastError())) { + fprintf(stderr, "%s: %d\n", msg, e); + fprintf(stderr, "%s\n", cudaGetErrorString(e)); + exit(-1); + } + return 0; +} + inline const char* cublasGetErrorString(cublasStatus_t error) { switch (error) { case CUBLAS_STATUS_SUCCESS: diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 7b8bbc55a4..609047853a 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,69 +44,76 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = - // 0; virtual void back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + virtual acc_t get_masked_loss() { return acc_t(0); } virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - mask_t *get_device_masks() { return d_masks_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t *masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; + void set_trainable(bool trainable) { trainable_ = trainable; } + bool trainable() const { return trainable_; } + void set_name(std::string name) { name_ = name; } + std::string get_name() { return name_; } + void print_layer_info() { + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, + size_t sample_count, mask_t* masks) { + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; + masks_ = masks; #ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); + copy_masks_device(input_dims[0], masks_, d_masks_); #endif - } - void set_in_data(float_t *data) { - 
prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); - // allocate memory for intermediate feature vectors and gradients - next_->alloc(); - } - void alloc_grad() { - // allocate memory for intermediate gradients - } - void forward() { - //std::cout << name_ << ": forwarding ... "; - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - //std::cout << name_ << ": backwarding ... "; - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - //std::cout << name_ << ": weight updating ... "; - //vec_t diff; - //prev()->merge_grads(&diff); + } + void set_in_data(float_t* data) { + assert(data.size() == input_dims[0] * input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + // + // allocate memory for intermediate features + // prev_->get_data() = data; + // std::copy(data.begin(), data.end(), prev_->get_data()); + // allocate memory for intermediate gradients + // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); + } + void add_edge() { + // add an outgoing edge + next_ = std::make_shared(this, output_dims[0], output_dims[1]); + // allocate memory for intermediate feature vectors and gradients + next_->alloc(); + } + void alloc_grad() { + // allocate memory for intermediate gradients + } + void forward() { + std::cout << name_ << ": forwarding ... "; + forward_propagation(prev()->get_data(), next()->get_data()); + } + void backward() { + std::cout << name_ << ": backwarding ... "; + back_propagation(prev()->get_data(), next()->get_data(), + next()->get_gradient(), prev()->get_gradient()); + } + void update_weight(optimizer* opt) { + std::cout << name_ << ": weight updating ... "; + // vec_t diff; + // prev()->merge_grads(&diff); #ifdef CPU_ONLY - // parallelize only when target size is big enough to mitigate thread spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad + // parallelize only when target size is big enough to mitigate thread + // spawning overhead. 
+ bool parallel = (W.size() >= 512); + opt->update(weight_grad, W, parallel); // W += grad #else - opt->update_gpu(d_weight_grad, d_W); // W += grad + opt->update_gpu(d_weight_grad, d_W); // W += grad #endif - //prev()->clear_grads(); - next()->clear_grads(); - } + // prev()->clear_grads(); + next()->clear_grads(); + } protected: unsigned level_; // layer id: [0, num_layers-1] diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/layers/softmax_loss_layer.h index 0a680a3209..0fa56cf7fe 100644 --- a/libdeepgalois/include/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/layers/softmax_loss_layer.h @@ -12,4 +12,5 @@ class softmax_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + virtual acc_t get_masked_loss(); }; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 9d3e1c1184..87a0e3b72b 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -62,17 +62,17 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. - acc_t fprop(size_t begin, size_t end, size_t count, mask_t *masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } + // forward propagation: [begin, end) is the range of samples used. + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) + layers[i]->forward(); + return layers[num_layers - 1]->get_masked_loss(); + } // back propogation void bprop() { @@ -108,8 +108,27 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks); + + // comparing outputs with the ground truth (labels) + inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; + } }; #endif diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 918b91b86c..8b48e85aa8 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -31,9 +31,7 @@ class node : public std::enable_shared_from_this { class edge { public: edge(node* prev, size_t n, size_t len) - : num_samples_(n), ft_dim_(len), - // data_(vec_t(n*len)), 
grad_(vec_t(n*len)), - data_(NULL), grad_(NULL), prev_(prev) {} + : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); void alloc_gpu(); diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 5890ed307c..387b5f5b60 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -20,5 +20,7 @@ typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test #define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define WARP_SIZE 32 #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index c1f578caa1..3a0288b197 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -32,10 +32,12 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, } } -void update_all(size_t len, CSRGraph &g, const float_t *in, float_t *out, bool norm, const float_t *norm_factor) { - unsigned n = g.nnodes; - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>(n, len, g, in, out, norm, norm_factor); - CudaTest("solving update_all kernel failed"); +void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + std::cout << "[debug]: update_all on GPU\n"; + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + update_all_kernel<<>>( + n, len, g, in, out, norm, norm_factor); + CudaTest("solving update_all kernel failed"); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 04d7c14476..785f4b2d26 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,70 +1,11 @@ #include "context.h" #include "gtypes.h" -#include -#include - -// random seeding -int64_t cluster_seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, " - "using fallback algorithm to generate seed instead."; - if (f) - fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) {} Context::~Context() {} -#else -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; - -Context::Context() - : mode_(Context::GPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) { - // void Context::create_blas_handle() { - CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} - -Context::~Context() { - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } -} - -void Context::SetDevice(const int device_id) { - int current_device; - CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) - return; - CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - 
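update_all above launches update_all_kernel to add each vertex's normalized neighbor features into its output row. A deliberately simplified one-thread-per-vertex CUDA sketch of that aggregation (illustrative only; the patch's kernel is organized around the TB_SIZE/WARP_SIZE constants instead, and the symmetric normalization shown is an assumption):

// Simplified CUDA sketch of normalized neighbor aggregation over a CSR graph.
#include <cuda_runtime.h>

__global__ void aggregate_kernel_sketch(size_t n, size_t len, const int* row_start,
                                        const int* edge_dst, const float* norm,
                                        const float* in, float* out) {
  size_t v = blockIdx.x * blockDim.x + threadIdx.x;
  if (v >= n) return;
  for (int e = row_start[v]; e < row_start[v + 1]; e++) {
    int u = edge_dst[e];
    float s = norm ? norm[v] * norm[u] : 1.0f; // assumed symmetric normalization
    for (size_t f = 0; f < len; f++)
      out[v * len + f] += s * in[u * len + f]; // out assumed zeroed beforehand
  }
}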
CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); -} #endif size_t Context::read_graph(std::string dataset_str) { @@ -113,24 +54,8 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -float_t* Context::get_in_ptr() { return &h_feats[0]; } -#else -size_t Context::read_graph_gpu(std::string dataset_str) { - std::string filename = path + dataset_str + ".csgr"; - graph_gpu.read(filename.c_str(), false); - return graph_gpu.nnodes; -} -void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), - cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); -} -float_t* Context::get_in_ptr() { return d_feats; } +float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif // user-defined pre-computing function, called during initialization @@ -140,25 +65,17 @@ void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&](auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); + auto degree = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); -#endif -} - -void Context::degree_counting() { -#ifdef CPU_ONLY - degrees.resize(n); - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - degrees[v] = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - }, - galois::loopname("DegreeCounting")); +#else + norm_factor_counting_gpu(); #endif } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1ef9be19c1..715fcafd39 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,10 +37,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ - << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); + // std::cout << name_ << ": allocating memory for parameters and intermediate + // data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); @@ -57,27 +57,33 @@ void graph_conv_layer::init() { d_weight_grad); #endif t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; + // std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - // input: x*y; W: y*z; output: x*z - // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i*y], &dropout_mask[i*y], &in_temp[i*y]); - }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(z, &out_data[i*z], &out_data[i*z]); - }, galois::loopname("relu")); - } +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + // input: x*y; W: y*z; output: x*z + // if y > z: mult W first to reduce the feature size for aggregation + // else: aggregate first then mult W (not implemented yet) + if (dropout_ && phase_ == net_phase::train) { + galois::do_all(galois::iterate((size_t)0, x), + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, + galois::loopname("dropout")); + matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + } else + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + if (act_) { + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + galois::loopname("relu")); + } } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -123,29 +129,39 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #else // GPU forward -void graph_conv_layer::forward_propagation(const float_t *in_data, float_t *out_data) { - assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - //std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" << out_data << "\n"; - if (dropout_ && phase_ == net_phase::train) { - dropout_gpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - //aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x*z, out_data, out_data); +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + assert(y <= 128); // currently only support feature length <= 128 + assert(in_data != NULL); + assert(in_temp != NULL); + assert(dropout_mask != NULL); + if (dropout_ && phase_ == net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); + } else + matmul1D1D_gpu(x, z, y, 
in_data, d_W, out_temp); + aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } // GPU backward -void graph_conv_layer::back_propagation(const float_t *in_data, const float_t *out_data, float_t *out_grad, float_t *in_grad) { - if (act_) d_relu_gpu(x*z, out_grad, out_data, out_temp); - else copy_gpu(x*z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(x*y, scale_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_temp); + else + copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_temp); + update_all(y, context->graph_gpu, in_temp, in_grad, true, + context->d_norm_factor); + if (dropout_) + d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 6c29dc9a14..85e81d038c 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,19 +6,19 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); + std::cout << name_ << ": allocating memory for intermediate data... 
"; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif + std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t - // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,8 +35,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } -// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t -// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -56,6 +54,25 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -63,7 +80,14 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); +} + acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); + return masked_avg_loss(begin_, end_, count_, masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 2e2d68f05d..e21bb42396 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -340,41 +340,41 @@ float reduce_mean(const vec_t& x) { } void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out) { - assert(mask.size() == out.size()); - // rng_bernoulli(1. - dropout_rate, mask); // Create random numbers + std::vector& masks, vec_t& out) { + assert(masks.size() == out.size()); + // rng_bernoulli(1. 
- dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out) { + std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out) { + const float_t* in, unsigned* masks, float_t* out) { for (size_t i = 0; i < n; ++i) - mask[i] = bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) - out[i] = in[i] * mask[i] * scale; + out[i] = in[i] * masks[i] * scale; } void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, vec_t& out_diff) { + std::vector& masks, vec_t& out_diff) { for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff) { + unsigned* masks, float_t* out_diff) { for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * mask[i] * scale; + out_diff[i] = in_diff[i] * masks[i] * scale; } float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } @@ -469,9 +469,10 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * // std::log(float_t(1e-10)); else - loss -= - y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - // - p[i]); loss -= y[i] * std::log(p[i]); + loss -= y[i] * + std::log( + p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); + // loss -= y[i] * std::log(p[i]); } return loss; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 70ddd8826d..49fa979e0a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,82 +1,109 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include -void gpu_rng_uniform(const int n, unsigned *r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); +void gpu_rng_uniform(const int n, unsigned* r) { + CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); - const float range = b - a; - if (range != float_t(1)) scal_gpu(n, range, r); - if (a != float_t(0)) add_scalar_gpu(n, a, r); +void gpu_rng_uniform(const int n, const float_t a, const float_t b, + float_t* r) { + CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + const float range = b - a; + if (range != float_t(1)) + scal_gpu(n, range, r); + if (a != float_t(0)) + add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t *r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + 
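The softmax-loss layer normalizes each masked row with softmax, accumulates -log p[label] as the loss, and composes d_cross_entropy with d_softmax for the gradient. For a one-hot label those two Jacobians collapse to the standard form grad = p - onehot(label); a self-contained per-row sketch (illustrative only):

// Per-row softmax, cross-entropy loss, and the fused gradient p - onehot(label).
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

static void softmax_row(const std::vector<float>& x, std::vector<float>& p) {
  float maxv = x[0];
  for (float v : x) maxv = std::max(maxv, v); // subtract the max for stability
  float denom = 0.0f;
  p.resize(x.size());
  for (size_t i = 0; i < x.size(); i++) {
    p[i] = std::exp(x[i] - maxv);
    denom += p[i];
  }
  for (float& v : p) v /= denom;
}

static float cross_entropy_row(const std::vector<float>& p, size_t label) {
  return -std::log(std::max(p[label], 1e-10f)); // clamp to avoid log(0)
}

static void softmax_xent_grad_row(const std::vector<float>& p, size_t label,
                                  std::vector<float>& grad) {
  grad = p;            // dL/dx_i = p_i for i != label
  grad[label] -= 1.0f; // and p_label - 1 for the true class
}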
curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t *&loss) { - CUDA_CHECK(cudaMalloc((void **)&loss, n * sizeof(float_t))); +void loss_malloc_device(int n, float_t* loss) { + CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks) { - assert(h_masks != NULL); - CUDA_CHECK(cudaMalloc((void **)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); +void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { + assert(h_masks != NULL); + CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void **)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void **)&in, x * y * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&out, x * z * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void **)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0/(y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y*z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void **)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned* masks, float_t* in, float_t* out, + float_t* matrix, float_t* grad) { + if (dropout) + CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + gpu_rng_uniform(y * z, -init_range, init_range, matrix); + CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } -__global__ void setup_curand_kernel(const int n, curandState *state) { - CUDA_KERNEL_LOOP(i, n) { - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed - } +__global__ void setup_curand_kernel(const int n, curandState* state) { + CUDA_KERNEL_LOOP(i, n) { + curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + // curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + } } -__global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned *masks, curandState *state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; - masks[i] = 1.0 - dropout_rate; - out[i] = in[i] * masks[i] * scale; - } +__device__ bool bernoulli_gpu(int tid, curandState* state, float_t p) { + curandState local_state = state[tid]; + return curand_uniform(&local_state) <= p; } -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out) { - curandState *devStates; - CUDA_CHECK(cudaMalloc((void **)&devStates, n * sizeof(curandState))); - //std::cout << "[debug]: setup curand, n = " << n << "\n"; - //setup_curand_kernel<<>>(n, devStates); - //CudaTest("solving setup_curand kernel failed"); - //std::cout << "[debug]: dropout_gpu\n"; - dropout_kernel<<>>(n, scale, dropout_rate, in, masks, devStates, out); - CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); - //std::cout << "[debug]: dropout_gpu done\n"; +__global__ void dropout_kernel(const int n, const float scale, + const float dropout_rate, const float_t* in, + unsigned* masks, curandState* state, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = bernoulli_gpu(i, state, dropout_rate); + out[i] = in[i] * masks[i] * scale; + } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { + curandState* devStates; + CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); + std::cout << "[debug]: setup curand, n = " << n << "\n"; + setup_curand_kernel<<>>(n, devStates); + CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); + n, scale, dropout_rate, in, masks, devStates, out); + CudaTest("solving dropout kernel failed"); + CUDA_CHECK(cudaFree(devStates)); + std::cout << "[debug]: dropout_gpu done\n"; +} + +__global__ void d_dropout_kernel(const int n, const float scale, + const float_t* in, const unsigned* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>(n, scale, in, + masks, out); + CudaTest("solving dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -84,10 +111,10 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? 
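// [editor's note] bernoulli_gpu above draws from a local copy of the curand
// state but never writes the advanced state back to state[tid], so repeated
// calls for the same thread would see the same draw. A hedged sketch of a
// state-persisting variant (same acceptance convention as the helper above;
// the name is illustrative only):
__device__ bool bernoulli_persist_sketch(int tid, curandState* state, float_t p) {
  curandState local_state = state[tid];
  bool sampled = curand_uniform(&local_state) <= p;
  state[tid] = local_state; // persist the advanced RNG state for the next call
  return sampled;
}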
in[index] : 0; } } -void relu_gpu(const int n, const float_t *in, float_t* out) { - //std::cout << "[debug]: relu_gpu\n"; - relu_kernel<<>>(n, in, out); - CudaTest("solving relu kernel failed"); +void relu_gpu(const int n, const float_t* in, float_t* out) { + std::cout << "[debug]: relu_gpu\n"; + relu_kernel<<>>(n, in, out); + CudaTest("solving relu kernel failed"); } __global__ void d_relu_kernel(const int n, const float_t* in_diff, @@ -101,6 +128,7 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -117,17 +145,24 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, K, &alpha, B, ldb, A, lda, &beta, C, N)); } -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C) { - //std::cout << "[debug]: matmul1D1D_gpu\n"; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + std::cout << "[debug]: matmul1D1D_gpu\n"; + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, - const float alpha, const float* A, const float* x, const float beta, float* y) { - cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); +// the arguments of the maxima +int argmax_gpu(const size_t n, const float_t* x) { return 0; } + +void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, + const float beta, float* y) { + cublasOperation_t cuTransA = + (TransA == CblasNoTrans) ? 
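// [editor's note] The operand order in sgemm_gpu (B passed before A, output
// leading dimension N) and the inverted transpose flag chosen for cublasSgemv
// (CblasNoTrans maps to CUBLAS_OP_T) are the usual trick for driving
// column-major cuBLAS with row-major buffers: computing B^T * A^T in
// column-major storage yields (A * B)^T, which is exactly A * B laid out
// row-major, so no data is physically transposed.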
CUBLAS_OP_T : CUBLAS_OP_N; + CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { @@ -157,6 +192,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -166,6 +202,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -175,6 +212,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -224,90 +262,70 @@ __device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, const float_t *in_data, - const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked - softmax(len, in_data+len*i, out_data+len*i); // normalize using softmax - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len*i], loss[i]); - } - } -} - -void softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out) { - softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, loss, out); - CudaTest("solving softmax_cross_entropy kernel failed"); -} - -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t *in, - const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out+len*i, out_grad); - d_softmax(len, out+len*i, out_grad, diff+len*i); - } -} - -void d_softmax_cross_entropy_gpu(int n, int len, const float_t *in, const mask_t *masks, const label_t *labels, const float_t *out, float_t *diff) { - d_softmax_cross_entropy_kernel<<>>(n, len, in, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); -} - -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t *masks, float_t *loss, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) - //total += loss[begin+i]; - total.reduce(loss[begin+i]); - } - total.thread_exit >(local_loss); -} - -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss) { - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>(begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; +__global__ void softmax_cross_entropy_kernel(int n, int len, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, n) { + if (masks[i] == 1) { 
// masked + softmax(len, in_data + len * i, + out_data + len * i); // normalize using softmax + loss[i] = 0.0; + cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + } + } } -// the arguments of the maxima -__device__ size_t argmax_device(const size_t n, const float_t *x) { - float_t max = x[0]; - size_t max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, size_t end, mask_t *masks, float_t *preds, label_t *labels, HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_accuracy; - CUDA_KERNEL_LOOP(i, end-begin) { - if (masks[begin+i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, preds+(begin+i)*num_classes); - if (pred == labels[begin+i]) total.reduce(1.0); - } - } - total.thread_exit >(local_accuracy); -} - -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels) { - HGAccumulator accuracy_accum; - Shared total_accuracy = Shared(1); - *(total_accuracy.cpu_wr_ptr()) = 0; - accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); - masked_accuracy_kernel<<>>(num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_accuracy.cpu_rd_ptr()) / count; +void softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, loss, out); + CudaTest("solving softmax_cross_entropy kernel failed"); +} + +__global__ void +d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + CUDA_KERNEL_LOOP(i, n) { + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[i], out + len * i, out_grad); + d_softmax(len, out + len * i, out_grad, diff + len * i); + } +} + +void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + d_softmax_cross_entropy_kernel<<>>( + n, len, in, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy kernel failed"); +} + +__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + // total += loss[begin+i]; + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* loss) { + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index f76ccaeb8a..09267795df 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,11 +1,9 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); - // Context::create_blas_handle(); + context = new 
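// [editor's note] For reference: masked_avg_loss_kernel sums loss[i] over the
// samples in [begin, end) whose mask is set (the division by `count` that
// turns this sum into an average is restored in the follow-up commit further
// below), and masked_accuracy_gpu reports the fraction of masked samples whose
// argmax prediction equals the label, i.e. correct / count.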
Context(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; @@ -100,20 +98,3 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } - -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t *masks) { -#ifdef CPU_ONLY - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS-1]->next()->get_data()[i*num_classes])); - if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; -#else - return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS-1]->next()->get_data(), context->d_labels); -#endif -} - diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index f4278688d1..9b88620d65 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,9 @@ #include "node.h" +#include void edge::alloc() { + // std::cout << "Allocating memory for tensors (intermediate features and + // gradients) ...\n"; #ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 2443e9ed7c..e6d149a540 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -14,5 +14,5 @@ void edge::merge_grads_gpu(float_t* dst) { } void edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, ft_dim_ * num_samples_ * sizeof(float_t))); + CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From c9fb35816c2f4364203426b9b5aba737bfa5f06a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:05:02 -0600 Subject: [PATCH 037/660] set 2 of changes --- libdeepgalois/include/layers/layer.h | 18 ++- libdeepgalois/include/math_functions.hh | 53 +++++--- libdeepgalois/include/net.h | 23 +--- libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 21 ++-- .../src/layers/softmax_loss_layer.cpp | 36 +----- libdeepgalois/src/math_functions.cpp | 7 +- libdeepgalois/src/math_functions.cu | 119 +++++++++++------- libdeepgalois/src/net.cpp | 31 ++++- 9 files changed, 168 insertions(+), 142 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 609047853a..cec1da3665 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,7 +44,9 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } - virtual acc_t get_masked_loss() { return acc_t(0); } + // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = + // 0; virtual void back_propagation(const vec_t &in_data, const vec_t + // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, @@ -54,6 +56,7 @@ class layer : public node { bool trainable() const { return trainable_; } void set_name(std::string 
name) { name_ = name; } std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } void print_layer_info() { std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" @@ -70,16 +73,9 @@ class layer : public node { #endif } void set_in_data(float_t* data) { - assert(data.size() == input_dims[0] * input_dims[1]); prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. - // - // allocate memory for intermediate features - // prev_->get_data() = data; - // std::copy(data.begin(), data.end(), prev_->get_data()); - // allocate memory for intermediate gradients - // prev_->get_gradient().resize(input_dims[0]*input_dims[1]); } void add_edge() { // add an outgoing edge @@ -91,16 +87,16 @@ class layer : public node { // allocate memory for intermediate gradients } void forward() { - std::cout << name_ << ": forwarding ... "; + // std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } void backward() { - std::cout << name_ << ": backwarding ... "; + // std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer* opt) { - std::cout << name_ << ": weight updating ... "; + // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 02afab2c49..61e95ef5b0 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -70,25 +70,42 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t *in, float_t *out); -void vadd_gpu(const int n, const float_t *a, const float_t *b, float_t *out); // vector add -void relu_gpu(const int n, const float_t *in, float_t *out); // ReLU -void d_relu_gpu(const int n, const float_t *in_diff, const float_t *data, float_t *out_diff); // ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t *in, unsigned *masks, float_t *out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t *in, const unsigned *masks, float_t *out); // dropout derivative -void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); -void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t *A, const float_t *B, float_t *C); // matrix multiply -void softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, float_t *loss, float_t *out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t *in_data, const mask_t *masks, const label_t *labels, const float_t *out_data, float_t *diff); -void scal_gpu(const int N, const float alpha, float *X); +void copy_gpu(size_t len, const float_t* in, float_t* out); +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void relu_gpu(const int n, const float_t* in, float_t* 
out); // ReLU +void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, + float_t* out_diff); // ReLU derivative +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out); // dropout +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out); // dropout derivative +void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); +void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply +void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); +void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t *masks, float_t *loss); -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, size_t count, mask_t *masks, float_t *preds, label_t *labels); +acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* loss); +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, + size_t count, mask_t* masks, float_t* preds, + label_t* labels); -void copy_masks_device(int n, mask_t *h_masks, mask_t *&d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out); -void loss_malloc_device(int n, float_t *&loss); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned *&masks, float_t *&in, float_t *&out, float_t *&matrix, float_t *&grad); +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); +void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, + float_t*& in, float_t*& out); +void loss_malloc_device(int n, float_t*& loss); +void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, + unsigned*& masks, float_t*& in, float_t*& out, + float_t*& matrix, float_t*& grad); #endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 87a0e3b72b..4a83caaf88 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -65,7 +65,7 @@ class Net { // forward propagation: [begin, end) is the range of samples used. 
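// [editor's note] In fprop below, the sample mask installed on the last layer
// is what the softmax-loss layer's get_masked_loss() and Net::masked_accuracy
// consume: both skip samples whose mask is 0 and divide by `count` (the number
// of set masks) rather than by (end - begin).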
acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, &masks[0]); + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) @@ -108,27 +108,8 @@ class Net { std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network - // comparing outputs with the ground truth (labels) - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - AccumF accuracy_all; - accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - return accuracy_all.reduce() / (acc_t)count; - } + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); }; #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 3a0288b197..3d6d016363 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -34,8 +34,8 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - std::cout << "[debug]: update_all on GPU\n"; unsigned n = g.nnodes; + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>( n, len, g, in, out, norm, norm_factor); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 715fcafd39..d53a75e53a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,10 +37,10 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { + std::cout << name_ + << ": allocating memory for parameters and intermediate data... "; Timer t_alloc; t_alloc.Start(); - // std::cout << name_ << ": allocating memory for parameters and intermediate - // data... 
"; #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); @@ -57,7 +57,7 @@ void graph_conv_layer::init() { d_weight_grad); #endif t_alloc.Stop(); - // std::cout << "Done, time: " << t_alloc.Millisecs() << " ms\n"; + std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } #ifdef CPU_ONLY @@ -76,8 +76,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z - aggregate(z, context->graph_cpu, out_temp, out_data); // aggregate + matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + aggregate(z, context->graph_cpu, out_temp, out_data); if (act_) { galois::do_all( galois::iterate((size_t)0, x), @@ -135,12 +135,15 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, assert(in_data != NULL); assert(in_temp != NULL); assert(dropout_mask != NULL); + // std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", + // dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" + // << out_data << "\n"; if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); + // aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); } @@ -156,10 +159,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - update_all(y, context->graph_gpu, in_temp, in_grad, true, - context->d_norm_factor); + // update_all(y, context->graph_gpu, in_temp, in_grad, true, + // context->d_norm_factor); if (dropout_) - d_dropout_gpu(y, scale_, in_grad, dropout_mask, in_grad); + d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 85e81d038c..a953dd5f1e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -6,19 +6,19 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - std::cout << name_ << ": allocating memory for intermediate data... 
"; #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + out_malloc_device(in_dims[0], masks_, d_masks_, loss); #endif - std::cout << "Done\n"; } #ifdef CPU_ONLY // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t + // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,6 +35,8 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } +// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t +// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -54,25 +56,6 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } - -acc_t softmax_loss_layer::get_masked_loss() { - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; -} - #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -80,14 +63,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, out_data, in_grad); -} - acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, masks_, loss); + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index e21bb42396..3acc213d5e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -469,10 +469,9 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * // std::log(float_t(1e-10)); else - loss -= y[i] * - std::log( - p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - // loss -= y[i] * std::log(p[i]); + loss -= + y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) + // - p[i]); loss -= y[i] * std::log(p[i]); } return loss; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 49fa979e0a..e098922ba1 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,9 +1,5 @@ #include "math_functions.hh" #include "context.h" -#include "gg.h" -#include "ggcuda.h" -#include "cub/cub.cuh" -#include void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -25,11 +21,11 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, 
curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t* loss) { +void loss_malloc_device(int n, float_t*& loss) { CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } -void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { +void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); CUDA_CHECK( @@ -37,8 +33,8 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t* d_masks) { } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned* masks, float_t* in, float_t* out, - float_t* matrix, float_t* grad) { + unsigned*& masks, float_t*& in, float_t*& out, + float_t*& matrix, float_t*& grad) { if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); @@ -51,29 +47,27 @@ void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void copy_gpu(size_t len, const float_t* in, float_t* out) { - CUDA_CHECK( - cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, + float_t* r) { + CURAND_CHECK( + curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { - curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - // curand_init(7+i, i, 0, &state[i]); // Each thread gets different seed + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + curand_init(7 + i, i, 0, &state[i]); // Each thread gets different seed } } -__device__ bool bernoulli_gpu(int tid, curandState* state, float_t p) { - curandState local_state = state[tid]; - return curand_uniform(&local_state) <= p; -} - __global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, curandState* state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - masks[i] = bernoulli_gpu(i, state, dropout_rate); + // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + // masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; + masks[i] = 1.0 - dropout_rate; out[i] = in[i] * masks[i] * scale; } } @@ -82,28 +76,21 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - std::cout << "[debug]: setup curand, n = " << n << "\n"; - setup_curand_kernel<<>>(n, devStates); - CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; + // std::cout << "[debug]: setup curand, n = " << n << "\n"; + // setup_curand_kernel<<>>(n, + // devStates); CudaTest("solving setup_curand kernel failed"); std::cout << + // "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; + // std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, - const float_t* in, const unsigned* masks, - float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } -} - -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out) { - d_dropout_kernel<<>>(n, scale, in, - masks, out); - CudaTest("solving dropout kernel failed"); +void dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -112,7 +99,7 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t* in, float_t* out) { - std::cout << "[debug]: relu_gpu\n"; + // std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); CudaTest("solving relu kernel failed"); } @@ -128,7 +115,6 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); - CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -147,15 +133,12 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - std::cout << "[debug]: matmul1D1D_gpu\n"; + // std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// the arguments of the maxima -int argmax_gpu(const size_t n, const float_t* x) { return 0; } - void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { @@ -192,7 +175,6 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); - CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -202,7 +184,6 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); - CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -212,7 +193,6 @@ __global__ void vadd_kernel(const int n, 
const float_t* a, const float_t* b, void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); } // TODO: use warp @@ -308,13 +288,13 @@ __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, float_t* loss, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; + __shared__ cub::BlockReduce::TempStorage local_loss; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) // total += loss[begin+i]; total.reduce(loss[begin + i]); } - total.thread_exit>(local_loss); + total.thread_exit>(local_loss); } acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, @@ -327,5 +307,50 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, begin, end, masks, loss, loss_accum); CudaTest("solving masked_avg_loss kernel failed"); cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()); + return *(total_loss.cpu_rd_ptr()) / count; +} + +// the arguments of the maxima +__device__ size_t argmax_device(const size_t n, const float_t* x) { + float_t max = x[0]; + size_t max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + +__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, + size_t end, mask_t* masks, + float_t* preds, label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, + size_t count, mask_t* masks, float_t* preds, + label_t* labels) { + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 09267795df..41393e6f13 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,9 +1,11 @@ #include "net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { - context = new Context(); + context = new Context(); + // Context::create_blas_handle(); num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); + context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; @@ -98,3 +100,30 @@ void Net::construct_layers() { layers[0]->set_in_data(context->get_in_ptr()); // feed input data set_contexts(); } + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { +#ifdef CPU_ONLY + AccumF accuracy_all; + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), + [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1] + ->next() + ->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), + 
galois::loopname("getMaskedLoss")); + return accuracy_all.reduce() / (acc_t)count; +#else + return masked_accuracy_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); +#endif +} From 6336fe34b4fece4f2618fd534529c0e6451ca896 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:16:21 -0600 Subject: [PATCH 038/660] softmax missing functions --- .../src/layers/softmax_loss_layer.cpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index a953dd5f1e..66ce404a18 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -56,6 +56,27 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); } + +acc_t softmax_loss_layer::get_masked_loss() { + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(begin_, end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} + + + #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -63,6 +84,13 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, context->d_labels, loss, out_data); } +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + context->d_labels, out_data, in_grad); +} + acc_t softmax_loss_layer::get_masked_loss() { return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } From 0339aa284f5934bd7079350dba60abc1c91b0fa2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:20:25 -0600 Subject: [PATCH 039/660] remove degree counting from net.cpp --- libdeepgalois/src/net.cpp | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 41393e6f13..ddd6df4afa 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,15 +3,18 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); // Context::create_blas_handle(); + // read graph, get num nodes num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); - context->degree_counting(); context->norm_factor_counting(); // pre-compute normalizing factor + num_epochs = epochs; std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); + + // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -25,9 +28,11 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } + std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; + // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D @@ -42,7 +47,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { } void Net::train(optimizer* opt, bool need_validate) { - std::cout << "\nStart training...\n"; + galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); From 8038b6bdd485d7e9b22a0817174b33be833fb520 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:03:41 -0600 Subject: [PATCH 040/660] fix merge errors --- libdeepgalois/include/layers/layer.h | 1 + .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 31 ++++++++++++------- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index cec1da3665..68260c034a 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -44,6 +44,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} virtual void set_context(Context* ctx) { context = ctx; } + virtual acc_t get_masked_loss() { return acc_t(0); } // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = // 0; virtual void back_propagation(const vec_t &in_data, const vec_t // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 66ce404a18..dbebe73f44 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,7 +9,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, #ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample #else - out_malloc_device(in_dims[0], masks_, d_masks_, loss); + loss_malloc_device(in_dims[0], loss); #endif } #ifdef CPU_ONLY diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e098922ba1..c507ee313b 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,9 @@ #include "math_functions.hh" #include "context.h" +#include "gg.h" +#include "ggcuda.h" +#include "cub/cub.cuh" +#include void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); @@ -47,12 +51,6 @@ void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); -} - __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 
1234 @@ -77,8 +75,8 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); // std::cout << "[debug]: setup curand, n = " << n << "\n"; - // setup_curand_kernel<<>>(n, - // devStates); CudaTest("solving setup_curand kernel failed"); std::cout << + // setup_curand_kernel<<>>(n, devStates); + // CudaTest("solving setup_curand kernel failed"); std::cout << // "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); @@ -87,10 +85,15 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, // std::cout << "[debug]: dropout_gpu done\n"; } -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); +__global__ void d_dropout_kernel(const int n, const float scale, + const float_t* in, const unsigned* masks, + float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } +} + +void d_dropout_gpu(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>(n, scale, in, masks, out); } // flattern data into 1D before feed into the ReLU operater @@ -191,6 +194,10 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } +void copy_gpu(size_t len, const float_t* in, float_t* out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +} + void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); } From 6ec8a06b362c9cfd1ea8c3cb942b7767d0237537 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:30:47 -0600 Subject: [PATCH 041/660] refine --- .../src/layers/softmax_loss_layer.cpp | 7 ----- libdeepgalois/src/math_functions.cpp | 10 +------ libdeepgalois/src/math_functions.cu | 4 +++ libdeepgalois/src/net.cpp | 29 +++++++------------ 4 files changed, 15 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index dbebe73f44..0cd9547250 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -17,8 +17,6 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - // void softmax_loss_layer::forward_propagation(const vec_t &in_data, vec_t - // &out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -35,8 +33,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::loopname("softmax-loss-fw")); } -// void softmax_loss_layer::back_propagation(const vec_t &in_data, const vec_t -// &out_data, vec_t &out_grad, vec_t &in_grad) { void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { @@ -74,9 +70,6 @@ acc_t softmax_loss_layer::get_masked_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } - - - #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { diff --git a/libdeepgalois/src/math_functions.cpp 
b/libdeepgalois/src/math_functions.cpp index 3acc213d5e..9914fd68d5 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -466,12 +466,7 @@ float_t cross_entropy(const vec_t& y, const vec_t& p) { continue; if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - // if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * - // std::log(float_t(1e-10)); - else - loss -= - y[i] * std::log(p[i]); // + (float_t(1) - y[i]) * std::log(float_t(1) - // - p[i]); loss -= y[i] * std::log(p[i]); + else loss -= y[i] * std::log(p[i]); } return loss; } @@ -491,11 +486,8 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { auto n = y.size(); - // for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - - // p[i])); for (size_t i = 0; i < n; i++) { d[i] = -y[i] / (p[i] + float_t(1e-10)); - // d[i] = p[i] - y[i]; } } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c507ee313b..28e65e149d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -118,6 +118,7 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { d_relu_kernel<<>>(n, in_diff, data, out_diff); + CudaTest("solving d_relu kernel failed"); } void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, @@ -178,6 +179,7 @@ void set_gpu(const int N, const float_t alpha, float_t* Y) { return; } set_kernel<<>>(N, alpha, Y); + CudaTest("solving set kernel failed"); } __global__ void add_scalar_kernel(const int n, const float_t alpha, @@ -187,6 +189,7 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { add_scalar_kernel<<>>(N, alpha, Y); + CudaTest("solving add_scalar kernel failed"); } __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, @@ -200,6 +203,7 @@ void copy_gpu(size_t len, const float_t* in, float_t* out) { void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { vadd_kernel<<>>(N, a, b, y); + CudaTest("solving vadd kernel failed"); } // TODO: use warp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ddd6df4afa..9b78853833 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,18 +2,15 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context = new Context(); - // Context::create_blas_handle(); // read graph, get num nodes num_samples = context->read_graph(dataset_str); num_classes = context->read_labels(dataset_str); context->norm_factor_counting(); // pre-compute normalizing factor - num_epochs = epochs; std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); - // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -28,7 +25,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } - std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; @@ -41,8 +37,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY - context - ->copy_data_to_device(); // copy labels and input features to the device + context->copy_data_to_device(); // copy labels and input features to the device #endif } @@ -111,19 +106,15 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, #ifdef CPU_ONLY AccumF accuracy_all; accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), - [&](const auto& i) { - if (masks[i] == 1) { - int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1] - ->next() - ->get_data()[i * num_classes])); - if ((label_t)preds == context->get_label(i)) - accuracy_all += 1.0; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + if ((label_t)preds == context->get_label(i)) + accuracy_all += 1.0; + } + }, + galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; #else return masked_accuracy_gpu(num_classes, begin, end, count, From 969ae109d5e14818507365b8078bdec2bae2fb6e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 19:56:38 -0600 Subject: [PATCH 042/660] refine code --- libdeepgalois/src/aggregator.cu | 10 +++----- libdeepgalois/src/math_functions.cu | 39 ++++++++++++----------------- libdeepgalois/src/optimizer.cpp | 26 +++++++++---------- libdeepgalois/src/optimizer.cu | 3 ++- 4 files changed, 34 insertions(+), 44 deletions(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 3d6d016363..c5ed6e0817 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -13,19 +13,17 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, } __global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, - const float_t* in, float_t* out, bool norm, - const float_t* norm_factor) { + const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { float_t a = 0.0, b = 1.0; - if (norm) - a = norm_factor[src]; + if (norm) a = norm_factor[src]; index_type begin = g.edge_begin(src); index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); assert(dst < n); - if (norm) - b = a * norm_factor[dst]; + if (norm) b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 28e65e149d..174cd1b36a 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -9,8 +9,7 @@ void gpu_rng_uniform(const int n, unsigned* r) { 
CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, - float_t* r) { +void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) @@ -19,10 +18,8 @@ void gpu_rng_uniform(const int n, const float_t a, const float_t b, add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, - float_t* r) { - CURAND_CHECK( - curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { + CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } void loss_malloc_device(int n, float_t*& loss) { @@ -32,15 +29,13 @@ void loss_malloc_device(int n, float_t*& loss) { void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { - if (dropout) - CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); + if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); @@ -60,11 +55,11 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { __global__ void dropout_kernel(const int n, const float scale, const float dropout_rate, const float_t* in, - unsigned* masks, curandState* state, - float_t* out) { + unsigned* masks, curandState* state, float_t* out) { CUDA_KERNEL_LOOP(i, n) { - // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - // masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; + // curandState_t curand_state; + //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 + //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 
1 : 0; masks[i] = 1.0 - dropout_rate; out[i] = in[i] * masks[i] * scale; } @@ -74,20 +69,19 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { curandState* devStates; CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - // std::cout << "[debug]: setup curand, n = " << n << "\n"; - // setup_curand_kernel<<>>(n, devStates); - // CudaTest("solving setup_curand kernel failed"); std::cout << - // "[debug]: dropout_gpu\n"; + //std::cout << "[debug]: setup curand, n = " << n << "\n"; + //setup_curand_kernel<<>>(n, devStates); + //CudaTest("solving setup_curand kernel failed"); + std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - // std::cout << "[debug]: dropout_gpu done\n"; + std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, - const float_t* in, const unsigned* masks, - float_t* out) { +__global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, + const unsigned* masks, float_t* out) { CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } @@ -102,7 +96,6 @@ __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { } void relu_gpu(const int n, const float_t* in, float_t* out) { - // std::cout << "[debug]: relu_gpu\n"; relu_kernel<<>>(n, in, out); CudaTest("solving relu kernel failed"); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index fb10221f19..b076df561f 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -21,26 +21,24 @@ void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, - galois::loopname("rms_update")); + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, galois::loopname("rms_update")); } void adam::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 908ce4f32a..a936999d3e 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,3 +1,4 @@ #include "optimizer.h" -void adam::update_gpu(const float_t* dW, float_t* W) {} +void adam::update_gpu(const float_t* dW, float_t* W) { +} From 
9e4c91ea42aac12a1d36664d7ac0ead74e3af363 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 20:27:32 -0600 Subject: [PATCH 043/660] add adam gpu --- libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/optimizer.h | 18 +++++----- libdeepgalois/src/math_functions.cu | 4 +-- libdeepgalois/src/optimizer.cpp | 49 +++++++++++++--------------- libdeepgalois/src/optimizer.cu | 25 +++++++++++++- 5 files changed, 58 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 68260c034a..bd8f67fa07 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -106,7 +106,7 @@ class layer : public node { bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad #else - opt->update_gpu(d_weight_grad, d_W); // W += grad + opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); next()->clear_grads(); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index ed8e7654d9..d9f8de9116 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -15,7 +15,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; - virtual void update_gpu(const float_t* dW, float_t* W) = 0; + virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -48,7 +48,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate private: float_t eps; @@ -63,7 +63,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t mu; // decay term private: @@ -78,9 +78,9 @@ struct adam : public stateful_optimizer<2> { b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); #ifdef CPU_ONLY - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} #else - void update_gpu(const float_t* dW, float_t* W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -104,7 +104,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t b1; // decay term @@ -120,7 +120,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void 
update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -136,7 +136,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay @@ -155,7 +155,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W) {} float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 174cd1b36a..3bdcbd2607 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -72,12 +72,12 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, //std::cout << "[debug]: setup curand, n = " << n << "\n"; //setup_curand_kernel<<>>(n, devStates); //CudaTest("solving setup_curand kernel failed"); - std::cout << "[debug]: dropout_gpu\n"; + //std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( n, scale, dropout_rate, in, masks, devStates, out); CudaTest("solving dropout kernel failed"); CUDA_CHECK(cudaFree(devStates)); - std::cout << "[debug]: dropout_gpu done\n"; + //std::cout << "[debug]: dropout_gpu done\n"; } __global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index b076df561f..0ec40cf4d0 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -5,11 +5,10 @@ void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); if (parallelize) { galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, - galois::loopname("adagrad_update")); + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); } else { for (size_t i = 0; i < W.size(); i++) { g[i] += dW[i] * dW[i]; @@ -47,41 +46,37 @@ void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, - galois::loopname("adamax_update")); + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); b1_t *= b1; } void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all( - galois::iterate((size_t)0, W.size()), + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - 
galois::loopname("gradient_descent_update")); + galois::loopname("gradient_descent_update")); } void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& dWprev = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, - galois::loopname("momentum_update")); + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); } void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& dWprev = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, - galois::loopname("nesterov_momentum_update")); + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); } diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index a936999d3e..7d718ea865 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,27 @@ #include "optimizer.h" +#include "cutils.h" +#include "math_functions.hh" -void adam::update_gpu(const float_t* dW, float_t* W) { +__global__ void update_kernel(const int n, float_t alpha, float_t b1, + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { + CUDA_KERNEL_LOOP(i, n) { + mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; + vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; + W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / + std::sqrt((vt[i] / (1.0 - b2_t)) + eps); + } +} + +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + float_t * W1, *W2; + CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); + copy_gpu(n, W, W1); + copy_gpu(n, W, W2); + update_kernel<<>>( + n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); + b1_t *= b1; + b2_t *= b2; } From e225e2ca620be4f7051c4562515a291b9c69d4a7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 21:09:17 -0600 Subject: [PATCH 044/660] refine graph_conv_layer.cpp --- libdeepgalois/src/layers/graph_conv_layer.cpp | 95 +++++++------------ 1 file changed, 33 insertions(+), 62 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d53a75e53a..6ab8662101 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,18 +1,15 @@ #include "layers/graph_conv_layer.h" #ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); #else -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, - float_t* out) { +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->d_norm_factor); #endif } -void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, - vec_t& out) { +void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { vec_t 
a(out.size(), 0); vec_t b(out.size(), 0); mvmul(Q, self, a); @@ -37,8 +34,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ - << ": allocating memory for parameters and intermediate data... "; + std::cout << name_ << ": allocating memory for params and temp data... "; Timer t_alloc; t_alloc.Start(); #ifdef CPU_ONLY @@ -48,13 +44,11 @@ void graph_conv_layer::init() { if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; - out_temp = new float_t - [x * z]; // same as pre_sup in original GCN code: + out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, - d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); #endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -62,18 +56,16 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, - galois::loopname("dropout")); + [&](const auto& i) { + dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z @@ -90,20 +82,14 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - // void graph_conv_layer::back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) { if (act_) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - for (size_t j = 0; j < z; - ++j) // TODO: use in_data or out_data? - out_temp[i * z + j] = out_data[i * z + j] > float_t(0) - ? out_grad[i * z + j] - : float_t(0); - }, - galois::loopname("d_relu")); - } else - copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + [&](const auto& i) { + for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? + out_temp[i * z + j] = out_data[i * z + j] > float_t(0) + ? 
out_grad[i * z + j] : float_t(0); + }, galois::loopname("d_relu")); + } else copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix @@ -114,12 +100,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i * y], - &dropout_mask[i * y], &in_grad[i * y]); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("d_dropout")); + [&](const auto& i) { + d_dropout(y, scale_, &in_grad[i * y], + &dropout_mask[i * y], &in_grad[i * y]); + }, galois::chunk_size(), galois::steal(), + galois::loopname("d_dropout")); } } // calculate weight gradients @@ -128,43 +113,29 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } #else -// GPU forward +// GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 - assert(in_data != NULL); - assert(in_temp != NULL); - assert(dropout_mask != NULL); - // std::cout << "in_data=" << in_data << ", in_temp=" << in_temp << ", - // dropout_mask=" << dropout_mask << ", out_temp=" << out_temp << ", out_data=" - // << out_data << "\n"; if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else - matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - // aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) - relu_gpu(x * z, out_data, out_data); + } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + //aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x * z, out_data, out_data); } -// GPU backward +// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) - d_relu_gpu(x * z, out_grad, out_data, out_temp); - else - copy_gpu(x * z, out_grad, out_temp); + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); + else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, - in_temp); - // update_all(y, context->graph_gpu, in_temp, in_grad, true, - // context->d_norm_factor); - if (dropout_) - d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, - d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif From 1ec2fb9f39f0d5dafc91fdad489cd3a795b2777c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 21:25:09 -0600 Subject: [PATCH 045/660] refine softmax_loss_layer.cpp --- .../src/layers/softmax_loss_layer.cpp | 61 +++++++++---------- libdeepgalois/src/math_functions.cu | 28 +++------ 2 files changed, 38 insertions(+), 51 deletions(-) diff --git 
a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 0cd9547250..1c305827ac 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -19,18 +19,16 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(len, &in_data[len * i], - &out_data[len * i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len * i]); - } - }, - galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-fw")); + [&](const auto& i) { + if (masks_[i] == 1) { // masked + softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // y is a one hot encoded vector for the labels + std::vector y(output_dims[1], 0.0); // ground truth + y[context->get_label(i)] = 1.0; // one-hot + loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); + } + }, galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-fw")); } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -38,19 +36,17 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len * i], - &norm_grad[0]); - d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); - } - }, - galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-bw")); + [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector y(len, 0.0); // ground truth + y[context->get_label(i)] = 1.0; + d_cross_entropy(len, &y[0], &out_data[len * i], &norm_grad[0]); + d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size(), galois::steal(), + galois::loopname("softmax-loss-bw")); } acc_t softmax_loss_layer::get_masked_loss() { @@ -59,14 +55,13 @@ acc_t softmax_loss_layer::get_masked_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, - galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 3bdcbd2607..53ff024872 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -215,8 +215,7 @@ __device__ void softmax(int n, const float_t* input, float_t* output) { } // TODO: use warp -__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, - float_t* dy) { +__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, float_t* dy) { for 
(size_t i = 0; i < n; i++) { dy[i] = 0; for (size_t j = 0; j < n; j++) { @@ -226,21 +225,15 @@ __device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, } } -__device__ void cross_entropy(int n, const label_t idx, const float_t* p, - float_t& loss) { - if (p[idx] == 0.0) - loss -= log(float_t(1e-10)); - else - loss -= log(p[idx]); +__device__ void cross_entropy(int n, const label_t idx, const float_t* p, float_t& loss) { + if (p[idx] == 0.0) loss -= log(float_t(1e-10)); + else loss -= log(p[idx]); } -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, - float_t* d) { +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { for (int i = 0; i < n; i++) - if (i == (int)idx) - d[i] = -1.0 / (p[i] + 1e-10); - else - d[i] = 0.0; + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; } // n: number of vectors @@ -253,8 +246,8 @@ __global__ void softmax_cross_entropy_kernel(int n, int len, float_t* loss, float_t* out_data) { CUDA_KERNEL_LOOP(i, n) { if (masks[i] == 1) { // masked - softmax(len, in_data + len * i, - out_data + len * i); // normalize using softmax + // normalize using softmax + softmax(len, in_data + len * i, out_data + len * i); loss[i] = 0.0; cross_entropy(len, labels[i], &out_data[len * i], loss[i]); } @@ -269,8 +262,7 @@ void softmax_cross_entropy_gpu(int n, int len, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } -__global__ void -d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, +__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { CUDA_KERNEL_LOOP(i, n) { From 9a5722e5b93c5f1a7d919ca3b61e5923bc9262a4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 22:15:25 -0600 Subject: [PATCH 046/660] refine src/context.cpp --- libdeepgalois/include/layers/layer.h | 3 ++- libdeepgalois/src/context.cpp | 17 +++++++---------- libdeepgalois/src/optimizer.cu | 4 ++++ 3 files changed, 13 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index bd8f67fa07..438ee45993 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -97,15 +97,16 @@ class layer : public node { next()->get_gradient(), prev()->get_gradient()); } void update_weight(optimizer* opt) { - // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY + // std::cout << name_ << ": weight updating ... "; // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
bool parallel = (W.size() >= 512); opt->update(weight_grad, W, parallel); // W += grad #else + std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 785f4b2d26..aab3e1c3cd 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -64,16 +64,13 @@ void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) - norm_factor[v] = 0.0; - else - norm_factor[v] = 1.0 / temp; - }, - galois::loopname("NormCounting")); + [&](auto v) { + auto degree = std::distance(graph_cpu.edge_begin(v), + graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); #else norm_factor_counting_gpu(); #endif diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 7d718ea865..e58c641245 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,3 +1,4 @@ +#include #include "optimizer.h" #include "cutils.h" #include "math_functions.hh" @@ -15,6 +16,7 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { + std::cout << "updating weights on GPU, n = " << n << "\n"; float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); @@ -24,4 +26,6 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); b1_t *= b1; b2_t *= b2; + CUDA_CHECK(cudaFree(W1)); + CUDA_CHECK(cudaFree(W2)); } From 8360c38f6baee35524584d1b3d9fe6455e7169ca Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 25 Feb 2020 23:45:18 -0600 Subject: [PATCH 047/660] fix bug in aggregator --- libdeepgalois/include/cutils.h | 9 +++++++++ libdeepgalois/src/aggregator.cu | 2 +- libdeepgalois/src/context.cu | 1 + libdeepgalois/src/layers/graph_conv_layer.cpp | 9 +++++++-- libdeepgalois/src/optimizer.cu | 4 ++-- 5 files changed, 20 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index fac2cfaa64..c817863242 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -4,6 +4,7 @@ #include #include #include +#include // CUDA: use 256 threads per block const int CUDA_NUM_THREADS = 256; @@ -127,3 +128,11 @@ inline const char* curandGetErrorString(curandStatus_t error) { // CUDA: check for error after kernel execution and exit loudly if there is one. 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) + +inline void print_device_vector(size_t n, const float_t *d_x, std::string name = "x") { + float_t *h_x = new float_t[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete h_x; +} + diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index c5ed6e0817..885660e973 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -12,7 +12,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph& g, +__global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index b68f07ab98..0a63bb40bd 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -96,6 +96,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); + print_device_vector(10, d_feats, "d_feats"); } float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 6ab8662101..073ba9eb76 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -121,8 +121,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); - //aggregate(z, context->graph_gpu, out_temp, out_data); + aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); + std::cout << "Forward " << name_ << ":\n"; + print_device_vector(10, in_data, "in_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -133,9 +135,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); - //update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } + std::cout << "Backward " << name_ << ":\n"; + print_device_vector(10, in_data, "in_data"); + print_device_vector(10, out_temp, "out_temp"); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index e58c641245..ee9ff3b8d4 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,4 +1,3 @@ -#include #include "optimizer.h" #include "cutils.h" #include "math_functions.hh" @@ -16,7 +15,8 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - std::cout << updating weights on GPU, n = " << n << "\n"; + std::cout << "updating 
weights on GPU, n = " << n << "\n"; + print_device_vector(10, dW, "dW"); float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); From 8daaf898542ddb2a3b521e2bfb034f905d7fd846 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 11:06:17 -0600 Subject: [PATCH 048/660] refine gpu operators --- libdeepgalois/include/math_functions.hh | 32 ++-- libdeepgalois/src/aggregator.cu | 4 +- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 23 ++- .../src/layers/softmax_loss_layer.cpp | 8 +- libdeepgalois/src/math_functions.cu | 164 ++++++++++++------ libdeepgalois/src/optimizer.cu | 4 +- 7 files changed, 150 insertions(+), 87 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 61e95ef5b0..ef313815a7 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -53,24 +53,26 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); -void dropout(size_t n, const float scale, const float dropout_rate, +void dropout(int n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t* in_diff, +void d_dropout(int n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); -void softmax(size_t n, const float_t* input, float_t* output); +void softmax(int n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(int n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +float_t cross_entropy(int n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); +void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); -void copy_gpu(size_t len, const float_t* in, float_t* out); +// GPU operators +void init_const_gpu(int n, float_t value, float_t *array); +void copy_gpu(int len, const float_t* in, float_t* out); void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU @@ -83,29 +85,27 @@ void d_dropout_gpu(const int n, const float scale, const float_t* in, void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply -void softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, +void softmax_cross_entropy_gpu(int len, int 
begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); -void d_softmax_cross_entropy_gpu(int x, int y, const float_t* in_data, +void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, - size_t count, mask_t* masks, float_t* preds, +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, + int count, mask_t* masks, float_t* preds, label_t* labels); - void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, - float_t*& in, float_t*& out); void loss_malloc_device(int n, float_t*& loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); - #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 885660e973..f0c06722b6 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -22,7 +22,6 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); - assert(dst < n); if (norm) b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] @@ -33,7 +32,8 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); update_all_kernel<<>>( n, len, g, in, out, norm, norm_factor); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 0a63bb40bd..647e010f60 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -96,7 +96,7 @@ void Context::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); - print_device_vector(10, d_feats, "d_feats"); + //print_device_vector(10, d_feats, "d_feats"); } float_t* Context::get_in_ptr() { return d_feats; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 073ba9eb76..25b06417bb 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -30,6 +30,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); init(); + assert(dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -117,14 +118,19 @@ void graph_conv_layer::back_propagation(const float_t* in_data, void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 + //if (level_ == 0) print_device_vector(20, in_data, "in_data"); + //if (level_ == 0) print_device_vector(20, d_W, "W"); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - matmul1D1D_gpu(x, z, y, in_temp, d_W, out_temp); - } else matmul1D1D_gpu(x, z, y, in_data, d_W, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + //copy_gpu(x*y, in_data, in_temp); + //matmul_gpu(x, z, y, in_temp, d_W, out_temp); + } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); + //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); - std::cout << "Forward " << name_ << ":\n"; - print_device_vector(10, in_data, "in_data"); + //std::cout << "Forward " << name_ << ":\n"; + //print_device_vector(20, out_data, "out_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -138,9 +144,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } - std::cout << "Backward " << name_ << ":\n"; - print_device_vector(10, in_data, "in_data"); - print_device_vector(10, out_temp, "out_temp"); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); + if (level_ == 0) { + std::cout << "Backward " << name_ << ":\n"; + print_device_vector(20, in_data, "in_data"); + print_device_vector(20, out_temp, "out_temp"); + print_device_vector(20, d_weight_grad, "dW"); + } } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 1c305827ac..8457d1255a 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -50,6 +50,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_masked_loss() { + assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; total_loss.reset(); @@ -68,14 +69,15 @@ acc_t softmax_loss_layer::get_masked_loss() { #else // GPU implementation void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, - context->d_labels, loss, out_data); + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[0], input_dims[1], in_data, d_masks_, + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 53ff024872..5d12d04986 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -32,6 +32,17 @@ void copy_masks_device(int n, mask_t* h_masks, 
mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } +__global__ void init_const_kernel(int n, float_t value, float_t *array) { + CUDA_KERNEL_LOOP(i, n) { + array[i] = value; + } +} + +void init_const_gpu(int n, float_t value, float_t *array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { @@ -60,8 +71,10 @@ __global__ void dropout_kernel(const int n, const float scale, // curandState_t curand_state; //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; - masks[i] = 1.0 - dropout_rate; - out[i] = in[i] * masks[i] * scale; + //masks[i] = 1.0 - dropout_rate; + //out[i] = in[i] * masks[i] * scale; + masks[i] = 1.0; + out[i] = in[i]; } } @@ -82,12 +95,14 @@ void dropout_gpu(const int n, const float scale, const float dropout_rate, __global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, const unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } + //CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i]; } } void d_dropout_gpu(const int n, const float scale, const float_t* in, const unsigned* masks, float_t* out) { d_dropout_kernel<<>>(n, scale, in, masks, out); + CudaTest("solving d_dropout kernel failed"); } // flattern data into 1D before feed into the ReLU operater @@ -114,6 +129,28 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, CudaTest("solving d_relu kernel failed"); } +__global__ void matmul_kernel(int x, int y, int z, const float_t* A, + const float_t* B, float_t* C) { + int row = blockIdx.x*blockDim.x+threadIdx.x; + int col = blockIdx.y*blockDim.y+threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; +} + +#define TILE_SZ 16 +void matmul_gpu(const size_t x, const size_t y, const size_t z, + const float_t* A, const float_t* B, float_t* C) { + dim3 threadsPerBlock(TILE_SZ, TILE_SZ); + dim3 blocksPerGrid((y-1)/TILE_SZ+1, (x-1)/TILE_SZ+1); + matmul_kernel<<>>(x, y, z, A, B, C); + CudaTest("solving matmul kernel failed"); +} + void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { @@ -124,8 +161,8 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, N, M, - K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, @@ -190,7 +227,7 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void copy_gpu(size_t len, const float_t* in, float_t* out) { +void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } @@ -200,87 +237,100 @@ void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { } // TODO: use warp -__device__ void softmax(int n, const float_t* input, float_t* output) { +__device__ void softmax_device(int n, const float_t* input, float_t* output) { float_t max = input[0]; - for (size_t i = 1; i < n; i++) + for (int i = 1; i < n; i++) if (input[i] > max) max = input[i]; float_t denominator = 0.0; - for (size_t i = 0; i < n; i++) { - output[i] = exp(input[i] - max); + for (int i = 0; i < n; i++) { + output[i] = expf(input[i] - max); denominator += output[i]; + if (output[i] < 0.0) printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + //assert(output[i] >= 0.0); } - for (size_t i = 0; i < n; i++) + assert(denominator != 0.0); + for (int i = 0; i < n; i++) { output[i] /= denominator; -} - -// TODO: use warp -__device__ void d_softmax(size_t n, const float_t* p, const float_t* dp, float_t* dy) { - for (size_t i = 0; i < n; i++) { - dy[i] = 0; - for (size_t j = 0; j < n; j++) { - float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; - dy[i] += df * dp[j]; - } + //assert(output[i] >= 0.0); + //assert(output[i] <= 1.0); } } -__device__ void cross_entropy(int n, const label_t idx, const float_t* p, float_t& loss) { - if (p[idx] == 0.0) loss -= log(float_t(1e-10)); - else loss -= log(p[idx]); -} - -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { - for (int i = 0; i < n; i++) - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { + if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); + else loss -= logf(p[idx]); } // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss -__global__ void softmax_cross_entropy_kernel(int n, int len, +__global__ void softmax_cross_entropy_kernel(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, n) { - if (masks[i] == 1) { // masked + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked // normalize using softmax - softmax(len, in_data + len * i, out_data + len * i); - loss[i] = 0.0; - cross_entropy(len, labels[i], &out_data[len * i], loss[i]); + softmax_device(len, in_data + len*id, out_data + len*id); + //loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); } } } -void softmax_cross_entropy_gpu(int n, int len, const float_t* in, +void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - n, len, in, 
masks, labels, loss, out); + softmax_cross_entropy_kernel<<>>( + len, begin, end, in, masks, labels, loss, out); CudaTest("solving softmax_cross_entropy kernel failed"); } -__global__ void d_softmax_cross_entropy_kernel(int n, int len, const float_t* in, +// TODO: use warp +__device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* dy) { + for (int i = 0; i < n; i++) { + dy[i] = 0; + for (int j = 0; j < n; j++) { + float_t df = (j == i) ? p[i] * (1.0 - p[i]) : -p[j] * p[i]; + dy[i] += df * dp[j]; + } + } +} + +__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + //assert(p[i] >= 0.0); + //assert(p[i] >= 0.0 && p[i] <= 1.0); + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; + } +} + +__global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, n) { - float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[i], out + len * i, out_grad); - d_softmax(len, out + len * i, out_grad, diff + len * i); + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy(len, labels[id], out + len*id, out_grad); + d_softmax(len, out + len*id, out_grad, diff + len*id); + } } } -void d_softmax_cross_entropy_gpu(int n, int len, const float_t* in, +void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - n, len, in, masks, labels, out, diff); + d_softmax_cross_entropy_kernel<<>>( + len, begin, end, masks, labels, out, diff); CudaTest("solving d_softmax_cross_entropy kernel failed"); } -__global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, float_t* loss, HGAccumulator total) { total.thread_entry(); @@ -293,8 +343,9 @@ __global__ void masked_avg_loss_kernel(size_t begin, size_t end, mask_t* masks, total.thread_exit>(local_loss); } -acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss) { + assert(count > 0); HGAccumulator loss_accum; Shared total_loss = Shared(1); *(total_loss.cpu_wr_ptr()) = 0; @@ -307,10 +358,10 @@ acc_t masked_avg_loss(size_t begin, size_t end, size_t count, mask_t* masks, } // the arguments of the maxima -__device__ size_t argmax_device(const size_t n, const float_t* x) { +__device__ int argmax_device(const int n, const float_t* x) { float_t max = x[0]; - size_t max_ind = 0; - for (size_t i = 1; i < n; i++) { + int max_ind = 0; + for (int i = 1; i < n; i++) { if (x[i] > max) { max_ind = i; max = x[i]; @@ -319,8 +370,8 @@ __device__ size_t argmax_device(const size_t n, const float_t* x) { return max_ind; } -__global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, - size_t end, mask_t* masks, +__global__ void masked_accuracy_kernel(int num_classes, int begin, + int end, mask_t* masks, float_t* preds, label_t* labels, HGAccumulator total) { total.thread_entry(); @@ -337,9 +388,10 @@ __global__ void masked_accuracy_kernel(size_t num_classes, size_t begin, total.thread_exit>(local_accuracy); } -acc_t masked_accuracy_gpu(size_t num_classes, size_t begin, size_t end, - size_t count, mask_t* masks, float_t* preds, +acc_t masked_accuracy_gpu(int 
num_classes, int begin, int end, + int count, mask_t* masks, float_t* preds, label_t* labels) { + assert(count > 0); HGAccumulator accuracy_accum; Shared total_accuracy = Shared(1); *(total_accuracy.cpu_wr_ptr()) = 0; diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index ee9ff3b8d4..a9326aaefd 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -15,8 +15,8 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - std::cout << "updating weights on GPU, n = " << n << "\n"; - print_device_vector(10, dW, "dW"); + //std::cout << "updating weights on GPU, n = " << n << "\n"; + //print_device_vector(10, dW, "dW"); float_t * W1, *W2; CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); From 594d2c78583574da7d8c4fc9989a35e62c7c1eac Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 14:39:26 -0600 Subject: [PATCH 049/660] fix bug in optimizer.cu --- libdeepgalois/include/math_functions.hh | 4 +- libdeepgalois/include/optimizer.h | 16 +++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 49 +++++++++++++++++-- .../src/layers/softmax_loss_layer.cpp | 28 ++++++++++- libdeepgalois/src/math_functions.cu | 46 ++++++++++++----- libdeepgalois/src/optimizer.cu | 14 ++---- 6 files changed, 130 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index ef313815a7..0e0f9f38df 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -71,6 +71,7 @@ void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); // GPU operators +bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element void init_const_gpu(int n, float_t value, float_t *array); void copy_gpu(int len, const float_t* in, float_t* out); void vadd_gpu(const int n, const float_t* a, const float_t* b, @@ -103,8 +104,9 @@ acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); +bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void loss_malloc_device(int n, float_t*& loss); +void float_malloc_device(int n, float_t*& loss); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index d9f8de9116..96ef841644 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -3,7 +3,9 @@ #include #include #include "types.h" - +#ifndef CPU_ONLY +#include "math_functions.hh" +#endif // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) @@ -36,6 +38,18 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; +#ifndef CPU_ONLY + template + float_t *get_gpu(const size_t n, const float_t *key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; + } + 
std::unordered_map dE_[N]; +#endif }; /** diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 25b06417bb..574e9369c0 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -120,17 +120,46 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, assert(y <= 128); // currently only support feature length <= 128 //if (level_ == 0) print_device_vector(20, in_data, "in_data"); //if (level_ == 0) print_device_vector(20, d_W, "W"); + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " forward before sgemm Exception: out_temp nan, exiting\n"; + exit(0); + } + init_const_gpu(x*z, 0.0, out_temp); + if (isnan_gpu(x*y, in_temp)) { + std::cout << name_ << " forward Exception: in_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(y*z, d_W)) { + std::cout << name_ << " forward before sgemm Exception: d_W nan, exiting\n"; + exit(0); + } + if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - //copy_gpu(x*y, in_data, in_temp); - //matmul_gpu(x, z, y, in_temp, d_W, out_temp); + //sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + matmul_gpu(x, z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); //std::cout << "Forward " << name_ << ":\n"; //print_device_vector(20, out_data, "out_data"); + if (isnan_gpu(x*y, in_data)) { + std::cout << name_ << " forward Exception: in_data nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " forward after sgemm Exception: out_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(x*z, out_data)) { + std::cout << name_ << " forward Exception: out_data nan, exiting\n"; + exit(0); + } } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -143,6 +172,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + if (isnan_gpu(x*y, in_grad)) { + std::cout << name_ << "Exception: ingrad nan, exiting\n"; + exit(0); + } } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); if (level_ == 0) { @@ -151,5 +184,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, print_device_vector(20, out_temp, "out_temp"); print_device_vector(20, d_weight_grad, "dW"); } + + if (isnan_gpu(x*z, out_temp)) { + std::cout << name_ << " backward Exception: out_temp nan, exiting\n"; + exit(0); + } + + if (isnan_gpu(y*z, d_weight_grad)) { + std::cout << name_ << "Exception: ingrad nan, exiting\n"; + exit(0); + } } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 8457d1255a..c75781843b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,7 +9,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, #ifdef CPU_ONLY loss = new 
float_t[in_dims[0]]; // error for each sample #else - loss_malloc_device(in_dims[0], loss); + float_malloc_device(in_dims[0], loss); #endif } #ifdef CPU_ONLY @@ -70,8 +70,30 @@ acc_t softmax_loss_layer::get_masked_loss() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); + if (isnan_gpu(input_dims[0]*input_dims[1], in_data)) { + std::cout << name_ << " Exception: in_data nan, exiting\n"; + exit(0); + } + if (isnan_gpu(output_dims[0], loss)) { + std::cout << name_ << " Exception: loss nan, exiting\n"; + exit(0); + } + /* + if (isnan_gpu(output_dims[0], d_masks_)) { + std::cout << name_ << " Exception: masks nan, exiting\n"; + exit(0); + } + if (isnan_gpu(output_dims[0], context->d_labels)) { + std::cout << name_ << " Exception: labels nan, exiting\n"; + exit(0); + }*/ + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, context->d_labels, loss, out_data); + if (isnan_gpu(output_dims[0]*output_dims[1], out_data)) { + std::cout << name_ << " Exception: out_data nan, exiting\n"; + exit(0); + } } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -79,6 +101,10 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); + if (isnan_gpu(input_dims[1]*input_dims[1], in_grad)) { + std::cout << name_ << " Exception: ingrad nan, exiting\n"; + exit(0); + } } acc_t softmax_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 5d12d04986..99b83e4d6e 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -5,6 +5,29 @@ #include "cub/cub.cuh" #include +__global__ void init_const_kernel(int n, float_t value, float_t *array) { + CUDA_KERNEL_LOOP(i, n) { array[i] = value; } +} + +void init_const_gpu(int n, float_t value, float_t *array) { + init_const_kernel<<>>(n, value, array); + CudaTest("solving init_const kernel failed"); +} + +__global__ void isnan_test(const int n, const float *data, bool *result) { + CUDA_KERNEL_LOOP(i, n) { if (isnan(data[i])) *result = true; } +} + +bool isnan_gpu(int n, const float_t *array) { + bool *d_result, h_result = false; + cudaMalloc((void **)&d_result, sizeof (bool)); + cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); + isnan_test<<>>(n, array, d_result); + CudaTest("solving init_const kernel failed"); + cudaMemcpy(&h_result, d_result, sizeof(bool), cudaMemcpyDeviceToHost); + return h_result; +} + void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); } @@ -22,7 +45,15 @@ void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_ CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); } -void loss_malloc_device(int n, float_t*& loss) { +bool is_allocated_device(float_t* data) { + if (data == NULL) return false; + cudaPointerAttributes attributes; + CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); + if (attributes.devicePointer != NULL) return true; + return false; +} + +void float_malloc_device(int n, float_t*& loss) { CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); } @@ -32,23 +63,14 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } 
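// A minimal sketch (not part of this patch) of the NaN-guard pattern this
// commit repeats through graph_conv_layer and softmax_loss_layer: probe a
// device buffer with the isnan_gpu() helper added above and abort with a
// descriptive message. The wrapper name check_nan_or_die is hypothetical;
// isnan_gpu(), float_t and the layer's name_ string come from the code above.
inline void check_nan_or_die(int n, const float_t* d_buf, const std::string& where) {
  if (isnan_gpu(n, d_buf)) {            // true if any element of d_buf is NaN
    std::cout << where << " Exception: nan detected, exiting\n";
    exit(0);
  }
}
// usage inside a layer, e.g.: check_nan_or_die(x * z, out_temp, name_ + " forward");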
-__global__ void init_const_kernel(int n, float_t value, float_t *array) { - CUDA_KERNEL_LOOP(i, n) { - array[i] = value; - } -} - -void init_const_gpu(int n, float_t value, float_t *array) { - init_const_kernel<<>>(n, value, array); - CudaTest("solving init_const kernel failed"); -} - void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad) { if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); + init_const_gpu(x*y, 0.0, in); CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); + init_const_gpu(x*z, 0.0, out); CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index a9326aaefd..bf279e4e37 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -10,22 +10,18 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / - std::sqrt((vt[i] / (1.0 - b2_t)) + eps); + sqrtf((vt[i] / (1.0 - b2_t)) + eps); } } void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); - float_t * W1, *W2; - CUDA_CHECK(cudaMalloc((void**)&W1, n * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&W2, n * sizeof(float_t))); - copy_gpu(n, W, W1); - copy_gpu(n, W, W2); + float_t* cache = get_gpu<0>(n, W); + float_t* velocity = get_gpu<1>(n, W); + update_kernel<<>>( - n, alpha, b1, b2, b1_t, b2_t, eps, W1, W2, dW, W); + n, alpha, b1, b2, b1_t, b2_t, eps, cache, velocity, dW, W); b1_t *= b1; b2_t *= b2; - CUDA_CHECK(cudaFree(W1)); - CUDA_CHECK(cudaFree(W2)); } From 2f62cf42885ab646a0e4108d2bb910ef1bfda377 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 14:46:20 -0600 Subject: [PATCH 050/660] gpu working --- libdeepgalois/src/layers/graph_conv_layer.cpp | 49 +++---------------- .../src/layers/softmax_loss_layer.cpp | 26 ---------- 2 files changed, 6 insertions(+), 69 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 574e9369c0..c0c07cb889 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -121,45 +121,17 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, //if (level_ == 0) print_device_vector(20, in_data, "in_data"); //if (level_ == 0) print_device_vector(20, d_W, "W"); - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " forward before sgemm Exception: out_temp nan, exiting\n"; - exit(0); - } init_const_gpu(x*z, 0.0, out_temp); - if (isnan_gpu(x*y, in_temp)) { - std::cout << name_ << " forward Exception: in_temp nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(y*z, d_W)) { - std::cout << name_ << " forward before sgemm Exception: d_W nan, exiting\n"; - exit(0); - } - if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - //sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - matmul_gpu(x, z, y, in_temp, d_W, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + //matmul_gpu(x, 
z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); //if (level_ == 0) print_device_vector(20, out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); //std::cout << "Forward " << name_ << ":\n"; //print_device_vector(20, out_data, "out_data"); - if (isnan_gpu(x*y, in_data)) { - std::cout << name_ << " forward Exception: in_data nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " forward after sgemm Exception: out_temp nan, exiting\n"; - exit(0); - } - - if (isnan_gpu(x*z, out_data)) { - std::cout << name_ << " forward Exception: out_data nan, exiting\n"; - exit(0); - } } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -172,22 +144,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); - if (isnan_gpu(x*y, in_grad)) { - std::cout << name_ << "Exception: ingrad nan, exiting\n"; - exit(0); - } } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); if (level_ == 0) { - std::cout << "Backward " << name_ << ":\n"; - print_device_vector(20, in_data, "in_data"); - print_device_vector(20, out_temp, "out_temp"); - print_device_vector(20, d_weight_grad, "dW"); - } - - if (isnan_gpu(x*z, out_temp)) { - std::cout << name_ << " backward Exception: out_temp nan, exiting\n"; - exit(0); + //std::cout << "Backward " << name_ << ":\n"; + //print_device_vector(20, in_data, "in_data"); + //print_device_vector(20, out_temp, "out_temp"); + //print_device_vector(20, d_weight_grad, "dW"); } if (isnan_gpu(y*z, d_weight_grad)) { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index c75781843b..af04b06bbf 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -70,30 +70,8 @@ acc_t softmax_loss_layer::get_masked_loss() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - if (isnan_gpu(input_dims[0]*input_dims[1], in_data)) { - std::cout << name_ << " Exception: in_data nan, exiting\n"; - exit(0); - } - if (isnan_gpu(output_dims[0], loss)) { - std::cout << name_ << " Exception: loss nan, exiting\n"; - exit(0); - } - /* - if (isnan_gpu(output_dims[0], d_masks_)) { - std::cout << name_ << " Exception: masks nan, exiting\n"; - exit(0); - } - if (isnan_gpu(output_dims[0], context->d_labels)) { - std::cout << name_ << " Exception: labels nan, exiting\n"; - exit(0); - }*/ - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, context->d_labels, loss, out_data); - if (isnan_gpu(output_dims[0]*output_dims[1], out_data)) { - std::cout << name_ << " Exception: out_data nan, exiting\n"; - exit(0); - } } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -101,10 +79,6 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, context->d_labels, out_data, in_grad); - if (isnan_gpu(input_dims[1]*input_dims[1], in_grad)) { - std::cout << name_ << " Exception: ingrad nan, 
exiting\n"; - exit(0); - } } acc_t softmax_loss_layer::get_masked_loss() { From a6ebf28cdeb4db15a6a04fdd350eab94e937dd86 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 17:38:10 -0600 Subject: [PATCH 051/660] fix include/math_functions.hh --- libdeepgalois/include/math_functions.hh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 0e0f9f38df..414635b0e2 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -53,22 +53,22 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); -void dropout(int n, const float scale, const float dropout_rate, +void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative -void d_dropout(int n, const float scale, const float_t* in_diff, +void d_dropout(size_t n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); -void softmax(int n, const float_t* input, float_t* output); +void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(int n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(int n, const float_t* y, const float_t* p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(int n, const float_t* y, const float_t* p, float_t* d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element From 3efa9b8f9e6d508f51bd53e1347ed94fac75e5b3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 26 Feb 2020 17:51:54 -0600 Subject: [PATCH 052/660] remove debug code --- libdeepgalois/src/layers/graph_conv_layer.cpp | 18 ------------------ 1 file changed, 18 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c0c07cb889..710bd79b64 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -118,20 +118,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 - //if (level_ == 0) print_device_vector(20, in_data, "in_data"); - //if (level_ == 0) print_device_vector(20, d_W, "W"); - init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - //matmul_gpu(x, z, y, in_temp, d_W, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - //if (level_ == 0) print_device_vector(20, 
out_temp, "out_temp"); aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); - //std::cout << "Forward " << name_ << ":\n"; - //print_device_vector(20, out_data, "out_data"); } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) @@ -146,16 +139,5 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); - if (level_ == 0) { - //std::cout << "Backward " << name_ << ":\n"; - //print_device_vector(20, in_data, "in_data"); - //print_device_vector(20, out_temp, "out_temp"); - //print_device_vector(20, d_weight_grad, "dW"); - } - - if (isnan_gpu(y*z, d_weight_grad)) { - std::cout << name_ << "Exception: ingrad nan, exiting\n"; - exit(0); - } } #endif From 8089b4eec819d43d775160edea67bd78fcffd51b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 27 Feb 2020 14:16:53 -0600 Subject: [PATCH 053/660] float->float_t --- libdeepgalois/include/layers/graph_conv_layer.h | 6 +++--- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 7dfc8c2154..86a91c7287 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -18,7 +18,7 @@ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float dropout_rate, std::vector in_dims, + float_t dropout_rate, std::vector in_dims, std::vector out_dims); graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -49,8 +49,8 @@ class graph_conv_layer : public layer { bool norm_; // whether to normalize data bool bias_; // whether to add bias afterwards bool dropout_; // whether to use dropout at first - const float dropout_rate_; - float scale_; + const float_t dropout_rate_; + float_t scale_; net_phase phase_; size_t x; size_t y; diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 4a83caaf88..0182a7e65e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -39,7 +39,7 @@ class Net { void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, - float dropout_rate = 0.5) { + float_t dropout_rate = 0.5) { assert(dropout_rate < 1.0); assert(layer_id < NUM_CONV_LAYERS); std::vector in_dims(2), out_dims(2); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 710bd79b64..7f69f915de 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -18,7 +18,7 @@ void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& } graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, - bool bias, bool dropout, float dropout_rate, + bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), From b95adf0ca0d0d1c3f9aeefc945fdb95362fd4393 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 25 Feb 2020 18:21:39 -0600 Subject: [PATCH 054/660] various comments on some files --- libdeepgalois/include/lgraph.h | 14 ++++++++++++++ 
libdeepgalois/include/net.h | 1 + libdeepgalois/include/utils.h | 2 ++ libdeepgalois/src/context.cpp | 4 ++-- lonestargnn/gcn/gcn.cpp | 1 + 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index 65cd004c82..b15a505b45 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -11,6 +11,12 @@ typedef unsigned IndexT; typedef float ValueT; +/** + * Used to temporarily store read edges from edge list; graph itself doesn't + * use these. + * + * Source, dest, label. + */ struct Edge { IndexT src; IndexT dst; @@ -25,6 +31,14 @@ struct Edge { }; typedef std::vector EdgeList; +/** + * Learning graph. + * + * Provides basic accesors and such; nothing special. Just a CSR. + * Ultimatly becomes an LC_CSR. + * + * @todo remove this intermediate step if using edgelists + */ class LGraph { public: LGraph() : symmetrize_(false), directed_(false) {} diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 0182a7e65e..743ac5ea11 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -24,6 +24,7 @@ class Net { size_t get_nnodes() { return num_samples; } void train(optimizer* opt, bool need_validate); // training void construct_layers(); + //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 1c330daa5b..6ce0ef105f 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -94,6 +94,8 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } +//! Get masks from datafile where first line tells range of +//! set to create mask from inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index aab3e1c3cd..237416342c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -58,8 +58,6 @@ void Context::genGraph(LGraph& lg, Graph& g) { float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif -// user-defined pre-computing function, called during initialization -// for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v void Context::norm_factor_counting() { #ifdef CPU_ONLY norm_factor = new float_t[n]; @@ -113,6 +111,8 @@ size_t Context::read_labels(std::string dataset_str) { return num_classes; } +//! Read features, return the length of a feature vector +//! Features are stored in the Context class size_t Context::read_features(std::string dataset_str) { std::cout << "Reading features ... 
"; Timer t_read; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 9bfe231181..fe0e2708a6 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -10,6 +10,7 @@ int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train + // read network, features, ground truth, initialize metadata network.init(dataset, epochs, hidden1); network.construct_layers(); // default setting for now; can be customized by // the user From ad7a9df6582043ab40ba310a990014e360b319f0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:45:59 -0600 Subject: [PATCH 055/660] optimizer; tiny dnn copyright --- libdeepgalois/include/optimizer.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 96ef841644..28cbabc5f5 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -1,3 +1,11 @@ +/** + * Code modified from below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. + * Reused under 3-BSD + */ #pragma once #include @@ -6,6 +14,7 @@ #ifndef CPU_ONLY #include "math_functions.hh" #endif + // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) From 72af67a587a3e838865733eb586ffb4f43afc3c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:47:21 -0600 Subject: [PATCH 056/660] added licensenote.txt for later release purposes --- libdeepgalois/licensenote.txt | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 libdeepgalois/licensenote.txt diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt new file mode 100644 index 0000000000..c1e14addca --- /dev/null +++ b/libdeepgalois/licensenote.txt @@ -0,0 +1,8 @@ +TODO + +figure out which files have coded based on other codebsaes, get license, +note here + +e.g. +https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn +under BSD-3 From b90ae48cd90f9843a257ab88060e61f4f1343bd7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:57:38 -0600 Subject: [PATCH 057/660] layer copyright + some comments i made while reading --- libdeepgalois/include/layers/layer.h | 34 ++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 438ee45993..b4fecdbca2 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -1,4 +1,13 @@ #pragma once +/** + * Code based on below link. + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. + * Reused/revised under 3-BSD + */ #include #include @@ -29,8 +38,10 @@ * - in_shape ... specify input data shapes * - out_shape ... specify output data shapes * - layer_type ... name of layer + * + * Node inheritance is just to get accessed to linked-list semantics it + * provides **/ - class layer : public node { public: layer(unsigned level, std::vector in_dims, @@ -43,21 +54,26 @@ class layer : public node { virtual ~layer() = default; virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} + //! 
save context virtual void set_context(Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data) = - // 0; virtual void back_propagation(const vec_t &in_data, const vec_t - // &out_data, vec_t &out_grad, vec_t &in_grad) = 0; + + // main functions for layer work virtual void forward_propagation(const float_t* in_data, float_t* out_data) = 0; virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) = 0; + // is this layer trainable? void set_trainable(bool trainable) { trainable_ = trainable; } bool trainable() const { return trainable_; } + + // name metadata void set_name(std::string name) { name_ = name; } std::string get_name() { return name_; } + mask_t* get_device_masks() { return d_masks_; } + //! debug print function void print_layer_info() { std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" @@ -73,11 +89,14 @@ class layer : public node { copy_masks_device(input_dims[0], masks_, d_masks_); #endif } + + //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. } + void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); @@ -87,15 +106,22 @@ class layer : public node { void alloc_grad() { // allocate memory for intermediate gradients } + + //! calls forward propagation using previous layer as input and writes + //! to next layer as output void forward() { // std::cout << name_ << ": forwarding ... "; forward_propagation(prev()->get_data(), next()->get_data()); } + + //! calls backward propagation void backward() { // std::cout << name_ << ": backwarding ... "; back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); } + + //! use optimizer to update weights given gradient void update_weight(optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); From 03927df74445ac51319f09153b6132849039b942 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 27 Feb 2020 16:57:57 -0600 Subject: [PATCH 058/660] node.h copyright --- libdeepgalois/include/node.h | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 8b48e85aa8..947e997275 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -1,4 +1,14 @@ #pragma once +/** + * Code modified from below + * + * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/node.h + * + * Copyright (c) 2013, Taiga Nomi and the respective contributors + * All rights reserved. 
+ * Reused/revised under 3-BSD + */ + #include #include #include From 6727ddd377ec54cebfb61dbd622e62ed49ae6c23 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 29 Feb 2020 14:35:27 -0600 Subject: [PATCH 059/660] add pubmed dataset --- libdeepgalois/include/utils.h | 6 +++--- libdeepgalois/src/net.cpp | 4 ++-- lonestargnn/gcn/gcn.cpp | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/utils.h index 6ce0ef105f..086dcf321a 100644 --- a/libdeepgalois/include/utils.h +++ b/libdeepgalois/include/utils.h @@ -99,7 +99,7 @@ inline bool bernoulli(float_t p) { inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { std::cout << "Dataset currently not supported\n"; exit(1); } @@ -123,8 +123,8 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, } i++; } - // std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; in.close(); return sample_count; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9b78853833..6e253a2afd 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,7 +8,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; - std::cout << "Reading label masks ... "; + //std::cout << "Reading label masks ... 
"; train_mask.resize(num_samples, 0); val_mask.resize(num_samples, 0); // get testing and validation sets @@ -25,7 +25,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); } - std::cout << "Done\n"; + //std::cout << "Done\n"; num_layers = NUM_CONV_LAYERS + 1; // initialize feature metadata diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index fe0e2708a6..55c4e2320f 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -27,6 +27,7 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { + std::cout << "\n"; // test using test samples size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; @@ -44,7 +45,7 @@ int main(int argc, char** argv) { Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss + std::cout << "Testing: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; Ttest.stop(); From f6c35c88c5ce1b98e42fbc06b9536ae5619a8b6f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 29 Feb 2020 15:56:15 -0600 Subject: [PATCH 060/660] update aggregator --- libdeepgalois/include/types.h | 3 ++- libdeepgalois/src/aggregator.cu | 36 ++++++++++++++++++++++++++++++--- 2 files changed, 35 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/types.h index 387b5f5b60..b669a25188 100644 --- a/libdeepgalois/include/types.h +++ b/libdeepgalois/include/types.h @@ -21,6 +21,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test #define CHUNK_SIZE 256 #define TB_SIZE 256 +#define BLOCK_SIZE 256 #define WARP_SIZE 32 - +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #endif diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index f0c06722b6..522975dca3 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -12,7 +12,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, +__global__ void update_all_naive(size_t n, size_t len, CSRGraph g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { @@ -29,13 +29,43 @@ __global__ void update_all_kernel(size_t n, size_t len, CSRGraph g, } } +__global__ void update_all_warp(size_t n, size_t len, CSRGraph g, + const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for(int src = warp_id; src < n; src += num_warps) { + float_t a = 0.0, b = 1.0; + if (norm) a = norm_factor[src]; + if (thread_lane < 2) + ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); + __syncthreads(); + const index_type row_begin = ptrs[warp_lane][0]; + const index_type row_end = ptrs[warp_lane][1]; + 
index_type base_src = src * len; + for(index_type offset = row_begin; offset < row_end; offset ++) { + index_type dst = g.getEdgeDst(offset); + if (norm) b = a * norm_factor[dst]; + index_type base_dst = dst * len; + for (int i = 0; i < len; i += WARP_SIZE) + if (thread_lane+i < len) + out[base_src+thread_lane+i] += in[base_dst+thread_lane+i] * b; + } + } +} + void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - update_all_kernel<<>>( - n, len, g, in, out, norm, norm_factor); + //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); + update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } From 53a8ee82061d94cd04f9bdbf2def855bbdee8e6e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 09:51:14 -0600 Subject: [PATCH 061/660] update dropout_gpu --- libdeepgalois/include/math_functions.hh | 4 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 36 +++++++------------ 3 files changed, 15 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 414635b0e2..d647a35e3a 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -81,8 +81,8 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out); // dropout -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out); // dropout derivative +void d_dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, const unsigned* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7f69f915de..115b297512 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -136,7 +136,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - if (dropout_) d_dropout_gpu(x * y, scale_, in_grad, dropout_mask, in_grad); + if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 99b83e4d6e..ce8a8283ff 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -87,43 +87,31 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } __global__ void dropout_kernel(const int n, const float scale, - const float dropout_rate, const float_t* in, - unsigned* masks, 
curandState* state, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { - // curandState_t curand_state; - //curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 - //masks[i] = curand_uniform(&state[i]) <= dropout_rate ? 1 : 0; - //masks[i] = 1.0 - dropout_rate; - //out[i] = in[i] * masks[i] * scale; - masks[i] = 1.0; - out[i] = in[i]; - } + const float threshold, const float_t* in, + unsigned* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } } void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { - curandState* devStates; - CUDA_CHECK(cudaMalloc((void**)&devStates, n * sizeof(curandState))); - //std::cout << "[debug]: setup curand, n = " << n << "\n"; - //setup_curand_kernel<<>>(n, devStates); - //CudaTest("solving setup_curand kernel failed"); + gpu_rng_uniform(n, masks); //std::cout << "[debug]: dropout_gpu\n"; dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, devStates, out); + n, scale, dropout_rate, in, masks, out); CudaTest("solving dropout kernel failed"); - CUDA_CHECK(cudaFree(devStates)); //std::cout << "[debug]: dropout_gpu done\n"; } -__global__ void d_dropout_kernel(const int n, const float scale, const float_t* in, +__global__ void d_dropout_kernel(const int n, const float scale, + const float threshold, const float_t* in, const unsigned* masks, float_t* out) { - //CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i]; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } } -void d_dropout_gpu(const int n, const float scale, const float_t* in, - const unsigned* masks, float_t* out) { - d_dropout_kernel<<>>(n, scale, in, masks, out); +void d_dropout_gpu(const int n, const float scale, const float dropout_rate, + const float_t* in, const unsigned* masks, float_t* out) { + d_dropout_kernel<<>>( + n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); } From 405e8a47a6b51b2835f3441d84ad3c8a3251604b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 13:20:03 -0600 Subject: [PATCH 062/660] add cusparse --- libdeepgalois/include/aggregator.h | 2 + libdeepgalois/include/context.h | 21 +++++-- libdeepgalois/include/cutils.h | 38 ++++++++++++ libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/math_functions.hh | 4 ++ libdeepgalois/src/aggregator.cu | 11 +++- libdeepgalois/src/context.cu | 59 +++++++++++++++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 13 +++- libdeepgalois/src/math_functions.cu | 17 ++++++ libgpu/include/graph_gpu.h | 8 +++ 10 files changed, 153 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 552925c1bf..6853ea7126 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -8,4 +8,6 @@ void update_all(size_t len, Graph& g, const float_t* in, float_t* out, #include "graph_gpu.h" void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, + float_t* out, bool norm, const float_t* norm_factor); #endif diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 47b32d023e..7444e90251 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -1,4 +1,14 @@ #pragma once +/** + * Code modified from below + * + * 
https://github.com/BVLC/caffe/blob/master/include/caffe/common.hpp + * + * Copyright (c) 2014-2017 The Regents of the University of California (Regents) + * All rights reserved. + * Reused/revised under BSD 2-Clause license + */ + #include #include #include "types.h" @@ -58,6 +68,8 @@ class Context { #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } @@ -66,15 +78,12 @@ class Context { protected: #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif Brew mode_; int solver_count_; int solver_rank_; bool multiprocess_; - -private: - // The private constructor to avoid duplicate instantiation. - // Context(); }; diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/cutils.h index c817863242..7be873a183 100644 --- a/libdeepgalois/include/cutils.h +++ b/libdeepgalois/include/cutils.h @@ -1,6 +1,7 @@ #pragma once #include #include +#include #include #include #include @@ -56,6 +57,32 @@ inline const char* cublasGetErrorString(cublasStatus_t error) { return "Unknown cublas status"; } +inline const char* cusparseGetErrorString(cusparseStatus_t error) { + switch (error) { + case CUSPARSE_STATUS_SUCCESS: + return "CUSPARSE_STATUS_SUCCESS"; + case CUSPARSE_STATUS_NOT_INITIALIZED: + return "CUSPARSE_STATUS_NOT_INITIALIZED"; + case CUSPARSE_STATUS_ALLOC_FAILED: + return "CUSPARSE_STATUS_ALLOC_FAILED"; + case CUSPARSE_STATUS_INVALID_VALUE: + return "CUSPARSE_STATUS_INVALID_VALUE"; + case CUSPARSE_STATUS_ARCH_MISMATCH: + return "CUSPARSE_STATUS_ARCH_MISMATCH"; + case CUSPARSE_STATUS_MAPPING_ERROR: + return "CUSPARSE_STATUS_MAPPING_ERROR"; + case CUSPARSE_STATUS_EXECUTION_FAILED: + return "CUSPARSE_STATUS_EXECUTION_FAILED"; + case CUSPARSE_STATUS_INTERNAL_ERROR: + return "CUSPARSE_STATUS_INTERNAL_ERROR"; + case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + case CUSPARSE_STATUS_ZERO_PIVOT: + return "CUSPARSE_STATUS_ZERO_PIVOT"; + } + return "Unknown cusparse status"; +} + inline const char* curandGetErrorString(curandStatus_t error) { switch (error) { case CURAND_STATUS_SUCCESS: @@ -110,6 +137,17 @@ inline const char* curandGetErrorString(curandStatus_t error) { } \ } while (0) +#define CUSPARSE_CHECK(condition) \ + do { \ + cusparseStatus_t status = condition; \ + if (status != CUSPARSE_STATUS_SUCCESS) { \ + fprintf(stderr, \ + "error %d: cuSPARSE error in file '%s' in line %i : %s.\n", \ + status, __FILE__, __LINE__, cusparseGetErrorString(status)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #define CURAND_CHECK(condition) \ do { \ curandStatus_t status = condition; \ diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index b4fecdbca2..355f75a440 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -132,7 +132,7 @@ class layer : public node { bool parallel = (W.size() >= 512); 
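    // For reference, a sketch (not part of this patch) of what the adam
    // optimizer's update/update_gpu applies per weight i, mirroring
    // update_kernel in optimizer.cu:
    //   mt[i] = b1 * mt[i] + (1 - b1) * dW[i]
    //   vt[i] = b2 * vt[i] + (1 - b2) * dW[i] * dW[i]
    //   W[i] -= alpha * (mt[i] / (1 - b1_t)) / sqrt(vt[i] / (1 - b2_t) + eps)
    // where b1_t and b2_t are multiplied by b1 and b2 after every call.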
opt->update(weight_grad, W, parallel); // W += grad #else - std::cout << name_ << ": "; + //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index d647a35e3a..f89f34a5a5 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -91,6 +91,10 @@ void matmul_gpu(const size_t x, const size_t y, const size_t z, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nonzero_idx, + const float* B, const float beta, float* C); void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index 522975dca3..bbd7fbf8b3 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -62,10 +62,17 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; - //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } + +void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.nnodes; + CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); + //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); + csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); +} diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 647e010f60..1ba6bcc8bd 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,7 +22,8 @@ int64_t cluster_seedgen(void) { return seed; } -__global__ void norm_factor_counting_kernel(int n, CSRGraph graph, +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); @@ -33,34 +34,69 @@ __global__ void norm_factor_counting_kernel(int n, CSRGraph graph, } } +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t 
d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + void Context::norm_factor_counting_gpu() { - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; assert(graph_gpu.nnodes == n); + std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; +#ifdef USE_CUSPARSE + int nnz = graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, d_norm_factor); + norm_factor_counting_edge<<>>( + n, graph_gpu, d_norm_factor); +#else CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_kernel<<>>( + norm_factor_counting_node<<>>( n, graph_gpu, d_norm_factor); +#endif CudaTest("solving norm_factor_counting kernel failed"); } -cublasHandle_t Context::cublas_handle_ = 0; -curandGenerator_t Context::curand_generator_ = 0; +cublasHandle_t Context::cublas_handle_ = 0; +cusparseHandle_t Context::cusparse_handle_ = 0; +cusparseMatDescr_t Context::cusparse_matdescr_ = 0; +curandGenerator_t Context::curand_generator_ = 0; Context::Context() : mode_(Context::GPU), solver_count_(1), solver_rank_(0), multiprocess_(false) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); + CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); + CUSPARSE_CHECK(cusparseSetMatType(cusparse_matdescr_,CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK(cusparseSetMatIndexBase(cusparse_matdescr_,CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) { + if (cusparse_handle_) + CUSPARSE_CHECK(cusparseDestroy(cusparse_handle_)); + if (cusparse_matdescr_) + CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - } } void Context::SetDevice(const int device_id) { @@ -100,3 +136,4 @@ void Context::copy_data_to_device() { } float_t* Context::get_in_ptr() { return d_feats; } + diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 115b297512..753deed714 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -3,11 +3,16 @@ #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); +} #else void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + #ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + #else update_all(len, g, in, out, true, context->d_norm_factor); -#endif + #endif } +#endif void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { vec_t a(out.size(), 0); @@ -35,7 +40,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, } void graph_conv_layer::init() { - std::cout << name_ << ": allocating memory for params and temp data... 
"; + //std::cout << name_ << ": allocating memory for params and temp data... "; Timer t_alloc; t_alloc.Start(); #ifdef CPU_ONLY @@ -135,7 +140,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, else copy_gpu(x * z, out_grad, out_temp); if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); +#ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#else update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index ce8a8283ff..eb8f07c8b3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -183,6 +183,23 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float* C) { + float *transpose_C; + CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); + CUSPARSE_CHECK(cusparseScsrmm2(Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, Context::cusparse_matdescr(), A_nonzeros, + A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); +} + void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 050d7bbc69..4784e510a5 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -125,6 +125,14 @@ struct CSRGraph { assert(src <= nnodes); return row_start[src+1]; }; + __device__ __host__ index_type *row_start_ptr() { return row_start; } + __device__ __host__ const index_type *row_start_ptr() const { return row_start; } + __device__ __host__ index_type *edge_dst_ptr() { return edge_dst; } + __device__ __host__ const index_type *edge_dst_ptr() const { return edge_dst; } + __device__ __host__ node_data_type *node_data_ptr() { return node_data; } + __device__ __host__ const node_data_type *node_data_ptr() const { return node_data; } + __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } + __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at From 43a52f34c2477df2200df99b08416aaa630158de Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:02:30 -0600 Subject: [PATCH 063/660] update lgraph --- libdeepgalois/include/lgraph.h | 146 ++++++--------------------------- 1 file changed, 24 insertions(+), 122 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index b15a505b45..f3426db2a2 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -11,26 +11,6 @@ typedef 
unsigned IndexT; typedef float ValueT; -/** - * Used to temporarily store read edges from edge list; graph itself doesn't - * use these. - * - * Source, dest, label. - */ -struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, ValueT el) : src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } -}; -typedef std::vector EdgeList; - /** * Learning graph. * @@ -41,15 +21,10 @@ typedef std::vector EdgeList; */ class LGraph { public: - LGraph() : symmetrize_(false), directed_(false) {} + LGraph() : directed_(false) {} void clean() { delete[] rowptr_; delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - // labels_.clear(); - // vertices.clear(); } bool directed() const { return directed_; } size_t num_vertices() const { return num_vertices_; } @@ -59,111 +34,49 @@ class LGraph { unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } IndexT get_offset(IndexT n) { return rowptr_[n]; } IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - // ValueT * labels() { return labels_.data(); } - // ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char* filename, bool symmetrize = false) { + + void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false) { std::ifstream in; std::string line; in.open(filename, std::ios::in); - IndexT max_vid = 0; + size_t m, n; + in >> m >> n >> std::ws; + num_vertices_ = m; + num_edges_ = 0; + std::cout << "num_vertices " << num_vertices_ << "\n"; + vertices.resize(m); + for (size_t i = 0; i < n; i++) { + std::set neighbors; + if (add_self_loop) neighbors.insert(i); + vertices.push_back(neighbors); + } while (std::getline(in, line)) { std::istringstream edge_stream(line); IndexT u, v; edge_stream >> u; edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) - el.push_back(Edge(v, u, 1)); - if (u > max_vid) - max_vid = u; - if (v > max_vid) - max_vid = v; + vertices[u].insert(v); + if (symmetrize) vertices[v].insert(u); } in.close(); - directed_ = true; - num_vertices_ = max_vid + 1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " - << num_edges_ << "\n"; - MakeGraphFromEL(); + for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + std::cout << "num_edges " << num_edges_ << "\n"; + MakeCSR(vertices); } private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph bool directed_; size_t num_vertices_; size_t num_edges_; IndexT* rowptr_; IndexT* colidx_; - ValueT* weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector> vertices; - - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } - - void SquishGraph(bool remove_selfloops = true, - bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - // remove self loops - int num_selfloops = 0; - 
if (remove_selfloops) { - printf("Removing self loops..."); - for (size_t i = 0; i < num_vertices_; i++) { - for (unsigned j = 0; j < vertices[i].size(); j++) { - if (i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin() + j); - num_selfloops++; - j--; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int num_redundents = 0; - if (remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i++) { - for (unsigned j = 1; j < vertices[i].size(); j++) { - if (vertices[i][j].dst == vertices[i][j - 1].dst) { - vertices[i].erase(vertices[i].begin() + j); - num_redundents++; - j--; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } - - void MakeCSR(bool transpose) { + void MakeCSR(std::vector > vertices, bool transpose) { + std::vector degrees; degrees.resize(num_vertices_); std::fill(degrees.begin(), degrees.end(), 0); for (size_t i = 0; i < num_vertices_; i++) degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - std::vector offsets(degrees.size() + 1); IndexT total = 0; for (size_t n = 0; n < degrees.size(); n++) { @@ -171,26 +84,15 @@ class LGraph { total += degrees[n]; } offsets[degrees.size()] = total; - + degrees.clear(); assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; colidx_ = new IndexT[num_edges_]; rowptr_ = new IndexT[num_vertices_ + 1]; for (size_t i = 0; i < num_vertices_ + 1; i++) rowptr_[i] = offsets[i]; for (size_t i = 0; i < num_vertices_; i++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } + for (auto dst : vertices[i]) + colidx_[offsets[i]++] = dst; } } }; From 7fdbfe639bdf256abdb2287d984f859c992644e6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:15:41 -0600 Subject: [PATCH 064/660] fix lgraph --- libdeepgalois/include/lgraph.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/lgraph.h index f3426db2a2..2eb5ec6863 100644 --- a/libdeepgalois/include/lgraph.h +++ b/libdeepgalois/include/lgraph.h @@ -44,7 +44,7 @@ class LGraph { num_vertices_ = m; num_edges_ = 0; std::cout << "num_vertices " << num_vertices_ << "\n"; - vertices.resize(m); + std::vector > vertices(m); for (size_t i = 0; i < n; i++) { std::set neighbors; if (add_self_loop) neighbors.insert(i); @@ -71,7 +71,7 @@ class LGraph { IndexT* rowptr_; IndexT* colidx_; - void MakeCSR(std::vector > vertices, bool transpose) { + void MakeCSR(std::vector > vertices) { std::vector degrees; degrees.resize(num_vertices_); std::fill(degrees.begin(), degrees.end(), 0); From f07d8c22ef080d7d67a38dff8f58eac180300acc Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 14:41:23 -0600 Subject: [PATCH 065/660] add selfloop --- libdeepgalois/include/context.h | 1 + libdeepgalois/src/context.cpp | 27 ++++++++++++++++++++++++--- 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 7444e90251..61a8eed69d 100644 --- a/libdeepgalois/include/context.h +++ 
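// Hedged usage sketch of the slimmed-down LGraph above (assumption:
// "toy.el" is a hypothetical edge-list file whose header line carries the
// two counts read by read_edgelist, followed by one "src dst" pair per line).
#include <cstdio>
#include "lgraph.h" // moved to deepgalois/lgraph.h by a later patch

void lgraph_example() {
  LGraph g;
  g.read_edgelist("toy.el", /*symmetrize=*/true, /*add_self_loop=*/false);
  for (size_t v = 0; v < g.num_vertices(); ++v) {
    printf("vertex %zu has %u neighbors\n", v, g.out_degree(v));
    for (IndexT e = g.get_offset(v); e < g.get_offset(v + 1); ++e) {
      IndexT dst = g.get_dest(e); // consume edge (v, dst)
      (void)dst;
    }
  }
  g.clean();
}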
b/libdeepgalois/include/context.h @@ -65,6 +65,7 @@ class Context { #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); + void add_selfloop(Graph og, Graph &g); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 237416342c..6f7169add4 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -18,20 +18,23 @@ size_t Context::read_graph(std::string dataset_str) { } #ifdef CPU_ONLY -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype) { +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop = false) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - LGraph lgraph; if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); + LGraph lgraph; lgraph.read_edgelist(filename.c_str(), true); // symmetrize genGraph(lgraph, graph_cpu); lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph_cpu, filename); + if (selfloop) { + galois::graphs::readGraph(graph_temp, filename); + add_selfloop(graph_temp, graph_cpu); + } else galois::graphs::readGraph(graph_cpu, filename); } else { printf("Unkown file format\n"); exit(1); @@ -55,6 +58,24 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } +void Context::add_selfloop(Graph og, Graph &g) { + g.allocateFrom(og.size(), og.size()+og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + g.getData(src) = 1; + auto row_end = og.edge_end(src); + g.fixEndEdge(src, row_end+src+1); + bool self_inserted = false; + for (auto e : og.edges(src)) { + auto dst = og.edgeDst(e); + if (!self_inserted && dst > src) { + g.constructEdge(e, src, 0); + self_inserted = true; + } + g.constructEdge(e, dst, 0); + } +} + float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif From b040b0d171872c26747961509e49438c329377cf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 15:41:41 -0600 Subject: [PATCH 066/660] comments/licensing things --- .../include/layers/graph_conv_layer.h | 31 +++++++++++-------- libdeepgalois/licensenote.txt | 2 ++ libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++++-- libdeepgalois/src/math_functions.cpp | 1 + 4 files changed, 30 insertions(+), 15 deletions(-) diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index 86a91c7287..e4296a44ff 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -2,19 +2,24 @@ #include "layer.h" #include "aggregator.h" -/* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before - other operations. norm : bool, optional, if True, the normalizer - :math:`c_{ij}` is applied. Default: ``True``. bias : bool, optional, if True, - adds a learnable bias to the output. Default: ``False``. activation: callable - activation function/layer or None, optional If not None, applies an - activation function to the updated node features. Default: ``None``. 
-*/ + +/** + * GraphConv Layer; based on DGL implementation + * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html + * + * Parameters + * ---------- + * x: int, number of samples. + * y: int, Input feature size. + * z: int, Output feature size. + * dropout: bool, optional, if True, a dropout operation is applied before + * other operations. + * norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. + * Default: ``True``. + * bias : bool, optional, if True, adds a learnable bias to the output. + * Default: ``False``. + * activation: default false + */ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index c1e14addca..224adbc701 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -6,3 +6,5 @@ note here e.g. https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn under BSD-3 + +DGL structure as well from what I can tell diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 753deed714..c7ae0b944d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -73,9 +73,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z - } else + } else { matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + } + aggregate(z, context->graph_cpu, out_temp, out_data); + if (act_) { galois::do_all( galois::iterate((size_t)0, x), @@ -95,7 +98,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, out_temp[i * z + j] = out_data[i * z + j] > float_t(0) ? 
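// Hedged serial reference (assumption: illustrative only, dropout omitted)
// for the forward pass wired up below: out = ReLU( A_hat * (in * W) ),
// where A_hat is the normalized adjacency applied by aggregate()/update_all
// and norm[] holds the per-edge factors computed earlier.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

void gcn_forward_ref(std::size_t x, std::size_t y, std::size_t z,
                     const std::vector<float>& in,        // x*y features
                     const std::vector<float>& W,         // y*z weights
                     const std::vector<uint32_t>& rowptr, // x+1 offsets
                     const std::vector<uint32_t>& colidx, // |E| destinations
                     const std::vector<float>& norm,      // |E| edge factors
                     std::vector<float>& out) {           // x*z output
  std::vector<float> tmp(x * z, 0.0f); // tmp = in * W   (x*y times y*z)
  for (std::size_t i = 0; i < x; ++i)
    for (std::size_t k = 0; k < y; ++k)
      for (std::size_t j = 0; j < z; ++j)
        tmp[i * z + j] += in[i * y + k] * W[k * z + j];
  out.assign(x * z, 0.0f);             // out = A_hat * tmp (aggregate)
  for (std::size_t i = 0; i < x; ++i)
    for (uint32_t e = rowptr[i]; e < rowptr[i + 1]; ++e)
      for (std::size_t j = 0; j < z; ++j)
        out[i * z + j] += norm[e] * tmp[colidx[e] * z + j];
  for (float& v : out)                 // act_ == true -> ReLU
    v = std::max(v, 0.0f);
}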
out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); - } else copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + } else { + copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + } + if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix @@ -113,6 +119,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::loopname("d_dropout")); } } + // calculate weight gradients transpose(x, y, in_data, trans_data); // y*x matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9914fd68d5..5c6c8b7ec3 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -185,6 +185,7 @@ void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { } } +// num rows in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { galois::StatTimer Tmatmul("MatMul"); From 87a35505889bf9b5379cec8b9c093fda6f84af20 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 15:44:15 -0600 Subject: [PATCH 067/660] add selfloop --- libdeepgalois/include/context.h | 6 +++--- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/context.cpp | 31 +++++++++++++++++++------------ libdeepgalois/src/context.cu | 3 ++- libdeepgalois/src/net.cpp | 4 ++-- libgpu/include/graph_gpu.h | 30 ++++++++++++++++++++++++++---- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/lonestargnn.h | 1 + 8 files changed, 55 insertions(+), 24 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index 61a8eed69d..bfc3a90c25 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -35,15 +35,15 @@ class Context { bool multiprocess() { return multiprocess_; } void set_multiprocess(bool val) { multiprocess_ = val; } bool root_solver() { return solver_rank_ == 0; } - size_t read_graph(std::string dataset_str); + size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); label_t get_label(size_t i) { return labels[i]; } label_t* get_labels_ptr(size_t i) { return &(labels[0]); } float_t* get_in_ptr(); - size_t read_graph_cpu(std::string dataset_str, std::string filetype = "gr"); - size_t read_graph_gpu(std::string dataset_str); + size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); + size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void SetDevice(const int device_id); void DeviceQuery() {} diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 743ac5ea11..79364105e9 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -18,7 +18,7 @@ class Net { public: Net() {} - void init(std::string dataset_str, unsigned epochs, unsigned hidden1); + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 6f7169add4..eefa2da886 100644 --- 
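// Hedged shape summary inferred from the comment and call sites above: for
// matmul1D1D(dim_x, dim_y, dim_z, A, B, C), A is dim_x x dim_z, B is
// dim_z x dim_y, and C = A * B is dim_x x dim_y, all flat row-major arrays.
// e.g. multiplying x*y node features by a y*z weight matrix into x*z output:
//   matmul1D1D(/*dim_x=*/x, /*dim_y=*/z, /*dim_z=*/y, in_data, W, out_temp);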
a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,17 +8,17 @@ Context::Context() Context::~Context() {} #endif -size_t Context::read_graph(std::string dataset_str) { +size_t Context::read_graph(std::string dataset_str, bool selfloop) { #ifdef CPU_ONLY - n = read_graph_cpu(dataset_str, "gr"); + n = read_graph_cpu(dataset_str, "gr", selfloop); #else - n = read_graph_gpu(dataset_str); + n = read_graph_gpu(dataset_str, selfloop); #endif return n; } #ifdef CPU_ONLY -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop = false) { +size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { @@ -63,16 +63,23 @@ void Context::add_selfloop(Graph og, Graph &g) { g.constructNodes(); for (size_t src = 0; src < og.size(); src++) { g.getData(src) = 1; - auto row_end = og.edge_end(src); - g.fixEndEdge(src, row_end+src+1); + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); bool self_inserted = false; - for (auto e : og.edges(src)) { + for (auto e = begin; e != end; e++) { auto dst = og.edgeDst(e); - if (!self_inserted && dst > src) { - g.constructEdge(e, src, 0); - self_inserted = true; - } - g.constructEdge(e, dst, 0); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); } } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 1ba6bcc8bd..23c27f370f 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -116,10 +116,11 @@ void Context::SetDevice(const int device_id) { curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } -size_t Context::read_graph_gpu(std::string dataset_str) { +size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); + if (selfloop) g.add_selfloop(); g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6e253a2afd..6bfe6f0f30 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,9 +1,9 @@ #include "net.h" -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1) { +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new Context(); // read graph, get num nodes - num_samples = context->read_graph(dataset_str); + num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4784e510a5..be337eb9ac 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -104,10 +104,32 @@ struct CSRGraph { check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); #endif - //int *h_degrees = (int *)malloc(m * sizeof(int)); - //for (int i = 0; i < m; i++) h_degrees[i] = h_row_offsets[i + 1] - h_row_offsets[i]; - 
//check_cuda(cudaMalloc((void **)&d_degrees, m * sizeof(int))); - //check_cuda(cudaMemcpy(d_degrees, h_degrees, m * sizeof(int), cudaMemcpyHostToDevice)); + } + + void add_selfloop() { + index_type *new_edge_dst = new index_type[nnodes+nedges]; + for (index_type i = 0; i < nnodes; i++) { + index_type start = row_start[i]; + index_type end = row_start[i+1]; + bool selfloop_inserted = false; + for (index_type e = start; e != end; e++) { + index_type dst = edge_dst[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + new_edge_dst[e+i] = i; + new_edge_dst[e+i+1] = dst; + } else if (e+1 == end) { + selfloop_inserted = true; + new_edge_dst[e+i+1] = i; + new_edge_dst[e+i] = dst; + } else new_edge_dst[e+i] = dst; + } else new_edge_dst[e+i+1] = dst; + } + } + for (index_type i = 0; i < nnodes; i++) row_start[i] += i; + delete edge_dst; + edge_dst = new_edge_dst; } __device__ __host__ index_type getEdgeDst(unsigned edge) { diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 55c4e2320f..0f419896c8 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -11,7 +11,7 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); Net network; // the neural network to train // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1); + network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index 7ecbe32d7a..e41fb39ab4 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -48,6 +48,7 @@ static cll::opt max_degree( static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); //! 
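// Hedged host-side sketch (assumption: not the in-tree code) of what
// CSRGraph::add_selfloop above does: give every vertex i one extra edge
// (i, i), inserted so each sorted row stays sorted, which shifts row i's
// offset by i. Assumes rows are sorted and contain no self loops yet.
#include <cstdint>
#include <vector>

void add_selfloops_csr(std::vector<uint32_t>& rowptr,    // n+1 offsets
                       std::vector<uint32_t>& colidx) {  // |E| destinations
  const uint32_t n = (uint32_t)rowptr.size() - 1;
  std::vector<uint32_t> new_row(n + 1, 0);
  std::vector<uint32_t> new_col;
  new_col.reserve(colidx.size() + n);
  for (uint32_t i = 0; i < n; ++i) {
    new_row[i] = (uint32_t)new_col.size();
    bool inserted = false;
    for (uint32_t e = rowptr[i]; e < rowptr[i + 1]; ++e) {
      if (!inserted && colidx[e] > i) { // first neighbor past i: insert i here
        new_col.push_back(i);
        inserted = true;
      }
      new_col.push_back(colidx[e]);
    }
    if (!inserted) new_col.push_back(i); // row empty or all neighbors < i
  }
  new_row[n] = (uint32_t)new_col.size();
  rowptr.swap(new_row);
  colidx.swap(new_col);
}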
standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 987557635c871e588320bdae1ffc55aed1662102 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 16:21:50 -0600 Subject: [PATCH 068/660] comment add_selfloop --- libdeepgalois/include/context.h | 2 +- libdeepgalois/src/context.cpp | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/context.h index bfc3a90c25..4715c2c248 100644 --- a/libdeepgalois/include/context.h +++ b/libdeepgalois/include/context.h @@ -65,7 +65,7 @@ class Context { #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); - void add_selfloop(Graph og, Graph &g); + void add_selfloop(Graph &og, Graph &g); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index eefa2da886..e30b5f0f37 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -32,6 +32,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { + Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); add_selfloop(graph_temp, graph_cpu); } else galois::graphs::readGraph(graph_cpu, filename); @@ -58,9 +59,10 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } -void Context::add_selfloop(Graph og, Graph &g) { +void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); + /* for (size_t src = 0; src < og.size(); src++) { g.getData(src) = 1; auto begin = og.edge_begin(src); @@ -68,19 +70,21 @@ void Context::add_selfloop(Graph og, Graph &g) { g.fixEndEdge(src, end+src+1); bool self_inserted = false; for (auto e = begin; e != end; e++) { - auto dst = og.edgeDst(e); + auto dst = og.getEdgeDst(e); if (!self_inserted) { if (dst > src) { g.constructEdge(e+src, src, 0); g.constructEdge(e+src+1, dst, 0); self_inserted = true; - else if (e+1 == end) { + } else if (e+1 == end) { g.constructEdge(e+src+1, src, 0); g.constructEdge(e+src, dst, 0); self_inserted = true; } else g.constructEdge(e+src, dst, 0); } else g.constructEdge(e+src+1, dst, 0); + } } + */ } float_t* Context::get_in_ptr() { return &h_feats[0]; } From 655feaa6fefd596bb9a2219c92acd77e25bcabc2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 2 Mar 2020 17:43:06 -0600 Subject: [PATCH 069/660] fix selfloop --- libgpu/include/graph_gpu.h | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index be337eb9ac..da420ea416 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -105,8 +105,19 @@ struct CSRGraph { check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); #endif } - + void print_neighbors(index_type vid) { + printf("Vertex %d neighbors: [ ", vid); + index_type start = row_start[vid]; + index_type end = row_start[vid+1]; + for (index_type e = start; e != end; e++) { + index_type dst = edge_dst[e]; + printf("%d ", dst); + } + printf("]\n"); + } void add_selfloop() { + print_neighbors(nnodes-1); + print_neighbors(0); index_type *new_edge_dst = new index_type[nnodes+nedges]; for (index_type i = 0; i < nnodes; i++) { index_type start = row_start[i]; @@ -127,9 
+138,13 @@ struct CSRGraph { } else new_edge_dst[e+i+1] = dst; } } - for (index_type i = 0; i < nnodes; i++) row_start[i] += i; + for (index_type i = 0; i <= nnodes; i++) row_start[i] += i; delete edge_dst; edge_dst = new_edge_dst; + nedges += nnodes; + printf("nnodes = %d, nedges = %d\n", nnodes, nedges); + print_neighbors(nnodes-1); + print_neighbors(0); } __device__ __host__ index_type getEdgeDst(unsigned edge) { From 8cca98240cc87ed9d094b0e2558662a60adb91eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:17:22 -0600 Subject: [PATCH 070/660] random.h is from Caffe; add TODO --- libdeepgalois/include/random.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/random.h index 8560a24de1..b63914bca1 100644 --- a/libdeepgalois/include/random.h +++ b/libdeepgalois/include/random.h @@ -1,3 +1,6 @@ +// From Caffe library it seems +// TODO get the license from it + #ifndef RANDOM_H #define RANDOM_H typedef boost::mt19937 rng_t; From 4002f29138d082b84e43a81923200b7fb6e267ae Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:22:54 -0600 Subject: [PATCH 071/660] moving some unused(?) files --- libdeepgalois/include/{ => unused}/random.h | 0 libdeepgalois/include/{ => unused}/timer.h | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename libdeepgalois/include/{ => unused}/random.h (100%) rename libdeepgalois/include/{ => unused}/timer.h (100%) diff --git a/libdeepgalois/include/random.h b/libdeepgalois/include/unused/random.h similarity index 100% rename from libdeepgalois/include/random.h rename to libdeepgalois/include/unused/random.h diff --git a/libdeepgalois/include/timer.h b/libdeepgalois/include/unused/timer.h similarity index 100% rename from libdeepgalois/include/timer.h rename to libdeepgalois/include/unused/timer.h From c49db0ab0ff1f9b5dd39bf1dae344227d1513294 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:27:09 -0600 Subject: [PATCH 072/660] initialize deepgalois directory: context.h moved --- libdeepgalois/include/{ => deepgalois}/context.h | 0 libdeepgalois/include/layers/layer.h | 2 +- libdeepgalois/include/net.h | 2 +- libdeepgalois/src/context.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename libdeepgalois/include/{ => deepgalois}/context.h (100%) diff --git a/libdeepgalois/include/context.h b/libdeepgalois/include/deepgalois/context.h similarity index 100% rename from libdeepgalois/include/context.h rename to libdeepgalois/include/deepgalois/context.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 355f75a440..9d6f01f644 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -26,7 +26,7 @@ #include "../types.h" #include "../utils.h" #include "../gtypes.h" -#include "../context.h" +#include "deepgalois/context.h" #include "../optimizer.h" #include "../math_functions.hh" /** diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 79364105e9..1530a9c4dd 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -4,7 +4,7 @@ #include #include "types.h" #include "gtypes.h" -#include "context.h" +#include "deepgalois/context.h" #include "galois/Timer.h" #include "layers.h" #include "optimizer.h" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index e30b5f0f37..b257f8422c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,4 +1,4 @@ -#include "context.h" +#include 
"deepgalois/context.h" #include "gtypes.h" #ifdef CPU_ONLY From f54717a6ea05ce285ed1dc970f3f0d0217e15f1d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:35:14 -0600 Subject: [PATCH 073/660] gtypes, types, utils to deepgalois directory --- libdeepgalois/include/aggregator.h | 4 ++-- libdeepgalois/include/{ => deepgalois}/gtypes.h | 0 libdeepgalois/include/{ => deepgalois}/types.h | 0 libdeepgalois/include/{ => deepgalois}/utils.h | 0 libdeepgalois/include/layers/layer.h | 8 ++++---- libdeepgalois/include/math_functions.hh | 3 ++- libdeepgalois/include/net.h | 4 ++-- libdeepgalois/include/node.h | 2 +- libdeepgalois/include/optimizer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 -- libdeepgalois/src/context.cpp | 1 - libdeepgalois/src/math_functions.cpp | 1 - lonestargnn/lonestargnn.h | 4 ++-- 13 files changed, 14 insertions(+), 17 deletions(-) rename libdeepgalois/include/{ => deepgalois}/gtypes.h (100%) rename libdeepgalois/include/{ => deepgalois}/types.h (100%) rename libdeepgalois/include/{ => deepgalois}/utils.h (100%) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/aggregator.h index 6853ea7126..c54f8f69bc 100644 --- a/libdeepgalois/include/aggregator.h +++ b/libdeepgalois/include/aggregator.h @@ -1,7 +1,7 @@ #pragma once -#include "types.h" +#include "deepgalois/types.h" #ifdef CPU_ONLY -#include "gtypes.h" +#include "deepgalois/gtypes.h" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); #else diff --git a/libdeepgalois/include/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h similarity index 100% rename from libdeepgalois/include/gtypes.h rename to libdeepgalois/include/deepgalois/gtypes.h diff --git a/libdeepgalois/include/types.h b/libdeepgalois/include/deepgalois/types.h similarity index 100% rename from libdeepgalois/include/types.h rename to libdeepgalois/include/deepgalois/types.h diff --git a/libdeepgalois/include/utils.h b/libdeepgalois/include/deepgalois/utils.h similarity index 100% rename from libdeepgalois/include/utils.h rename to libdeepgalois/include/deepgalois/utils.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/layers/layer.h index 9d6f01f644..b393098680 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/layers/layer.h @@ -1,6 +1,6 @@ #pragma once /** - * Code based on below link. + * Code from on below link. Modified under Galois. 
* * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h * @@ -23,9 +23,9 @@ #include #include #include "../node.h" -#include "../types.h" -#include "../utils.h" -#include "../gtypes.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" +#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "../optimizer.h" #include "../math_functions.hh" diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index f89f34a5a5..2d3adc5404 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -4,7 +4,8 @@ #include #include #include -#include "types.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index 1530a9c4dd..ba60ddf771 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,8 +2,8 @@ #define _MODEL_H_ #include -#include "types.h" -#include "gtypes.h" +#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "galois/Timer.h" #include "layers.h" diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/node.h index 947e997275..fa58ddea2b 100644 --- a/libdeepgalois/include/node.h +++ b/libdeepgalois/include/node.h @@ -12,7 +12,7 @@ #include #include #include -#include "types.h" +#include "deepgalois/types.h" class node; class layer; class edge; diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/optimizer.h index 28cbabc5f5..cd6b36447c 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/optimizer.h @@ -10,7 +10,7 @@ #include #include -#include "types.h" +#include "deepgalois/types.h" #ifndef CPU_ONLY #include "math_functions.hh" #endif diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 6bb301b0be..370d3a6514 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,5 +1,3 @@ -#include "types.h" -#include "gtypes.h" #include "aggregator.h" #include "math_functions.hh" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index b257f8422c..284f693829 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,5 +1,4 @@ #include "deepgalois/context.h" -#include "gtypes.h" #ifdef CPU_ONLY Context::Context() diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 5c6c8b7ec3..451fe59070 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,4 @@ #include "math_functions.hh" -#include "utils.h" #include "galois/Timer.h" #include diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index e41fb39ab4..baf7681995 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -104,6 +104,6 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "types.h" -#include "utils.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" #include "net.h" From 07f02c25c3c6ba78b00a5154a0aa625d9fde6fbd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:37:42 -0600 Subject: [PATCH 074/660] added tinydnn note for math_functions.hh --- libdeepgalois/include/math_functions.hh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/math_functions.hh index 
2d3adc5404..3e1af207da 100644 --- a/libdeepgalois/include/math_functions.hh +++ b/libdeepgalois/include/math_functions.hh @@ -1,3 +1,7 @@ +/** + * File inspired by similar one from TinyDNN + * https://github.com/tiny-dnn/ + */ #ifndef _MATH_FUNCTIONS_ #define _MATH_FUNCTIONS_ #include From 4c2fabf62492910e9e18174ac5715177f03244e0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:39:30 -0600 Subject: [PATCH 075/660] aggregator.h to deepgalois --- libdeepgalois/include/{ => deepgalois}/aggregator.h | 0 libdeepgalois/include/layers/graph_conv_layer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename libdeepgalois/include/{ => deepgalois}/aggregator.h (100%) diff --git a/libdeepgalois/include/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h similarity index 100% rename from libdeepgalois/include/aggregator.h rename to libdeepgalois/include/deepgalois/aggregator.h diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/layers/graph_conv_layer.h index e4296a44ff..4016b49024 100644 --- a/libdeepgalois/include/layers/graph_conv_layer.h +++ b/libdeepgalois/include/layers/graph_conv_layer.h @@ -1,6 +1,6 @@ #pragma once #include "layer.h" -#include "aggregator.h" +#include "deepgalois/aggregator.h" /** diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index 370d3a6514..b9d1a70c7a 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,4 +1,4 @@ -#include "aggregator.h" +#include "deepgalois/aggregator.h" #include "math_functions.hh" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, From 56878dcef44e2ee5fb81e12f9fb98d47940e680f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:44:07 -0600 Subject: [PATCH 076/660] remove layers.h, move layers directory into deepgalois --- .../include/{ => deepgalois}/layers/arithmetic_layer.h | 0 .../include/{ => deepgalois}/layers/graph_conv_layer.h | 0 libdeepgalois/include/{ => deepgalois}/layers/layer.h | 6 +++--- .../include/{ => deepgalois}/layers/linear_layer.h | 0 .../include/{ => deepgalois}/layers/relu_layer.h | 0 .../include/{ => deepgalois}/layers/softmax_loss_layer.h | 0 libdeepgalois/include/layers.h | 8 -------- libdeepgalois/include/net.h | 5 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/layers/relu_layer.cpp | 2 +- libdeepgalois/src/layers/softmax_loss_layer.cpp | 2 +- 11 files changed, 9 insertions(+), 16 deletions(-) rename libdeepgalois/include/{ => deepgalois}/layers/arithmetic_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/graph_conv_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/layer.h (98%) rename libdeepgalois/include/{ => deepgalois}/layers/linear_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/relu_layer.h (100%) rename libdeepgalois/include/{ => deepgalois}/layers/softmax_loss_layer.h (100%) delete mode 100644 libdeepgalois/include/layers.h diff --git a/libdeepgalois/include/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h similarity index 100% rename from libdeepgalois/include/layers/arithmetic_layer.h rename to libdeepgalois/include/deepgalois/layers/arithmetic_layer.h diff --git a/libdeepgalois/include/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h similarity index 100% rename from libdeepgalois/include/layers/graph_conv_layer.h rename to 
libdeepgalois/include/deepgalois/layers/graph_conv_layer.h diff --git a/libdeepgalois/include/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h similarity index 98% rename from libdeepgalois/include/layers/layer.h rename to libdeepgalois/include/deepgalois/layers/layer.h index b393098680..7c40bc256c 100644 --- a/libdeepgalois/include/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,13 +22,13 @@ #include #include #include -#include "../node.h" +#include "../../node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "../optimizer.h" -#include "../math_functions.hh" +#include "../../optimizer.h" +#include "../../math_functions.hh" /** * base class of all kind of NN layers * diff --git a/libdeepgalois/include/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h similarity index 100% rename from libdeepgalois/include/layers/linear_layer.h rename to libdeepgalois/include/deepgalois/layers/linear_layer.h diff --git a/libdeepgalois/include/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h similarity index 100% rename from libdeepgalois/include/layers/relu_layer.h rename to libdeepgalois/include/deepgalois/layers/relu_layer.h diff --git a/libdeepgalois/include/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h similarity index 100% rename from libdeepgalois/include/layers/softmax_loss_layer.h rename to libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h diff --git a/libdeepgalois/include/layers.h b/libdeepgalois/include/layers.h deleted file mode 100644 index 432d315183..0000000000 --- a/libdeepgalois/include/layers.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LAYERS_H_ -#define _LAYERS_H_ -//#include "layers/relu_layer.h" -//#include "layers/linear_layer.h" -//#include "layers/arithmetic_layer.h" -#include "layers/graph_conv_layer.h" -#include "layers/softmax_loss_layer.h" -#endif diff --git a/libdeepgalois/include/net.h b/libdeepgalois/include/net.h index ba60ddf771..b1d514050e 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/net.h @@ -2,11 +2,12 @@ #define _MODEL_H_ #include +#include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "galois/Timer.h" -#include "layers.h" +#include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" #include "optimizer.h" #define NUM_CONV_LAYERS 2 diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c7ae0b944d..adcd7cc33c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/graph_conv_layer.h" +#include "deepgalois/layers/graph_conv_layer.h" #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 0c52d0eb25..ce2a167cb0 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/relu_layer.h" +#include "deepgalois/layers/relu_layer.h" // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const tensor_t& in_data, diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 
af04b06bbf..cc3e3b941b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,4 +1,4 @@ -#include "layers/softmax_loss_layer.h" +#include "deepgalois/layers/softmax_loss_layer.h" softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, From d47a23ac2280b4b8fe8366bee72519914c034970 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:49:32 -0600 Subject: [PATCH 077/660] lgraph, cutils, math, net, node, optimizer to deepgalois --- libdeepgalois/include/deepgalois/context.h | 10 +++++----- libdeepgalois/include/{ => deepgalois}/cutils.h | 0 libdeepgalois/include/deepgalois/layers/layer.h | 6 +++--- libdeepgalois/include/{ => deepgalois}/lgraph.h | 0 .../include/{ => deepgalois}/math_functions.hh | 0 libdeepgalois/include/{ => deepgalois}/net.h | 2 +- libdeepgalois/include/{ => deepgalois}/node.h | 0 libdeepgalois/include/{ => deepgalois}/optimizer.h | 2 +- libdeepgalois/src/aggregator.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 2 +- libdeepgalois/src/net.cpp | 2 +- libdeepgalois/src/node.cpp | 2 +- libdeepgalois/src/optimizer.cpp | 2 +- lonestargnn/lonestargnn.h | 2 +- 14 files changed, 16 insertions(+), 16 deletions(-) rename libdeepgalois/include/{ => deepgalois}/cutils.h (100%) rename libdeepgalois/include/{ => deepgalois}/lgraph.h (100%) rename libdeepgalois/include/{ => deepgalois}/math_functions.hh (100%) rename libdeepgalois/include/{ => deepgalois}/net.h (99%) rename libdeepgalois/include/{ => deepgalois}/node.h (100%) rename libdeepgalois/include/{ => deepgalois}/optimizer.h (99%) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 4715c2c248..644f3f0c15 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -11,14 +11,14 @@ #include #include -#include "types.h" -#include "utils.h" +#include "deepgalois/types.h" +#include "deepgalois/utils.h" #ifdef CPU_ONLY -#include "lgraph.h" -#include "gtypes.h" +#include "deepgalois/lgraph.h" +#include "deepgalois/gtypes.h" #else #include "graph_gpu.h" -#include "cutils.h" +#include "deepgalois/cutils.h" #endif class Context { diff --git a/libdeepgalois/include/cutils.h b/libdeepgalois/include/deepgalois/cutils.h similarity index 100% rename from libdeepgalois/include/cutils.h rename to libdeepgalois/include/deepgalois/cutils.h diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7c40bc256c..b5757de2e3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,13 +22,13 @@ #include #include #include -#include "../../node.h" +#include "deepgalois/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" #include "deepgalois/context.h" -#include "../../optimizer.h" -#include "../../math_functions.hh" +#include "deepgalois/optimizer.h" +#include "deepgalois/math_functions.hh" /** * base class of all kind of NN layers * diff --git a/libdeepgalois/include/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h similarity index 100% rename from libdeepgalois/include/lgraph.h rename to libdeepgalois/include/deepgalois/lgraph.h diff --git a/libdeepgalois/include/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh similarity index 100% rename from libdeepgalois/include/math_functions.hh rename to libdeepgalois/include/deepgalois/math_functions.hh diff --git 
a/libdeepgalois/include/net.h b/libdeepgalois/include/deepgalois/net.h similarity index 99% rename from libdeepgalois/include/net.h rename to libdeepgalois/include/deepgalois/net.h index b1d514050e..79176674c2 100644 --- a/libdeepgalois/include/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,7 +8,7 @@ #include "deepgalois/context.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" -#include "optimizer.h" +#include "deepgalois/optimizer.h" #define NUM_CONV_LAYERS 2 diff --git a/libdeepgalois/include/node.h b/libdeepgalois/include/deepgalois/node.h similarity index 100% rename from libdeepgalois/include/node.h rename to libdeepgalois/include/deepgalois/node.h diff --git a/libdeepgalois/include/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h similarity index 99% rename from libdeepgalois/include/optimizer.h rename to libdeepgalois/include/deepgalois/optimizer.h index cd6b36447c..2c2d783d0d 100644 --- a/libdeepgalois/include/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -12,7 +12,7 @@ #include #include "deepgalois/types.h" #ifndef CPU_ONLY -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" #endif // base class of optimizer diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index b9d1a70c7a..c2a50710dd 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,5 +1,5 @@ #include "deepgalois/aggregator.h" -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 451fe59070..144419f16d 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,4 +1,4 @@ -#include "math_functions.hh" +#include "deepgalois/math_functions.hh" #include "galois/Timer.h" #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6bfe6f0f30..33da83d0fc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,4 +1,4 @@ -#include "net.h" +#include "deepgalois/net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new Context(); diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 9b88620d65..4ab918e0cd 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,4 +1,4 @@ -#include "node.h" +#include "deepgalois/node.h" #include void edge::alloc() { diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 0ec40cf4d0..c9c8768610 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,4 +1,4 @@ -#include "optimizer.h" +#include "deepgalois/optimizer.h" #include "galois/Galois.h" void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index baf7681995..a04905b5cb 100644 --- a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -106,4 +106,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, #include "deepgalois/types.h" #include "deepgalois/utils.h" -#include "net.h" +#include "deepgalois/net.h" From 7d69511228db07d588a1dfa2bef3e159bee38a12 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:55:06 -0600 Subject: [PATCH 078/660] Context class now deepgalois::Context --- 
libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/include/deepgalois/layers/layer.h | 4 ++-- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/context.cpp | 5 ++++- libdeepgalois/src/context.cu | 4 +++- libdeepgalois/src/net.cpp | 2 +- 6 files changed, 13 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 644f3f0c15..d7f400d582 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -21,6 +21,7 @@ #include "deepgalois/cutils.h" #endif +namespace deepgalois { class Context { public: Context(); @@ -88,3 +89,4 @@ class Context { int solver_rank_; bool multiprocess_; }; +} // end deepgalois namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b5757de2e3..bf89ad216d 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -55,7 +55,7 @@ class layer : public node { virtual std::string layer_type() const = 0; virtual void set_netphase(net_phase phase) {} //! save context - virtual void set_context(Context* ctx) { context = ctx; } + virtual void set_context(deepgalois::Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } // main functions for layer work @@ -158,7 +158,7 @@ class layer : public node { mask_t* masks_; // masks to show which samples are valid mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 - Context* context; + deepgalois::Context* context; }; // head: layer i+1, tail: layer i diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 79176674c2..efdd99b7b3 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -101,7 +101,7 @@ class Net { } protected: - Context* context; + deepgalois::Context* context; size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 284f693829..5e2ccf4c02 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,5 +1,7 @@ #include "deepgalois/context.h" +namespace deepgalois { + #ifdef CPU_ONLY Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), @@ -179,4 +181,5 @@ inline void init_features(size_t dim, vec_t &x) { for (size_t i = 0; i < dim; ++i) x[i] = dist(rng); } -//*/ +*/ +} // end deepgalois namespace diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 23c27f370f..270252c5d8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -54,6 +54,8 @@ __global__ void norm_factor_counting_edge(int n, CSRGraph graph, } } +namespace deepgalois { + void Context::norm_factor_counting_gpu() { assert(graph_gpu.nnodes == n); std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; @@ -137,4 +139,4 @@ void Context::copy_data_to_device() { } float_t* Context::get_in_ptr() { return d_feats; } - +} // namespace context diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 33da83d0fc..9c907dbf57 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,7 +1,7 @@ #include "deepgalois/net.h" void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { - context = new Context(); + context 
= new deepgalois::Context(); // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); From 7f28900ced040ad353a1533074b0a49a71f77755 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 16:58:29 -0600 Subject: [PATCH 079/660] Net class now deepgalois::Net --- libdeepgalois/include/deepgalois/net.h | 6 ++++++ libdeepgalois/src/net.cpp | 4 ++++ lonestargnn/gcn/gcn.cpp | 2 +- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index efdd99b7b3..0e18f39e1c 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,3 +1,5 @@ +// TODO if this code was based on something, get copyright/license and put here + #ifndef _MODEL_H_ #define _MODEL_H_ @@ -12,6 +14,8 @@ #define NUM_CONV_LAYERS 2 +namespace deepgalois { + // N: number of vertices, D: feature vector dimentions, // E: number of distinct labels, i.e. number of vertex classes // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) @@ -114,4 +118,6 @@ class Net { acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); }; +} // namespace deepgalois + #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9c907dbf57..2221b3daad 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,5 +1,7 @@ #include "deepgalois/net.h" +namespace deepgalois { + void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { context = new deepgalois::Context(); // read graph, get num nodes @@ -123,3 +125,5 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, context->d_labels); #endif } + +} // namespace deepgalois diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 0f419896c8..2d47237298 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -9,7 +9,7 @@ const char* url = 0; int main(int argc, char** argv) { galois::SharedMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - Net network; // the neural network to train + deepgalois::Net network; // the neural network to train // read network, features, ground truth, initialize metadata network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by From ce01e5b2702fe5fa73f850aec2bba80ff425cc38 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:04:20 -0600 Subject: [PATCH 080/660] optimizers now deepgalois::optimizer --- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 11 ++++++++++- libdeepgalois/src/optimizer.cpp | 4 ++++ lonestargnn/gcn/gcn.cpp | 2 +- 4 files changed, 16 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index bf89ad216d..028148c194 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -122,7 +122,7 @@ class layer : public node { } //! 
use optimizer to update weights given gradient - void update_weight(optimizer* opt) { + void update_weight(deepgalois::optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 2c2d783d0d..b6a90917ff 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -1,5 +1,5 @@ /** - * Code modified from below link. + * Code taken/modified from below link. * * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h * Copyright (c) 2013, Taiga Nomi and the respective contributors @@ -8,6 +8,11 @@ */ #pragma once +// TODO: +// - use classes, not structs (modern C++) +// - templatize this instead of using inheritance +// - put optimizers in their own namespace + #include #include #include "deepgalois/types.h" @@ -15,6 +20,8 @@ #include "deepgalois/math_functions.hh" #endif +namespace deepgalois { + // base class of optimizer // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) @@ -184,3 +191,5 @@ struct nesterov_momentum : public stateful_optimizer<1> { float_t lambda; // weight decay float_t mu; // momentum }; + +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index c9c8768610..c3267f282e 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,6 +1,8 @@ #include "deepgalois/optimizer.h" #include "galois/Galois.h" +namespace deepgalois { + void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { vec_t& g = get<0>(W); if (parallelize) { @@ -80,3 +82,5 @@ void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { dWprev[i] = V; }, galois::loopname("nesterov_momentum_update")); } + +} // namespace deepgalois diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 2d47237298..7b4977dbe1 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -20,7 +20,7 @@ int main(int argc, char** argv) { // the optimizer used to update parameters, see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - optimizer* opt = new adam(); + deepgalois::optimizer* opt = new deepgalois::adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From f9f0d5ba5d3631852eb878a18c0fb3d4c6bc4364 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:16:06 -0600 Subject: [PATCH 081/660] deepgalois namespace for node and layers TODO qualify classes with deepgalois; right now relies on everything being in deepgalois namespace --- .../include/deepgalois/layers/arithmetic_layer.h | 2 ++ .../include/deepgalois/layers/graph_conv_layer.h | 3 ++- libdeepgalois/include/deepgalois/layers/layer.h | 12 +++++++++--- .../include/deepgalois/layers/linear_layer.h | 2 ++ libdeepgalois/include/deepgalois/layers/relu_layer.h | 2 ++ .../include/deepgalois/layers/softmax_loss_layer.h | 2 ++ libdeepgalois/include/deepgalois/node.h | 5 +++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 4 ++++ libdeepgalois/src/layers/relu_layer.cpp | 4 ++++ libdeepgalois/src/layers/softmax_loss_layer.cpp | 4 ++++ libdeepgalois/src/node.cpp | 4 ++++ 11 files changed, 40 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h index 
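// Hedged sketch of the textbook Adam step behind the deepgalois::adam
// optimizer selected in gcn.cpp above (assumption: illustrative only, not a
// copy of adam::update; hyper-parameter names and defaults are the usual ones).
#include <cmath>
#include <cstddef>
#include <vector>

void adam_step(std::vector<float>& W, const std::vector<float>& dW,
               std::vector<float>& m, std::vector<float>& v, int t,
               float alpha = 0.01f, float b1 = 0.9f, float b2 = 0.999f,
               float eps = 1e-8f) {
  const float bc1 = 1.0f - std::pow(b1, (float)t); // bias correction, t >= 1
  const float bc2 = 1.0f - std::pow(b2, (float)t);
  for (std::size_t i = 0; i < W.size(); ++i) {
    m[i] = b1 * m[i] + (1.0f - b1) * dW[i];         // 1st moment estimate
    v[i] = b2 * v[i] + (1.0f - b2) * dW[i] * dW[i]; // 2nd moment estimate
    W[i] -= alpha * (m[i] / bc1) / (std::sqrt(v[i] / bc2) + eps);
  }
}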
63dc66f780..c28d0ed89c 100644 --- a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { // element-wise add N vectors ```y_i = x0_i + x1_i + ... + xnum_i``` class elementwise_add_layer : public layer { public: @@ -24,3 +25,4 @@ class elementwise_add_layer : public layer { in_grad = out_grad; } }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 4016b49024..ed681bdf30 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -2,7 +2,6 @@ #include "layer.h" #include "deepgalois/aggregator.h" - /** * GraphConv Layer; based on DGL implementation * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html @@ -20,6 +19,7 @@ * Default: ``False``. * activation: default false */ +namespace deepgalois { class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, @@ -84,3 +84,4 @@ class graph_conv_layer : public layer { } } }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 028148c194..c0deaf6748 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -29,6 +29,9 @@ #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" + +namespace deepgalois { + /** * base class of all kind of NN layers * @@ -42,7 +45,7 @@ * Node inheritance is just to get accessed to linked-list semantics it * provides **/ -class layer : public node { +class layer : public deepgalois::node { public: layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -92,14 +95,14 @@ class layer : public node { //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. 
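    // (note: only the input layer is fed through set_in_data(); for every other
    //  layer, connect() below aliases prev_ to the upstream layer's next_ edge,
    //  so the feature/gradient buffers allocated by add_edge()/alloc() are
    //  shared between adjacent layers rather than copied)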
} void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); + next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } @@ -161,6 +164,7 @@ class layer : public node { deepgalois::Context* context; }; + // head: layer i+1, tail: layer i inline void connect(layer* head, layer* tail, size_t head_index = 0, size_t tail_index = 0) { @@ -178,3 +182,5 @@ inline void connect(layer* head, layer* tail, size_t head_index = 0, tail->prev_ = head->next_; tail->prev_->add_next_node(tail); } + +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h index 55d5d245d8..d68ae12479 100644 --- a/libdeepgalois/include/deepgalois/layers/linear_layer.h +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { class linear_layer : public layer { public: linear_layer(unsigned level, float_t scale, float_t bias, @@ -30,3 +31,4 @@ class linear_layer : public layer { protected: float_t scale_, bias_; }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index 8a7b447038..a85d51608d 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { // ReLU Layer class relu_layer : public layer { public: @@ -19,3 +20,4 @@ class relu_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 0fa56cf7fe..798ad7a79a 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -1,6 +1,7 @@ #pragma once #include "layer.h" +namespace deepgalois { class softmax_loss_layer : public layer { public: softmax_loss_layer(unsigned level, std::vector in_dims, @@ -14,3 +15,4 @@ class softmax_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_masked_loss(); }; +} diff --git a/libdeepgalois/include/deepgalois/node.h b/libdeepgalois/include/deepgalois/node.h index fa58ddea2b..fcb20513c0 100644 --- a/libdeepgalois/include/deepgalois/node.h +++ b/libdeepgalois/include/deepgalois/node.h @@ -13,6 +13,9 @@ #include #include #include "deepgalois/types.h" + +namespace deepgalois { + class node; class layer; class edge; @@ -69,3 +72,5 @@ class edge { node* prev_; // previous node, "producer" of data node* next_; // next node, "consumer" of data }; + +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index adcd7cc33c..b496f52d57 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/graph_conv_layer.h" +namespace deepgalois { + #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { update_all(len, g, in, out, true, context->norm_factor); @@ -157,3 +159,5 @@ void graph_conv_layer::back_propagation(const float_t* in_data, sgemm_gpu(CblasTrans, 
CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); } #endif + +} // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index ce2a167cb0..7441294f83 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/relu_layer.h" +namespace deepgalois { + // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const tensor_t& in_data, tensor_t& out_data) { @@ -43,3 +45,5 @@ void relu_layer::back_propagation(const float_t* in_data, const size_t count = input_dims[0] * input_dims[1]; d_relu_gpu(count, out_grad, in_data, in_grad); } + +} // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index cc3e3b941b..f16ba58fbe 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,5 +1,7 @@ #include "deepgalois/layers/softmax_loss_layer.h" +namespace deepgalois { + softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -85,3 +87,5 @@ acc_t softmax_loss_layer::get_masked_loss() { return masked_avg_loss(begin_, end_, count_, d_masks_, loss); } #endif + +} // namespace diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index 4ab918e0cd..e3117d9da2 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,6 +1,8 @@ #include "deepgalois/node.h" #include +namespace deepgalois { + void edge::alloc() { // std::cout << "Allocating memory for tensors (intermediate features and // gradients) ...\n"; @@ -38,3 +40,5 @@ void edge::clear_grads() { clear_grads_gpu(); #endif } + +} // namespace deepgalois From 47d84e719a06a42a5c7af14be65e132a69f6f700 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:17:19 -0600 Subject: [PATCH 082/660] moved deepgalois node to layers (only used there) --- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/{ => layers}/node.h | 0 libdeepgalois/src/node.cpp | 2 +- 3 files changed, 2 insertions(+), 2 deletions(-) rename libdeepgalois/include/deepgalois/{ => layers}/node.h (100%) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c0deaf6748..f30ad03b7b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -22,7 +22,7 @@ #include #include #include -#include "deepgalois/node.h" +#include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" diff --git a/libdeepgalois/include/deepgalois/node.h b/libdeepgalois/include/deepgalois/layers/node.h similarity index 100% rename from libdeepgalois/include/deepgalois/node.h rename to libdeepgalois/include/deepgalois/layers/node.h diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index e3117d9da2..b1ee96a58b 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -1,4 +1,4 @@ -#include "deepgalois/node.h" +#include "deepgalois/layers/node.h" #include namespace deepgalois { From 6116ad939f6b7ce5ea89479c74b66f9c6d0932a8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:22:17 -0600 Subject: [PATCH 083/660] utils in deepgalois namespace --- libdeepgalois/include/deepgalois/utils.h | 5 +++++ libdeepgalois/src/math_functions.cpp | 6 +++--- lonestargnn/gcn/gcn.cpp | 4 ++-- 3 files 
changed, 10 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 086dcf321a..8b76d570dc 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -8,8 +8,11 @@ #include #include +namespace deepgalois { + const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + enum class net_phase { train, test }; class ResourceManager { @@ -128,3 +131,5 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, in.close(); return sample_count; } + +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 144419f16d..979f5ce9d7 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -344,7 +344,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, assert(masks.size() == out.size()); // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -352,7 +352,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -360,7 +360,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { for (size_t i = 0; i < n; ++i) - masks[i] = bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate); for (size_t i = 0; i < n; ++i) out[i] = in[i] * masks[i] * scale; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 7b4977dbe1..005e6b1477 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); - ResourceManager rm; + deepgalois::ResourceManager rm; // the optimizer used to update parameters, see optimizer.h for more details // optimizer *opt = new gradient_descent(); @@ -40,7 +40,7 @@ int main(int argc, char** argv) { for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; } else - test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); + test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); galois::StatTimer Ttest("Test"); Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, From 78f609a837b4fb1d49af691d2ee4db83585b5fa3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:24:32 -0600 Subject: [PATCH 084/660] TODOs to deepgalois utils: galois supports a lot of what is being defined there, so reuse galois instead --- libdeepgalois/include/deepgalois/utils.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 8b76d570dc..51c0bb5c95 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -15,6 +15,8 @@ const 
std::string path = enum class net_phase { train, test }; +//! tracks max mem usage with rusage +// TODO use Galois's getrusage functionality class ResourceManager { public: ResourceManager() {} @@ -41,6 +43,7 @@ class ResourceManager { } }; +// TODO don't need a separate timer: use Galois's regular timer class Timer { public: Timer() {} From e55eea07fe447a6c4b57d4f688871e252c08a47b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:26:24 -0600 Subject: [PATCH 085/660] gtypes, math func, types TODO need namespaces --- libdeepgalois/include/deepgalois/gtypes.h | 2 ++ libdeepgalois/include/deepgalois/math_functions.hh | 2 ++ libdeepgalois/include/deepgalois/types.h | 2 ++ 3 files changed, 6 insertions(+) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index e11c1058cc..5278660692 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -2,6 +2,8 @@ #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +// TODO namespace + typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 3e1af207da..b5c51203f8 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -16,6 +16,8 @@ extern "C" { //#include } +// TODO namespace + const float negative_slope = 0; void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index b669a25188..118f04bd04 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -3,6 +3,8 @@ #include #include +// TODO namespace + #ifdef CNN_USE_DOUBLE typedef double float_t; typedef double feature_t; From f1038c7b20ef4c41757e5b7aa52d034fd849df08 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:27:39 -0600 Subject: [PATCH 086/660] lgraph namespace deepgalois --- libdeepgalois/include/deepgalois/lgraph.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 2eb5ec6863..7a86960338 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -8,6 +8,9 @@ #include #include #include + +namespace deepgalois { + typedef unsigned IndexT; typedef float ValueT; @@ -97,4 +100,5 @@ class LGraph { } }; +} // namespace #endif From a79de8096fb4c68757de73db72d8044965d9d18b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:28:49 -0600 Subject: [PATCH 087/660] aggregator deepgalois namespace --- libdeepgalois/include/deepgalois/aggregator.h | 5 +++++ libdeepgalois/src/aggregator.cpp | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h index c54f8f69bc..17a8451aee 100644 --- a/libdeepgalois/include/deepgalois/aggregator.h +++ b/libdeepgalois/include/deepgalois/aggregator.h @@ -2,12 +2,17 @@ #include "deepgalois/types.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" + +namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +} #else #include "graph_gpu.h" +namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const 
float_t* norm_factor); void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +} #endif diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/aggregator.cpp index c2a50710dd..360300dba3 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/aggregator.cpp @@ -1,7 +1,7 @@ #include "deepgalois/aggregator.h" #include "deepgalois/math_functions.hh" -void update_all(size_t len, Graph& g, const float_t* in, float_t* out, +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto& src) { From ef5447550346d613be7ef497b4c78bd7169e01b8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:32:59 -0600 Subject: [PATCH 088/660] removed unused files in deepgalois --- libdeepgalois/include/unused/random.h | 71 --------------------------- libdeepgalois/include/unused/timer.h | 28 ----------- 2 files changed, 99 deletions(-) delete mode 100644 libdeepgalois/include/unused/random.h delete mode 100644 libdeepgalois/include/unused/timer.h diff --git a/libdeepgalois/include/unused/random.h b/libdeepgalois/include/unused/random.h deleted file mode 100644 index b63914bca1..0000000000 --- a/libdeepgalois/include/unused/random.h +++ /dev/null @@ -1,71 +0,0 @@ -// From Caffe library it seems -// TODO get the license from it - -#ifndef RANDOM_H -#define RANDOM_H -typedef boost::mt19937 rng_t; - -// random seeding -int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm " - "to generate seed instead."; - if (f) - fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} - -// This random number generator facade hides boost and CUDA rng -// implementation from one another (for cross-platform compatibility). 
-class RNG { -public: - RNG() : generator_(new Generator()) {} - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) {} - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { - generator_ = other.generator_; - return *this; - } - void* generator() { return static_cast(generator_->rng()); } - -private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; -}; - -std::shared_ptr random_generator_; -inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; -} - -inline rng_t* rng() { return static_cast(rng_stream().generator()); } - -#include -template -void rng_bernoulli(const DataTy p, std::vector& r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator> - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); -} - -#endif diff --git a/libdeepgalois/include/unused/timer.h b/libdeepgalois/include/unused/timer.h deleted file mode 100644 index af01412463..0000000000 --- a/libdeepgalois/include/unused/timer.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef TIMER_H_ -#define TIMER_H_ -#include - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { - return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1e6; - } - double Millisecs() const { - return 1000 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec / 1000; - } - double Microsecs() const { - return 1e6 * elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; - } - -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; -#endif // TIMER_H_ From f9ec7df2c72ade22be05260d314ce6f56ec3c503 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:33:16 -0600 Subject: [PATCH 089/660] removed the gnn directory from experimental (unused) --- lonestar/experimental/gnn/CMakeLists.txt | 16 - lonestar/experimental/gnn/README.md | 60 --- lonestar/experimental/gnn/gnn.cpp | 46 -- lonestar/experimental/gnn/gnn.h | 32 -- lonestar/experimental/gnn/graph_sage.cpp | 41 -- lonestar/experimental/gnn/layers.h | 8 - .../gnn/layers/arithmetic_layer.h | 22 - .../gnn/layers/graph_conv_layer.h | 186 ------- lonestar/experimental/gnn/layers/layer.h | 156 ------ .../experimental/gnn/layers/linear_layer.h | 28 - lonestar/experimental/gnn/layers/relu_layer.h | 24 - .../gnn/layers/softmax_loss_layer.h | 47 -- lonestar/experimental/gnn/lgraph.h | 179 ------- lonestar/experimental/gnn/math_functions.hpp | 500 ------------------ lonestar/experimental/gnn/net.h | 341 ------------ lonestar/experimental/gnn/node.h | 109 ---- lonestar/experimental/gnn/optimizer.h | 221 -------- lonestar/experimental/gnn/random.h | 63 --- lonestar/experimental/gnn/run-citeseer.sh | 1 - lonestar/experimental/gnn/timer.h | 21 - lonestar/experimental/gnn/types.h | 34 -- lonestar/experimental/gnn/utils.h | 119 ----- 22 files changed, 2254 deletions(-) delete mode 100644 lonestar/experimental/gnn/CMakeLists.txt delete mode 100644 lonestar/experimental/gnn/README.md delete mode 100644 lonestar/experimental/gnn/gnn.cpp delete mode 100644 lonestar/experimental/gnn/gnn.h delete mode 100644 
lonestar/experimental/gnn/graph_sage.cpp delete mode 100644 lonestar/experimental/gnn/layers.h delete mode 100644 lonestar/experimental/gnn/layers/arithmetic_layer.h delete mode 100644 lonestar/experimental/gnn/layers/graph_conv_layer.h delete mode 100644 lonestar/experimental/gnn/layers/layer.h delete mode 100644 lonestar/experimental/gnn/layers/linear_layer.h delete mode 100644 lonestar/experimental/gnn/layers/relu_layer.h delete mode 100644 lonestar/experimental/gnn/layers/softmax_loss_layer.h delete mode 100644 lonestar/experimental/gnn/lgraph.h delete mode 100644 lonestar/experimental/gnn/math_functions.hpp delete mode 100644 lonestar/experimental/gnn/net.h delete mode 100644 lonestar/experimental/gnn/node.h delete mode 100644 lonestar/experimental/gnn/optimizer.h delete mode 100644 lonestar/experimental/gnn/random.h delete mode 100755 lonestar/experimental/gnn/run-citeseer.sh delete mode 100644 lonestar/experimental/gnn/timer.h delete mode 100644 lonestar/experimental/gnn/types.h delete mode 100644 lonestar/experimental/gnn/utils.h diff --git a/lonestar/experimental/gnn/CMakeLists.txt b/lonestar/experimental/gnn/CMakeLists.txt deleted file mode 100644 index cff09bb371..0000000000 --- a/lonestar/experimental/gnn/CMakeLists.txt +++ /dev/null @@ -1,16 +0,0 @@ -SET(USE_BLAS ON CACHE BOOL "Use blas") - -SET(BLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(BLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) - -if (USE_BLAS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DWITH_BLAS") - include_directories(${BLAS_INC}) - link_directories(${BLAS_LIB}) -endif() - -app(gnn gnn.cpp) - -if (USE_BLAS) - target_link_libraries(gnn -lopenblas) -endif() diff --git a/lonestar/experimental/gnn/README.md b/lonestar/experimental/gnn/README.md deleted file mode 100644 index 930609763c..0000000000 --- a/lonestar/experimental/gnn/README.md +++ /dev/null @@ -1,60 +0,0 @@ -DESCRIPTION -=========== - -This application does vertex classification in an undirected graph. -It uses graph neural network (GNN) to train the vertex features -which are then used to classify vertices into different classes. - -INPUT -=========== - -The input dataset contains three parts: -1. the input graph file: edgelist format of a |V| x |V| sparse matrix. -2. the vertex label file: |V| lines with each line a integer. -3. the input feature file: edgelist format of |V| x |D| sparse matrix. - -Vertex ids are expected to be sequential integers between 0 and |V|-1. -|V| is the number of vertices. |D| is the dimension of input feature vectors. - -BUILD -=========== - -1. Run cmake at BUILD directory `cd build; cmake -DUSE_EXP=1 ../` - -2. Run `cd /lonestar/experimental/gnn; make -j` - -RUN -=========== - -The following are a few example command lines. 
- -$ export OPENBLAS_NUM_THREADS=28 -$ ./gnn cora -t=1 -k=3 -$ ./gnn citeseer -t=3 -k=30 -$ ./gnn reddit -t=56 -k=3 - -PERFORMANCE -=========== -- I -- I -- I - -REFERENCES -=========== -The GCN model: -Semi-Supervised Classification with Graph Convolutional Networks (ICLR 2017) -http://arxiv.org/abs/1609.02907 -https://github.com/tkipf/gcn - -DGL: -Deep Graph Library: Towards Efficient and Scalable Deep Learning on Graphs -https://arxiv.org/abs/1909.01315 -https://github.com/dmlc/dgl - -GraphSAGE: -Inductive Representation Learning on Large Graphs -http://snap.stanford.edu/graphsage/ - -NeuGraph: Parallel Deep Neural Network Computation on Large Graphs -https://www.usenix.org/conference/atc19/presentation/ma - diff --git a/lonestar/experimental/gnn/gnn.cpp b/lonestar/experimental/gnn/gnn.cpp deleted file mode 100644 index 97cb2620af..0000000000 --- a/lonestar/experimental/gnn/gnn.cpp +++ /dev/null @@ -1,46 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "Graph Convolutional Networks"; -const char* desc = "Graph convolutional neural networks on an undirected graph"; -const char* url = 0; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); - network.construct_layers(); // default setting for now; see its implementation to find how to customize it by the user - network.print_layers_info(); - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - if (do_test) { - // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - MaskList test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; - for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else test_count = read_masks(dataset, "test", test_begin, test_end, test_mask); - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_count, test_mask, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} - diff --git a/lonestar/experimental/gnn/gnn.h b/lonestar/experimental/gnn/gnn.h deleted file mode 100644 index f80dacf4ed..0000000000 --- a/lonestar/experimental/gnn/gnn.h +++ /dev/null @@ -1,32 +0,0 @@ -#ifndef _GNN_H_ -#define _GNN_H_ - -#include "galois/Galois.h" -#include "galois/Reduction.h" -#include "galois/Timer.h" -#include "galois/ParallelSTL.h" -#include "llvm/Support/CommandLine.h" -#include "Lonestar/BoilerPlate.h" -#include "galois/runtime/Profile.h" -#include - -namespace cll = llvm::cl; -static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt filetype(cll::Positional, cll::desc(""), cll::init("gr")); // file format of the input graph -static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt learning_rate("lr", cll::desc("Initial 
learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); -static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); -static cll::opt dropout_rate("d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); -static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); -static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -#define CHUNK_SIZE 256 - -#include "types.h" -#include "utils.h" -#include "net.h" - -#endif diff --git a/lonestar/experimental/gnn/graph_sage.cpp b/lonestar/experimental/gnn/graph_sage.cpp deleted file mode 100644 index b70cdc183c..0000000000 --- a/lonestar/experimental/gnn/graph_sage.cpp +++ /dev/null @@ -1,41 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "GraphSage"; -const char* desc = "A graph neural network variant: GraphSAGE"; -const char* url = 0; - -class GraphSageMean: public graph_conv_layer { - // user-defined combine function -}; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how to customize it by the user - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - //optimizer *opt = new gradient_descent(); - //optimizer *opt = new adagrad(); - optimizer *opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss << " test_acc = " << test_acc << " test_time = " << test_time << "\n"; - Ttest.stop(); - - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} - diff --git a/lonestar/experimental/gnn/layers.h b/lonestar/experimental/gnn/layers.h deleted file mode 100644 index 9650e931a9..0000000000 --- a/lonestar/experimental/gnn/layers.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef _LAYERS_H_ -#define _LAYERS_H_ -#include "layers/relu_layer.h" -#include "layers/linear_layer.h" -#include "layers/arithmetic_layer.h" -#include "layers/graph_conv_layer.h" -#include "layers/softmax_loss_layer.h" -#endif diff --git a/lonestar/experimental/gnn/layers/arithmetic_layer.h b/lonestar/experimental/gnn/layers/arithmetic_layer.h deleted file mode 100644 index aed91e0379..0000000000 --- a/lonestar/experimental/gnn/layers/arithmetic_layer.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once -#include "layer.h" - -// element-wise add N vectors ```y_i = x0_i + x1_i + ... 
+ xnum_i``` -class elementwise_add_layer : public layer { -public: - elementwise_add_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : layer(level, in_dim, out_dim) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("elementwise_add"); } - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < in_data.size(); ++sample) { - for (size_t j = 0; j < in_data[0].size(); j++) - out_data[sample][j] = in_data[sample][j]; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - in_grad = out_grad; - } -}; diff --git a/lonestar/experimental/gnn/layers/graph_conv_layer.h b/lonestar/experimental/gnn/layers/graph_conv_layer.h deleted file mode 100644 index b81f7bc10e..0000000000 --- a/lonestar/experimental/gnn/layers/graph_conv_layer.h +++ /dev/null @@ -1,186 +0,0 @@ -#pragma once -#include "layer.h" - -/* GraphConv Layer - Parameters - ---------- - x: int, number of samples. - y: int, Input feature size. - z: int, Output feature size. - dropout: bool, optional, if True, a dropout operation is applied before other operations. - norm : bool, optional, if True, the normalizer :math:`c_{ij}` is applied. Default: ``True``. - bias : bool, optional, if True, adds a learnable bias to the output. Default: ``False``. - activation: callable activation function/layer or None, optional - If not None, applies an activation function to the updated node features. Default: ``None``. -*/ -class graph_conv_layer: public layer { -public: - graph_conv_layer(unsigned level, Graph *g, bool act, bool norm, bool bias, bool dropout, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), graph(g), act_(act), norm_(norm), bias_(bias), dropout_(dropout) { - assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - //std::cout << name_ << " constructed: act(" << act_ << ") dropout(" << dropout << ")\n"; - init(); - } - void init() { - std::cout << name_ << ": allocating memory for parameters and intermediate data... 
"; - Timer t_alloc; - t_alloc.Start(); - // randomly initialize trainable parameters for conv layers - rand_init_matrix(y, z, W); - //rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); - alloc_grad(); - if (dropout_) { - dropout_mask.resize(x); - for (size_t i = 0; i < x; i++) dropout_mask[i].resize(y); - } - in_temp.resize(x*y); - //for (size_t i = 0; i < x; ++i) in_temp[i].resize(y); - out_temp.resize(x*z); // same as pre_sup in original GCN code: https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py - //for (size_t i = 0; i < x; ++i) out_temp[i].resize(z); - trans_data.resize(y*x); // y*x - //for (size_t i = 0; i < y; ++i) trans_data[i].resize(x); - if (norm_) norm_factor_counting(); - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; - } - graph_conv_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : graph_conv_layer(level, NULL, false, true, false, true, in_dims, out_dims) {} - ~graph_conv_layer() {} - std::string layer_type() const override { return std::string("graph_conv"); } - - // user-defined aggregate function - void aggregate(Graph *g, const vec_t &in, tensor_t &out) { update_all(g, in, out, true, norm_factor); } - - // user-defined combine function - void combine(const vec_t &self, const vec_t &neighbors, const vec_t mat_v, const vec_t mat_u, vec_t &out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(mat_v, self, a); - mvmul(mat_u, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors - } - - void set_context(net_phase ctx) override { phase_ = ctx; } - - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - // input: x*y; W: y*z; output: x*z - // if y > z: - // mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - //Timer t_matmul, t_agg, t_dropout; - //t_matmul.Start(); - if (dropout_ && phase_ == net_phase::train) { - //t_dropout.Start(); - //for (size_t i = 0; i < x; ++i) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(in_data[i], dropout_mask[i], &in_temp[i*y]); - }, galois::loopname("dropout")); - //t_dropout.Stop(); - matmul1D1D(x, z, y, in_temp, W, out_temp); // x*y; y*z; x*z - } else matmul2D1D(z, in_data, W, out_temp); // x*y; y*z; x*z - //t_matmul.Stop(); - //t_agg.Start(); - aggregate(graph, out_temp, out_data); // aggregate - //t_agg.Stop(); - if (act_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - relu(out_data[i], out_data[i]); - }, galois::loopname("relu")); - } - //double dropout_time = 0; - //if (dropout_ && phase_ == net_phase::train) dropout_time = t_dropout.Millisecs(); - //std::cout << "\n\t" << name_ << " matmul time: " << t_matmul.Millisecs() - // << ", aggregation time: " << t_agg.Millisecs() << ", dropout time: " << dropout_time << "\n"; - } - - // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - if (act_) { - //for (size_t j = 0; j < z; ++j) - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - for (size_t j = 0; j < z; ++j) - //if (out_data[i][j] <= 0.0) out_temp[i][j] = 0.0; - out_temp[i*z+j] = out_data[i][j] > float_t(0) ? 
out_grad[i][j] : float_t(0); - }, galois::loopname("d_relu")); - //} else out_temp = out_grad; // TODO: avoid copying - } else copy2D1D(out_grad, out_temp); - if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z*y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, trans_W, in_temp); // x*z; z*y -> x*y - update_all(graph, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y NOTE: since graph is symmetric, the derivative is the same - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(in_grad[i], dropout_mask[i], in_grad[i]); - }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); - } - } - - // calculate weight gradients - transpose2D1D(in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, weight_grad); // y*x; x*z; y*z - } - - void degree_counting() { - assert(x == graph->size()); - degrees.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - degrees[v] = std::distance(graph->edge_begin(v), graph->edge_end(v)); - }, galois::loopname("DegreeCounting")); - } - - // for each vertex v, compute pow(|N(v)|, -0.5), where |N(v)| is the degree of v - void norm_factor_counting() { - degree_counting(); - norm_factor.resize(x); - galois::do_all(galois::iterate((size_t)0, x), [&] (auto v) { - float_t temp = std::sqrt(float_t(degrees[v])); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); - } - -private: - Graph *graph; - bool act_; // whether to use activation function at the end - bool norm_; // whether to normalize data - bool bias_; // whether to add bias afterwards - bool dropout_; // whether to use dropout at first - net_phase phase_; - size_t x; - size_t y; - size_t z; - vec_t out_temp; - vec_t in_temp; - vec_t trans_data; // y*x - std::vector degrees; - std::vector norm_factor; // normalization constant based on graph structure - std::vector > dropout_mask; - - // Glorot & Bengio (AISTATS 2010) init - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - auto init_range = sqrt(6.0/(dim_x + dim_y)); - std::default_random_engine rng; - std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t &matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i*dim_y+j] = 0; - } - } -}; diff --git a/lonestar/experimental/gnn/layers/layer.h b/lonestar/experimental/gnn/layers/layer.h deleted file mode 100644 index 4a8a545738..0000000000 --- a/lonestar/experimental/gnn/layers/layer.h +++ /dev/null @@ -1,156 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include "../node.h" -#include "../types.h" -#include "../utils.h" -#include "../optimizer.h" -#include "../math_functions.hpp" -/** - * base class of all kind of NN layers - * - * sub-class should override these methods: - * - forward_propagation ... body of forward-pass calculation - * - back_propagation ... body of backward-pass calculation - * - in_shape ... specify input data shapes - * - out_shape ... specify output data shapes - * - layer_type ... 
name of layer - **/ - -class layer : public node { -public: - layer(unsigned level, std::vector in_dims, std::vector out_dims) : - node(in_dims.size(), out_dims.size()), - level_(level), begin_(0), end_(0), num_dims(in_dims.size()), - input_dims(in_dims), output_dims(out_dims) { add_edge(); } - virtual ~layer() = default; - virtual void forward_propagation(const tensor_t &in_data, tensor_t &out_data) = 0; - virtual void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) = 0; - virtual std::string layer_type() const = 0; - virtual void set_context(net_phase ctx) {} - //virtual void setup(Graph *g, vec_t *diff, LabelList *lab) = 0; - - void set_trainable(bool trainable) { trainable_ = trainable; } - bool trainable() const { return trainable_; } - void set_name(std::string name) { name_ = name; } - std::string get_name() { return name_; } - void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() - << " input[" << input_dims[0] << "," << input_dims[1] - << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; - } - virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, MaskList &masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; - masks_ = masks; - } - void set_in_data(tensor_t data) { - prev_ = std::make_shared(this, input_dims[1]); - prev_->get_data() = data; - prev_->get_gradient().resize(input_dims[0]); - // allocate memory for intermediate gradients - //std::cout << "l0 in_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - for (size_t i = 0; i < input_dims[0]; ++i) - prev_->get_gradient()[i].resize(input_dims[1]); - } - void add_edge() { - // add an outgoing edge - next_ = std::make_shared(this, output_dims[1]); - // allocate memory for intermediate feature vectors - next_->get_data().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_data()[i].resize(output_dims[1]); - } - void alloc_grad() { - // allocate memory for intermediate gradients - //std::cout << "l" << level_ << " out_grad alloc: x=" << output_dims[0] << ", y=" << output_dims[1] << "\n"; - next_->get_gradient().resize(output_dims[0]); - for (size_t i = 0; i < output_dims[0]; ++i) - next_->get_gradient()[i].resize(output_dims[1]); - } - void forward() { - forward_propagation(prev()->get_data(), next()->get_data()); - } - void backward() { - back_propagation(prev()->get_data(), next()->get_data(), next()->get_gradient(), prev()->get_gradient()); - } - void update_weight(optimizer *opt) { - //std::cout << "[debug] " << name_ << ": updating weight...\n"; - // parallelize only when target size is big enough to mitigate thread spawning overhead. 
- bool parallel = (W.size() >= 512); - //vec_t diff; - //prev()->merge_grads(&diff); - //auto in_data = prev()->get_data(); - //float_t rcp_batch_size = float_t(1.0) / in_data.size(); - //for (size_t i = 0; i < diff.size(); ++i) - // diff[i] *= rcp_batch_size; - opt->update(weight_grad, W, parallel); // W += grad - prev()->clear_grads(); - } - inline acc_t get_masked_loss() { - //acc_t total_loss = acc_t(0); - //size_t valid_sample_count = 0; - AccumF total_loss; - AccumU valid_sample_count; - total_loss.reset(); - valid_sample_count.reset(); - //for (size_t i = begin_; i < end_; i ++) { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; - } - -protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - MaskList masks_; // masks to show which samples are valid - size_t num_dims; // number of dimensions - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - vec_t loss; // error for each vertex: N x 1 -}; - -// head: layer i+1, tail: layer i -inline void connect(layer *head, layer *tail, - size_t head_index = 0, size_t tail_index = 0) { - //auto out_shape = head->out_shape()[head_index]; - //auto in_shape = tail->in_shape()[tail_index]; - //head->setup(false); - //if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - //if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - //if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); - tail->prev_ = head->next_; - tail->prev_->add_next_node(tail); -} - diff --git a/lonestar/experimental/gnn/layers/linear_layer.h b/lonestar/experimental/gnn/layers/linear_layer.h deleted file mode 100644 index e4ff524f3f..0000000000 --- a/lonestar/experimental/gnn/layers/linear_layer.h +++ /dev/null @@ -1,28 +0,0 @@ -#pragma once -#include "layer.h" - -class linear_layer : public layer { -public: - linear_layer(unsigned level, float_t scale, float_t bias, - std::vector in_dims, std::vector out_dims) : - layer(level, in_dims, out_dims), scale_(scale), bias_(bias) { - trainable_ = false; } - linear_layer(unsigned level, std::vector in_dim, - std::vector out_dim) : linear_layer(level, 1.0, 0.0, in_dim, out_dim) { } - std::string layer_type() const override { return "linear"; } - - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) { - for (size_t i = 0; i < input_dims[1]; i ++) - out_data[sample][i] = scale_ * in_data[sample][i] + bias_; - } - } - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override { - for (size_t sample = 0; sample < input_dims[0]; ++sample) - for (size_t i = 0; i < input_dims[1]; i++) - in_grad[sample][i] = out_grad[sample][i] * scale_; - 
} -protected: - float_t scale_, bias_; -}; diff --git a/lonestar/experimental/gnn/layers/relu_layer.h b/lonestar/experimental/gnn/layers/relu_layer.h deleted file mode 100644 index 389e6b3c1f..0000000000 --- a/lonestar/experimental/gnn/layers/relu_layer.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once -#include "layer.h" - -// ReLU Layer -class relu_layer : public layer { -public: - relu_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } - std::string layer_type() const override { return std::string("relu"); } - // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0) + - negative_slope * std::min(in_data[i][j], (float_t)0); - }, galois::chunk_size(), galois::steal(), galois::loopname("relu_layer-fw")); - } - // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) - // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, - tensor_t &out_grad, tensor_t &in_grad) override {} -}; diff --git a/lonestar/experimental/gnn/layers/softmax_loss_layer.h b/lonestar/experimental/gnn/layers/softmax_loss_layer.h deleted file mode 100644 index bdd52e4d38..0000000000 --- a/lonestar/experimental/gnn/layers/softmax_loss_layer.h +++ /dev/null @@ -1,47 +0,0 @@ -#pragma once -#include "layer.h" - -class softmax_loss_layer: public layer { -public: - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims, LabelList *lab) - : layer(level, in_dims, out_dims), labels(lab) { - trainable_ = false; - loss.resize(in_dims[0]); // error for each sample - name_ = layer_type() + "_" + std::to_string(level); - } - softmax_loss_layer(unsigned level, std::vector in_dims, - std::vector out_dims) : - softmax_loss_layer(level, in_dims, out_dims, NULL) {} - ~softmax_loss_layer() {} - std::string layer_type() const override { return std::string("softmax_loss"); } - - // TODO: need kernel fusion optimization - // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] - void forward_propagation(const tensor_t &in_data, tensor_t &out_data) override { - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked - softmax(in_data[i], out_data[i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; // one-hot - loss[i] = cross_entropy(y, out_data[i]); - } - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); - } - - void back_propagation(const tensor_t &in_data, const tensor_t &out_data, tensor_t &out_grad, tensor_t &in_grad) override { - //std::cout << name_ << " backward: x=" << in_grad.size() << ", y=" << in_grad[0].size() << "\n"; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - vec_t norm_grad(output_dims[1]); - std::vector y(output_dims[1], 0.0); // ground truth - y[(*labels)[i]] = 1.0; - d_cross_entropy(y, out_data[i], norm_grad); - d_softmax(in_data[i], out_data[i], in_grad[i], norm_grad); - }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); - } - -private: - LabelList *labels; -}; - diff --git a/lonestar/experimental/gnn/lgraph.h b/lonestar/experimental/gnn/lgraph.h deleted file mode 
100644 index 78f6f76aec..0000000000 --- a/lonestar/experimental/gnn/lgraph.h +++ /dev/null @@ -1,179 +0,0 @@ -#ifndef __LGRAPH_HPP__ -#define __LGRAPH_HPP__ - -//defines the Learning Graph (LGraph) data structure -#include -#include -#include -#include -#include -#include -typedef unsigned IndexT; -typedef float ValueT; - -struct Edge { - IndexT src; - IndexT dst; - ValueT elabel; - Edge() : src(0), dst(0), elabel(0) {} - Edge(IndexT from, IndexT to, ValueT el) : - src(from), dst(to), elabel(el) {} - std::string to_string() const { - std::stringstream ss; - ss << "e(" << src << "," << dst << "," << elabel << ")"; - return ss.str(); - } -}; -typedef std::vector EdgeList; - -class LGraph { -public: - LGraph() : symmetrize_(false), directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - delete[] weight_; - degrees.clear(); - el.clear(); - //labels_.clear(); - //vertices.clear(); - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT * out_rowptr() const { return rowptr_; } - IndexT * out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n+1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - ValueT get_weight(IndexT n) { return weight_[n]; } - unsigned get_max_degree() { return max_degree; } - //ValueT * labels() { return labels_.data(); } - //ValueT get_label(IndexT n) { return labels_[n]; } - void read_edgelist(const char *filename, bool symmetrize = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - IndexT max_vid = 0; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - el.push_back(Edge(u, v, 1)); - if (symmetrize) el.push_back(Edge(v, u, 1)); - if (u > max_vid) max_vid = u; - if (v > max_vid) max_vid = v; - } - in.close(); - directed_ = true; - num_vertices_ = max_vid+1; - num_edges_ = el.size(); - std::cout << "num_vertices_ " << num_vertices_ << " num_edges_ " << num_edges_ << "\n"; - MakeGraphFromEL(); - } - -private: - EdgeList el; - bool symmetrize_; // whether to symmetrize a directed graph - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT *rowptr_; - IndexT *colidx_; - ValueT *weight_; - unsigned max_degree; - std::vector degrees; - std::vector labels_; - std::vector > vertices; - - static bool compare_id(Edge a, Edge b) { return (a.dst < b.dst); } - - void MakeGraphFromEL() { - SquishGraph(); - MakeCSR(false); - } - - void SquishGraph(bool remove_selfloops = true, bool remove_redundents = true) { - std::vector neighbors; - for (size_t i = 0; i < num_vertices_; i++) - vertices.push_back(neighbors); - for (size_t i = 0; i < num_edges_; i ++) - vertices[el[i].src].push_back(el[i]); - el.clear(); - printf("Sorting the neighbor lists..."); - for (size_t i = 0; i < num_vertices_; i ++) - std::sort(vertices[i].begin(), vertices[i].end(), compare_id); - printf(" Done\n"); - //remove self loops - int num_selfloops = 0; - if(remove_selfloops) { - printf("Removing self loops..."); - for(size_t i = 0; i < num_vertices_; i ++) { - for(unsigned j = 0; j < vertices[i].size(); j ++) { - if(i == vertices[i][j].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_selfloops ++; - j --; - } - } - } - printf(" %d selfloops are removed\n", num_selfloops); - num_edges_ -= num_selfloops; - } - // remove redundent - int 
num_redundents = 0; - if(remove_redundents) { - printf("Removing redundent edges..."); - for (size_t i = 0; i < num_vertices_; i ++) { - for (unsigned j = 1; j < vertices[i].size(); j ++) { - if (vertices[i][j].dst == vertices[i][j-1].dst) { - vertices[i].erase(vertices[i].begin()+j); - num_redundents ++; - j --; - } - } - } - printf(" %d redundent edges are removed\n", num_redundents); - num_edges_ -= num_redundents; - } - } - - void MakeCSR(bool transpose) { - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i ++) - degrees[i] = vertices[i].size(); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - - assert(num_edges_ == offsets[num_vertices_]); - weight_ = new ValueT[num_edges_]; - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_+1]; - for (size_t i = 0; i < num_vertices_+1; i ++) rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i ++) { - for (auto it = vertices[i].begin(); it < vertices[i].end(); it ++) { - Edge e = *it; - assert(i == e.src); - if (symmetrize_ || (!symmetrize_ && !transpose)) { - weight_[offsets[e.src]] = e.elabel; - colidx_[offsets[e.src]++] = e.dst; - } - if (symmetrize_ || (!symmetrize_ && transpose)) { - weight_[offsets[e.dst]] = e.elabel; - colidx_[offsets[e.dst]++] = e.src; - } - } - } - } -}; - -#endif diff --git a/lonestar/experimental/gnn/math_functions.hpp b/lonestar/experimental/gnn/math_functions.hpp deleted file mode 100644 index 8791416441..0000000000 --- a/lonestar/experimental/gnn/math_functions.hpp +++ /dev/null @@ -1,500 +0,0 @@ -#ifndef _MATH_FUNCTIONS_ -#define _MATH_FUNCTIONS_ -#include -#include "utils.h" -#include - -#ifdef WITH_BLAS -extern "C" { -#include -//#include -} -#endif - -const float negative_slope = 0; - -// vector add -template -inline void vadd(const std::vector &a, const std::vector &b, std::vector &out) { - //for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -template -inline void vadd(size_t n, const DataTy *a, const DataTy *b, DataTy *out) { - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -// vector subtract -template -inline void vsub(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] - in_b[i]; -} - -// vector multiply -template -inline void vmul(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = in_a[i] * in_b[i]; -} - -// vector divide -template -inline void vdiv(const std::vector &in_a, const std::vector &in_b, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } -} - -// vector add scalar -template -inline void add_scalar(const DataTy alpha, std::vector &Y) { - for 
(size_t i = 0; i < Y.size(); ++i) Y[i] += alpha; -} - -// vector subtract scalar -template -inline void sub_scalar(const DataTy alpha, std::vector &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] -= alpha; -} - -// vector multiply scalar -template -inline void mul_scalar(const DataTy alpha, std::vector &Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; -} - -template -inline void mul_scalar(size_t n, const DataTy alpha, const DataTy *in, DataTy *out) { - for (size_t i = 0; i < n; ++i) out[i] = alpha *in[i]; -} - -// vector divide scalar -template -inline void div_scalar(const DataTy alpha, std::vector &Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) Y[i] /= alpha; -} - -// dot product -template -inline DataTy dot(const std::vector &x, const std::vector &y) { - DataTy sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - -// matrix-vector multiply -inline void mvmul(const vec_t &matrix, const vec_t &in_vector, vec_t &out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i*n+j] * in_vector[j]; - } - } -} - -// vector-vector multiply -inline void vvmul(const vec_t &a, const vec_t &b, tensor_t &out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } -} - -// matrix addition -inline void matadd(size_t x, size_t y, const tensor_t &A, const tensor_t &B, tensor_t &C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; -} - -// TODO: vectorize -template -inline void copy2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); -#ifdef WITH_BLAS - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -#else - assert(out.size() == x*y); - for (size_t i = 0; i < x; i ++) { - for (size_t j = 0; j < y; j ++) { - out[i*y+j] = in[i][j]; - } - } -#endif -} - -// matrix multiply: all 2D -inline void matmul2D(const tensor_t &A, const tensor_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - -inline void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const vec_t &A, const vec_t &B, vec_t &C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); -#ifdef WITH_BLAS - const int M = dim_x; - const int N = dim_y; - const int K = dim_z; - const float alpha = 1.0; - const float beta = 0.0; - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, &A[0], lda, &B[0], ldb, beta, &C[0], N); -#else - //std::cout << "using naive matmul, slow\n"; - assert(A.size() == dim_x*dim_z); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i*dim_z+k] * B[k*dim_y+j]; - } - } - } -#endif - Tmatmul.stop(); -} - -inline void matmul2D1D(const size_t dim_y, const tensor_t &A, const vec_t &B, vec_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z*dim_y); - assert(C.size() == dim_x*dim_y); - -#ifdef WITH_BLAS - vec_t A1D(dim_x*dim_z); - copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C); -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i*dim_y+j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i*dim_y+j] += A[i][k] * B[k][j]; - } - } - } -#endif -} - -// matrix multiply -inline void matmul(const tensor_t &A, const vec_t &B, tensor_t &C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y*dim_z); - -#ifdef WITH_BLAS - vec_t A1D(dim_x*dim_z); - vec_t C1D(dim_x*dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - matmul1D1D(dim_x, dim_y, dim_z, A1D, B, C1D); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i*dim_y+j]; - } - } -#else - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k*dim_y+j]; - } - } - } -#endif -} - -template -inline void transpose2D(const tensor_t &in, tensor_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i][j] = in[j][i]; - } - } -} - -// TODO: vectorize -template -inline void transpose2D1D(const tensor_t &in, vec_t &out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x*y); - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j][i]; - } - } -} - -template -inline void transpose(size_t x, size_t y, const vec_t &in, vec_t &out) { - for (size_t i = 0; i < y; i ++) { - for (size_t j = 0; j < x; j ++) { - out[i*x+j] = in[j*y+i]; - } - } -} - -template -inline int argmax(const size_t n, const std::vector &x) { - DataTy max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -inline void clear(vec_t &in) { - for (size_t i = 0; i < in.size(); i++) in[i] = 0; -} - -inline void update_all(Graph *g, const tensor_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); // TODO: vectorize clear - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor = in[dst]; - mul_scalar(b, neighbor); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(out[src], in[dst], out[src]); // out[src] += in[dst] - } - }, 
galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -inline void update_all(Graph *g, const vec_t &in, tensor_t &out, bool norm, const vec_t &norm_factor) { - size_t len = out[0].size(); - galois::do_all(galois::iterate(g->begin(), g->end()), [&](const auto& src) { - clear(out[src]); - float_t a = 0.0, b = 0.0; - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g->edges(src)) { - const auto dst = g->getEdgeDst(e); - if (norm) { - b = a * norm_factor[dst]; - vec_t neighbor(len); - mul_scalar(len, b, &in[dst*len], neighbor.data()); - vadd(out[src], neighbor, out[src]); // out[src] += in[dst] - } else vadd(len, out[src].data(), &in[dst*len], out[src].data()); // out[src] += in[dst] - } - }, galois::chunk_size(), galois::steal(), galois::loopname("update_all")); -} - -template -inline void relu(const std::vector &in, std::vector &out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (DataTy)0) + negative_slope * std::min(in[i], (DataTy)0); - } -} - -template -inline void d_relu(const std::vector &in_diff, const std::vector &fv, std::vector &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (DataTy)0) + negative_slope * (fv[i] <= (DataTy)0)); - } -} - -inline void d_mvmul(vec_t &in_diff, vec_t &h_in, tensor_t &out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff -} - -inline void d_vadd(vec_t &in_diff, vec_t &out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; -} - -template -inline float reduce_mean(const std::vector &x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; -} - -const float scale_ = 1. / (1. - dropout_rate); - -inline void dropout(const vec_t &in, std::vector &mask, vec_t &out) { - assert(mask.size() == out.size()); - //rng_bernoulli(1. - dropout_rate, mask); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; -} - -inline void dropout(const vec_t &in, std::vector &mask, float_t *out) { - for (size_t i = 0; i < in.size(); ++i) - mask[i] = bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * mask[i] * scale_; -} - -inline void d_dropout(const vec_t &in_diff, std::vector &mask, vec_t &out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * mask[i] * scale_; -} - -template -inline DataTy sigmoid_func(DataTy x) { - return 0.5 * tanh(0.5 * x) + 0.5; -} - -// Sigmoid -template -inline void sigmoid(std::vector &fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } -} - -// Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add upto 1. 
-// To make softmax func numerically stable, we simply normalize the values in the vector, -// by multiplying the numerator and denominator with a constant C, where log(C)=-max(X) -// exps = np.exp(X - np.max(X)) -// exps / np.sum(exps) -template -inline void softmax(const std::vector &input, std::vector &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -template -inline void log_softmax(const std::vector &input, std::vector &output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability distribution, -// we often use it as the final layer in neural networks. -// For this we need to calculate the derivative or gradient, -// and pass it back to the previous layer during backpropagation. -template -inline void d_softmax(const std::vector &y, const std::vector &p, - std::vector &dy, const std::vector &dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - //DataTy delta_ij = i == j? 1 : 0; - //df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -/* - for (size_t j = 0; j < x.size(); j++) { - for (size_t k = 0; k < x.size(); k++) { - df[k] = (k == j) ? y[j] * (float_t(1) - y[j]) : -y[k] * y[j]; - } - dx[j] = vectorize::dot(&dy[0], &df[0], len); - } -*/ -} - -// cross-entropy loss function for multi-class classification -// y: ground truth -// p: predicted probability -template -inline DataTy cross_entropy(const std::vector &y, const std::vector &p) { - auto n = y.size(); - assert(n > 0); - DataTy loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) continue; - if (p[i] == float_t(0)) loss -= y[i] * std::log(float_t(1e-10)); - //if (p[i]==float_t(1)) loss -= (float_t(1) - y[i]) * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]);// + (float_t(1) - y[i]) * std::log(float_t(1) - p[i]); - //loss -= y[i] * std::log(p[i]); - } - return loss; -} - -template -inline void d_cross_entropy(const std::vector &y, const std::vector &p, std::vector &d) { - auto n = y.size(); - //for (size_t i = 0; i < n; i++) d[i] = (p[i] - y[i]) / (p[i] * (float_t(1) - p[i])); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - //d[i] = p[i] - y[i]; - } -} - -#endif diff --git a/lonestar/experimental/gnn/net.h b/lonestar/experimental/gnn/net.h deleted file mode 100644 index fac7caee00..0000000000 --- a/lonestar/experimental/gnn/net.h +++ /dev/null @@ -1,341 +0,0 @@ -#ifndef _MODEL_H_ -#define _MODEL_H_ - -#include -#include "gnn.h" -#include "lgraph.h" -#include "layers.h" -#include "optimizer.h" - -#define NUM_CONV_LAYERS 2 - -// N: number of vertices, D: feature vector dimentions, -// E: number of distinct labels, i.e. 
number of vertex classes -// layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) -// layer 2: features N x 16, weights 16 x E, out N x E -class Net { -public: - Net() {} - - // user-defined aggregate function - virtual void aggregate(Graph *g, size_t dim, const tensor_t &in_feats, tensor_t &out_feats) {} - - // user-defined combine function - virtual void combine(const vec_t ma, const vec_t mb, const vec_t &a, const vec_t &b, vec_t &out) {} - - void init() { - assert(dropout_rate < 1.0); - read_graph(dataset, g); - n = g.size(); // N - labels.resize(n, 0); // label for each vertex: N x 1 - num_classes = read_labels(dataset, labels); - - std::cout << "Reading label masks ... "; - train_mask.resize(n, 0); - val_mask.resize(n, 0); - if (dataset == "reddit") { - train_begin = 0, train_count = 153431, train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; - } else { - train_count = read_masks(dataset, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset, "val", val_begin, val_end, val_mask); - } - std::cout << "Done\n"; - - num_layers = NUM_CONV_LAYERS + 1; - feature_dims.resize(num_layers + 1); - input_features.resize(n); // input embedding: N x D - feature_dims[0] = read_features(dataset, input_features); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - feature_dims[3] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - } - size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } - size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id+1]; } - size_t get_nnodes() { return n; } - size_t get_nedges() { return g.sizeEdges(); } - size_t get_ft_dim() { return feature_dims[0]; } - size_t get_nclasses() { return num_classes; } - size_t get_label(size_t i) { return labels[i]; } - void construct_layers() { - std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer - layers[0]->set_in_data(input_features); // feed input data - } - - void set_netphase(net_phase phase) { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->set_context(phase); - } - - void print_layers_info() { - for (size_t i = 0; i < num_layers; i ++) - layers[i]->print_layer_info(); - } - - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true) { - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, &g, act, norm, bias, dropout, in_dims, out_dims); - if(layer_id > 0) connect(layers[layer_id-1], layers[layer_id]); - } - - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = n; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims, &labels); - connect(layers[layer_id-1], layers[layer_id]); - } - - // forward propagation: [begin, end) is the range of samples used. 
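// ---------------------------------------------------------------------------
// Illustrative sketch (not from the original patch): the (begin, end, count,
// masks) convention used by fprop()/evaluate() below. masks holds one entry
// per node, only nodes in [begin, end) whose mask is 1 contribute to the
// masked loss/accuracy, and count is the number of such nodes. A minimal,
// self-contained helper in that spirit; the helper name and the plain
// std::vector<uint8_t> mask type are assumptions, and the caller must ensure
// masks.size() >= end.
#include <cstddef>
#include <cstdint>
#include <vector>

inline size_t build_range_mask(size_t begin, size_t end,
                               std::vector<uint8_t>& masks) {
  size_t count = 0;
  for (size_t i = begin; i < end; i++) {
    masks[i] = 1; // node i belongs to this split (train/val/test)
    count++;
  }
  return count; // denominator used when averaging the masked loss/accuracy
}
// ---------------------------------------------------------------------------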
- acc_t fprop(size_t begin, size_t end, size_t count, MaskList &masks) { - // set mask for the last layer - layers[num_layers-1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i ++) - layers[i]->forward(); - return layers[num_layers-1]->get_masked_loss(); - } - - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i --) - layers[i-1]->backward(); - } - - // update trainable weights after back-propagation - void update_weights(optimizer *opt) { - for (size_t i = 0; i < num_layers; i ++) - if (layers[i]->trainable()) layers[i]->update_weight(opt); - } - - // evaluate, i.e. inference or predict - double evaluate(size_t begin, size_t end, size_t count, MaskList &masks, acc_t &loss, acc_t &acc) { - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); - t_eval.Stop(); - return t_eval.Millisecs(); - } - - // training - void train(optimizer *opt) { - std::cout << "\nStart training...\n"; - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - Timer t_epoch; - // run epoches - for (size_t i = 0; i < epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; - t_epoch.Start(); - - // training steps - set_netphase(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - Tfw.start(); - train_loss = fprop(train_begin, train_end, train_count, train_mask); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, train_mask); // predict - Tfw.stop(); - Tbw.start(); - bprop(); // back propogation - Tbw.stop(); - Tupdate.start(); - update_weights(opt); // update parameters - Tupdate.stop(); - set_netphase(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss << " train_acc = " << std::setw(5) << train_acc; - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - - if (do_validate) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, val_mask, val_loss, val_acc); - Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time << " ms (train_time = " << epoch_time << " val_time = " << val_time << ")\n"; - } else { - std::cout << " train_time = " << epoch_time << " ms\n"; - } - } - } - -protected: - size_t n; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - std::vector feature_dims; // feature dimnesions for each layer - - Graph g; // the input graph, |V| = N - tensor_t input_features; // input features: N x D - std::vector labels; // labels for classification: N x 1 - MaskList train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - - std::vector layers; // all the layers in the neural network - /* - inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); - } - //*/ - - // labels contain the ground truth (e.g. vertex classes) for each example (num_examples x 1). 
- // Note that labels is not one-hot encoded vector and it can be computed - // as y.argmax(axis=1) from one-hot encoded vector (y) of labels if required. - size_t read_labels(std::string dataset_str, LabelList &labels) { - std::cout << "Reading labels ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == labels.size()); // number of vertices - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < n; ++idx) { - label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; - } - } - v ++; - } - in.close(); - t_read.Stop(); - // number of vertex classes - std::cout << "Done, unique label counts: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - size_t read_features(std::string dataset_str, tensor_t &feats) { - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - assert(m == feats.size()); // m = number of vertices - for (size_t i = 0; i < m; ++i) { - feats[i].resize(n); - for (size_t j = 0; j < n; ++j) - feats[i][j] = 0; - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - feats[u][v] = w; - } - /* - for (size_t i = 0; i < 10; ++i) - for (size_t j = 0; j < n; ++j) - if (feats[i][j] > 0) - std::cout << "feats[" << i << "][" << j << "]: " << feats[i][j] << std::endl; - //*/ - in.close(); - t_read.Stop(); - std::cout << "Done, feature dimention: " << n << ", time: " << t_read.Millisecs() << " ms\n"; - return n; - } - - unsigned read_graph(std::string dataset_str, Graph &graph) { - //printf("Start readGraph\n"); - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - LGraph lgraph; - unsigned max_degree = 0; - if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - lgraph.read_edgelist(filename.c_str(), true); //symmetrize - genGraph(lgraph, graph); - } else if (filetype == "gr") { - std::string filename = path + dataset_str + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - galois::graphs::readGraph(graph, filename); - /* - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - graph.getData(vid) = 1; - //for (auto e : graph.edges(n)) graph.getEdgeData(e) = 1; - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("assignVertexLabels")); - std::vector degrees(graph.size()); - galois::do_all(galois::iterate(graph.begin(), graph.end()), [&](const auto& vid) { - degrees[vid] = std::distance(graph.edge_begin(vid), graph.edge_end(vid)); - }, galois::loopname("computeMaxDegree")); - max_degree = *(std::max_element(degrees.begin(), degrees.end())); - */ - } else { printf("Unkown file format\n"); exit(1); } - if (filetype != "gr") { - max_degree = lgraph.get_max_degree(); - lgraph.clean(); - } - printf("max degree = %u\n", max_degree); - Tread.stop(); - //printf("Done readGraph\n"); - std::cout << "num_vertices " << g.size() << " num_edges " << g.sizeEdges() << "\n"; - return max_degree; - } - - void genGraph(LGraph &lg, Graph &g) { - g.allocateFrom(lg.num_vertices(), 
lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i+1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset ++) - g.constructEdge(offset, lg.get_dest(offset), 0); // do not consider edge labels now - } - } - - inline acc_t masked_accuracy(size_t begin, size_t end, size_t count, MaskList &masks) { - // comparing outputs with the ground truth (labels) - //acc_t accuracy_all = 0.0; - AccumF accuracy_all; - accuracy_all.reset(); - //for (size_t i = begin; i < end; i++) { - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - int prediction = argmax(num_classes, layers[NUM_CONV_LAYERS-1]->next()->get_data()[i]); - if ((label_t)prediction == labels[i]) accuracy_all += 1.0; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - //} - return accuracy_all.reduce() / (acc_t)count; - } -}; - -#endif diff --git a/lonestar/experimental/gnn/node.h b/lonestar/experimental/gnn/node.h deleted file mode 100644 index deffebad9b..0000000000 --- a/lonestar/experimental/gnn/node.h +++ /dev/null @@ -1,109 +0,0 @@ -#pragma once -#include -class node; -class layer; -class edge; - -typedef std::shared_ptr edgeptr_t; - -// node data structure -class node : public std::enable_shared_from_this { -public: - node(size_t in_size, size_t out_size) {}//: prev_(in_size), next_(out_size) {} - virtual ~node() {} - const edgeptr_t prev() const { return prev_; } - //const std::vector &prev() const { return prev_; } - const edgeptr_t next() const { return next_; } - //const std::vector &next() const { return next_; } - //std::vector prev_nodes() const; - //std::vector next_nodes() const; - -protected: - node() = delete; - friend void connect(layer *head, layer *tail, size_t head_index, size_t tail_index); - //mutable std::vector prev_; - //mutable std::vector next_; - mutable edgeptr_t prev_; - mutable edgeptr_t next_; -}; - -// edges manage the input/output data and gradients between nodes -class edge { -public: - edge(node *prev, size_t len) : - ft_dim_(len), - data_({vec_t(len)}), - grad_({vec_t(len)}), - prev_(prev) {} - - void merge_grads(vec_t *dst) { - assert(!grad_.empty()); - const auto &grad_head = grad_[0]; - size_t sz = grad_head.size(); - dst->resize(sz); - float_t *pdst = &(*dst)[0]; - std::copy(grad_head.begin(), grad_head.end(), pdst); - // @todo consider adding parallelism and vectorization - for (size_t sample = 1; sample < grad_.size(); ++sample) { - for (size_t i = 0; i < sz; i++) - pdst[i] += grad_[sample][i]; - //vectorize::reduce(&grad_[sample][0], sz, pdst); - } - } - void clear_grads() { - for (size_t sample = 0; sample < grad_.size(); ++sample) { - auto &g = grad_[sample]; - std::fill(g.begin(), g.end(), 0.0); // TODO: need vectorize - //vectorize::fill(&g[0], g.size(), float_t{0}); - } - } - - tensor_t *get_data_ptr() { return &data_; } - tensor_t &get_data() { return data_; } - //const tensor_t *get_data() const { return &data_; } - const tensor_t &get_data() const { return data_; } - //tensor_t *get_gradient() { return &grad_; } - tensor_t &get_gradient() { return grad_; } - //const tensor_t *get_gradient() const { return &grad_; } - const tensor_t &get_gradient() const { return grad_; } - - //const std::vector &next() const { return next_; } - const node *next() const { return next_; } - node *prev() { return prev_; } - const node *prev() const { return prev_; } - 
//const shape3d &shape() const { return shape_; } - //vector_type vtype() const { return vtype_; } - //void add_next_node(node *next) { next_.push_back(next); } - void add_next_node(node *next) { next_ = next; } -private: - //shape3d shape_; - size_t ft_dim_; - //vector_type vtype_; - tensor_t data_; - tensor_t grad_; - node *prev_; // previous node, "producer" of this tensor - node *next_; // next node, "consumer" of this tensor - //std::vector next_; // next nodes, "consumers" of this tensor -}; -/* -inline std::vector node::prev_nodes() const { - std::vector vecs; - for (auto &e : prev_) { - if (e && e->prev()) { - vecs.insert(vecs.end(), e->prev()); - } - } - return vecs; -} - -inline std::vector node::next_nodes() const { - std::vector vecs; - for (auto &e : next_) { - if (e) { - auto n = e->next(); - vecs.insert(vecs.end(), n.begin(), n.end()); - } - } - return vecs; -} -*/ diff --git a/lonestar/experimental/gnn/optimizer.h b/lonestar/experimental/gnn/optimizer.h deleted file mode 100644 index 2896881fed..0000000000 --- a/lonestar/experimental/gnn/optimizer.h +++ /dev/null @@ -1,221 +0,0 @@ -#pragma once - -#include -#include -#include "types.h" - -// base class of optimizer -// usesHessian : true if an optimizer uses hessian (2nd order derivative of loss function) -struct optimizer { - optimizer() = default; - optimizer(const optimizer &) = default; - optimizer(optimizer &&) = default; - optimizer &operator=(const optimizer &) = default; - optimizer &operator=(optimizer &&) = default; - virtual ~optimizer() = default; - virtual void update(const vec_t &dW, vec_t &W, bool parallelize) = 0; - virtual void reset() {} // override to implement pre-learning action -}; - -// helper class to hold N values for each weight -template -struct stateful_optimizer : public optimizer { - void reset() override { for (auto &e : E_) e.clear(); } -protected: - template - vec_t &get(const vec_t &key) { - static_assert(Index < N, "index out of range"); - if (E_[Index][&key].empty()) E_[Index][&key].resize(key.size(), float_t()); - return E_[Index][&key]; - } - std::unordered_map E_[N]; -}; - -/** - * adaptive gradient method - * - * J Duchi, E Hazan and Y Singer, - * Adaptive subgradient methods for online learning and stochastic optimization - * The Journal of Machine Learning Research, pages 2121-2159, 2011. 
- **/ -struct adagrad : public stateful_optimizer<1> { - adagrad() : alpha(learning_rate), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } - } - } - float_t alpha; // learning rate - private: - float_t eps; -}; - -/** - * RMSprop - * - * T Tieleman, and G E Hinton, - * Lecture 6.5 - rmsprop, COURSERA: Neural Networks for Machine Learning (2012) - **/ -struct RMSprop : public stateful_optimizer<1> { - RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); - } - float_t alpha; // learning rate - float_t mu; // decay term -private: - float_t eps; // constant value to avoid zero-division -}; - -// Adam: A Method for Stochastic Optimization -// http://arxiv.org/abs/1412.6980 -struct adam : public stateful_optimizer<2> { - adam() : alpha(learning_rate), b1(float_t(0.9)), - b2(float_t(0.999)), b1_t(float_t(0.9)), - b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size(), galois::steal(), galois::loopname("adam_update")); - b1_t *= b1; - b2_t *= b2; - } - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - float_t b2_t; // decay term power t - -private: - float_t eps; // constant value to avoid zero-division -}; - -/** - * @brief [a new optimizer (2015)] - * @details [see Adam: A Method for Stochastic Optimization (Algorithm 2) - * http://arxiv.org/abs/1412.6980] - * - */ -struct adamax : public stateful_optimizer<2> { - adamax() - : alpha(float_t(0.002)), - b1(float_t(0.9)), - b2(float_t(0.999)), - b1_t(b1), - eps(float_t(1e-8)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &mt = get<0>(W); - vec_t &ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); - b1_t *= b1; - } - - float_t alpha; // learning rate - float_t b1; // decay term - float_t b2; // decay term - float_t b1_t; // decay term power t - -private: - float_t eps; // constant value to avoid zero-division -}; - -/** - * SGD without momentum - * - * slightly faster than tiny_dnn::momentum - **/ -struct gradient_descent : public optimizer { - gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const 
vec_t &dW, vec_t &W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); - } - float_t alpha; // learning rate - float_t lambda; // weight decay -}; - -/** - * SGD with momentum - * - * B T Polyak, - * Some methods of speeding up the convergence of iteration methods - * USSR Computational Mathematics and Mathematical Physics, 4(5):1-17, 1964. - **/ -struct momentum : public stateful_optimizer<1> { - public: - momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - //}); - }, galois::loopname("momentum_update")); - } - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum -}; - -/** - * SGD with Nesterov momentum - * - * Y Nesterov, - * A method for unconstrained convex minimization problem with the rate of - * convergence o(1/k2), Doklady ANSSSR, vol.269, pp.543-547, 1983. - **/ -struct nesterov_momentum : public stateful_optimizer<1> { - public: - nesterov_momentum() - : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - - void update(const vec_t &dW, vec_t &W, bool parallelize) { - vec_t &dWprev = get<0>(W); - - //for_i(parallelize, W.size(), [&](size_t i) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - //}); - }, galois::loopname("nesterov_momentum_update")); - } - - float_t alpha; // learning rate - float_t lambda; // weight decay - float_t mu; // momentum -}; - diff --git a/lonestar/experimental/gnn/random.h b/lonestar/experimental/gnn/random.h deleted file mode 100644 index 9236e9c391..0000000000 --- a/lonestar/experimental/gnn/random.h +++ /dev/null @@ -1,63 +0,0 @@ -#ifndef RANDOM_H -#define RANDOM_H -typedef boost::mt19937 rng_t; - -// random seeding -int64_t seedgen(void) { - int64_t s, seed, pid; - FILE* f = fopen("/dev/urandom", "rb"); - if (f && fread(&seed, 1, sizeof(seed), f) == sizeof(seed)) { - fclose(f); - return seed; - } - std::cout << "System entropy source not available, using fallback algorithm to generate seed instead."; - if (f) fclose(f); - pid = getpid(); - s = time(NULL); - seed = std::abs(((s * 181) * ((pid - 83) * 359)) % 104729); - return seed; -} - -// This random number generator facade hides boost and CUDA rng -// implementation from one another (for cross-platform compatibility). 
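// ---------------------------------------------------------------------------
// Illustrative sketch (not from the original patch): the seed-then-sample
// pattern this header wraps. One mersenne-twister engine (the same engine
// type as rng_t) is seeded once and a 0/1 Bernoulli mask is drawn from it,
// as the dropout code does. The helper name is an assumption; only standard
// boost.random facilities are used, mirroring the variate_generator style of
// rng_bernoulli below.
#include <cstddef>
#include <vector>
#include <boost/random/mersenne_twister.hpp>
#include <boost/random/bernoulli_distribution.hpp>
#include <boost/random/variate_generator.hpp>

inline std::vector<unsigned> draw_bernoulli_mask(size_t n, float keep_prob,
                                                 unsigned seed) {
  boost::mt19937 engine(seed); // in the real code the seed would come from seedgen()
  boost::bernoulli_distribution<float> dist(keep_prob);
  boost::variate_generator<boost::mt19937&, boost::bernoulli_distribution<float> >
      coin(engine, dist);
  std::vector<unsigned> mask(n);
  for (size_t i = 0; i < n; i++)
    mask[i] = coin() ? 1u : 0u; // 1 means the corresponding feature is kept
  return mask;
}
// ---------------------------------------------------------------------------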
-class RNG { -public: - RNG() : generator_(new Generator()) { } - explicit RNG(unsigned int seed) : generator_(new Generator(seed)) { } - explicit RNG(const RNG&); - RNG& operator=(const RNG& other) { generator_ = other.generator_; return *this; } - void* generator() { return static_cast(generator_->rng()); } -private: - class Generator { - public: - Generator() : rng_(new rng_t(seedgen())) {} - explicit Generator(unsigned seed) : rng_(new rng_t(seed)) {} - rng_t* rng() { return rng_.get(); } - private: - std::shared_ptr rng_; - }; - - std::shared_ptr generator_; -}; - -std::shared_ptr random_generator_; -inline static RNG& rng_stream() { - random_generator_.reset(new RNG()); - return *random_generator_; -} - -inline rng_t* rng() { - return static_cast(rng_stream().generator()); -} - -#include -template -void rng_bernoulli(const DataTy p, std::vector &r) { - boost::bernoulli_distribution random_distribution(p); - boost::variate_generator > - variate_generator(rng(), random_distribution); - for (size_t i = 0; i < r.size(); ++i) - r[i] = static_cast(variate_generator()); -} - -#endif diff --git a/lonestar/experimental/gnn/run-citeseer.sh b/lonestar/experimental/gnn/run-citeseer.sh deleted file mode 100755 index 30772b4f6e..0000000000 --- a/lonestar/experimental/gnn/run-citeseer.sh +++ /dev/null @@ -1 +0,0 @@ -./gnn citeseer -t=56 -k=3 diff --git a/lonestar/experimental/gnn/timer.h b/lonestar/experimental/gnn/timer.h deleted file mode 100644 index e6c838c37b..0000000000 --- a/lonestar/experimental/gnn/timer.h +++ /dev/null @@ -1,21 +0,0 @@ -#ifndef TIMER_H_ -#define TIMER_H_ -#include - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; -#endif // TIMER_H_ diff --git a/lonestar/experimental/gnn/types.h b/lonestar/experimental/gnn/types.h deleted file mode 100644 index bc9fe21049..0000000000 --- a/lonestar/experimental/gnn/types.h +++ /dev/null @@ -1,34 +0,0 @@ -#ifndef TYPES_H -#define TYPES_H -#include -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" - -#ifdef CNN_USE_DOUBLE -typedef double float_t; -typedef double feature_t; -#else -typedef float float_t; -typedef float feature_t; // feature type -#endif -typedef std::vector vec_t; // feature vector (1D) -typedef std::vector tensor_t; // feature vectors (2D): num_samples x feature_dim -typedef std::vector FV; // feature vector -typedef std::vector FV2D; // feature vectors: num_samples x feature_dim -typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: train, val, test -typedef std::vector LabelList; // label list to store label for each vertex -typedef std::vector MaskList; // mask list to store mask for each vertex -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; - -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#else -typedef 
galois::graphs::LC_CSR_Graph::with_numa_alloc::type ::with_no_lockable::type Graph; -#endif - -typedef Graph::GraphNode GNode; - -#endif diff --git a/lonestar/experimental/gnn/utils.h b/lonestar/experimental/gnn/utils.h deleted file mode 100644 index 70356654b9..0000000000 --- a/lonestar/experimental/gnn/utils.h +++ /dev/null @@ -1,119 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include "gnn.h" - -std::string path = "/h2/xchen/datasets/Learning/"; // path to the input dataset -enum class net_phase { train, test }; - -class ResourceManager { -public: - ResourceManager() {} - ~ResourceManager(){} - //peak memory usage - std::string get_peak_memory() { - double kbm; - struct rusage CurUsage; - getrusage(RUSAGE_SELF, &CurUsage); - kbm = (double)CurUsage.ru_maxrss; - double mbm = kbm / 1024.0; - double gbm = mbm / 1024.0; - return - "Peak memory: " + - to_string_with_precision(mbm, 3) + " MB; " + - to_string_with_precision(gbm, 3) + " GB"; - } -private: - template - std::string to_string_with_precision(const T a_value, const int& n) { - std::ostringstream out; - out << std::fixed; - out << std::setprecision(n) << a_value; - return out.str(); - } -}; - -class Timer { -public: - Timer() {} - void Start() { gettimeofday(&start_time_, NULL); } - void Stop() { - gettimeofday(&elapsed_time_, NULL); - elapsed_time_.tv_sec -= start_time_.tv_sec; - elapsed_time_.tv_usec -= start_time_.tv_usec; - } - double Seconds() const { return elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1e6; } - double Millisecs() const { return 1000*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec/1000; } - double Microsecs() const { return 1e6*elapsed_time_.tv_sec + (double)elapsed_time_.tv_usec; } -private: - struct timeval start_time_; - struct timeval elapsed_time_; -}; - -class random_generator { -public: - static random_generator &get_instance() { - static random_generator instance; - return instance; - } - std::mt19937 &operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } - -private: - random_generator() : gen_(1) {} - std::mt19937 gen_; -}; - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -inline bool bernoulli(float_t p) { - return uniform_rand(float_t{0}, float_t{1}) <= p; -} - -size_t read_masks(std::string dataset_str, std::string mask_type, size_t &begin, size_t &end, MaskList &masks) { - if (dataset_str != "citeseer" && dataset_str != "cora") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - //std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count ++; - } - } - i ++; - } - //std::cout << mask_type + "_mask range: [" << begin << ", " << end - // << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} - From 
5cec98cebb6fb1507d884bf122ebd11a1680d767 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:39:44 -0600 Subject: [PATCH 090/660] correctly context.h copyright header (replace with TODO) --- libdeepgalois/include/deepgalois/context.h | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d7f400d582..2f769dc917 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -1,12 +1,6 @@ #pragma once /** - * Code modified from below - * - * https://github.com/BVLC/caffe/blob/master/include/caffe/common.hpp - * - * Copyright (c) 2014-2017 The Regents of the University of California (Regents) - * All rights reserved. - * Reused/revised under BSD 2-Clause license + * TODO if used from somewhere, get copyright/licences */ #include From bed7ba7c33cb71d1eba53bf0c6f29f646c0a7e99 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 17:50:06 -0600 Subject: [PATCH 091/660] gcn: some comments, gPrint --- lonestargnn/gcn/gcn.cpp | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 005e6b1477..3357fd904e 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,6 +15,8 @@ int main(int argc, char** argv) { network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); + + // tracks peak memory usage deepgalois::ResourceManager rm; // the optimizer used to update parameters, see optimizer.h for more details @@ -27,7 +29,7 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { - std::cout << "\n"; + galois::gPrint("\n"); // test using test samples size_t n = network.get_nnodes(); acc_t test_loss = 0.0, test_acc = 0.0; @@ -45,11 +47,10 @@ int main(int argc, char** argv) { Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, &test_mask[0], test_loss, test_acc); - std::cout << "Testing: test_loss = " << test_loss - << " test_acc = " << test_acc << " test_time = " << test_time - << "\n"; + galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); Ttest.stop(); } - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; + galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); return 0; } From d46fb46ed0eb256c4ff2690bb8ab69b618eaf688 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 18:14:35 -0600 Subject: [PATCH 092/660] aggregator.h: always include CPU version of update_all will not conflict with GPU since different signature --- libdeepgalois/include/deepgalois/aggregator.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/aggregator.h index 17a8451aee..bdc8c5aa5e 100644 --- a/libdeepgalois/include/deepgalois/aggregator.h +++ b/libdeepgalois/include/deepgalois/aggregator.h @@ -1,13 +1,11 @@ #pragma once #include "deepgalois/types.h" -#ifdef CPU_ONLY #include "deepgalois/gtypes.h" - namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } -#else +#ifndef CPU_ONLY #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, From bbb41251cf49985f71b9a327d137efb0ca046cea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 2 Mar 2020 18:22:04 -0600 
Subject: [PATCH 093/660] fixing GPU end deepgalois includes/namepsaces --- libdeepgalois/src/aggregator.cu | 6 +++--- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/math_functions.cu | 30 ++++++++++++++--------------- libdeepgalois/src/node.cu | 10 +++++----- libdeepgalois/src/optimizer.cu | 8 ++++---- 5 files changed, 28 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/aggregator.cu index bbd7fbf8b3..cd89cd92b1 100644 --- a/libdeepgalois/src/aggregator.cu +++ b/libdeepgalois/src/aggregator.cu @@ -1,9 +1,9 @@ #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" -#include "cutils.h" -#include "aggregator.h" -#include "math_functions.hh" +#include "deepgalois/cutils.h" +#include "deepgalois/aggregator.h" +#include "deepgalois/math_functions.hh" // TODO: use warp __device__ void scale_add(const int n, const float_t alpha, const float_t* a, diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 270252c5d8..29bec6f008 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -2,7 +2,7 @@ #include #include #include -#include "context.h" +#include "deepgalois/context.h" // random seeding int64_t cluster_seedgen(void) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index eb8f07c8b3..9131bf9509 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ -#include "math_functions.hh" -#include "context.h" +#include "deepgalois/math_functions.hh" +#include "deepgalois/context.h" #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" @@ -29,11 +29,11 @@ bool isnan_gpu(int n, const float_t *array) { } void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(Context::curand_generator(), r, n)); + CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); } void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(Context::curand_generator(), r, n)); + CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -42,7 +42,7 @@ void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) } void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(Context::curand_generator(), r, n, mu, sigma)); + CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, n, mu, sigma)); } bool is_allocated_device(float_t* data) { @@ -171,7 +171,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? 
CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(Context::cublas_handle(), cuTransB, cuTransA, + CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } @@ -189,14 +189,14 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float* B, const float beta, float* C) { float *transpose_C; CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); - CUSPARSE_CHECK(cusparseScsrmm2(Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, Context::cusparse_matdescr(), A_nonzeros, + M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); //transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); } @@ -205,25 +205,25 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(Context::cublas_handle(), cuTransA, N, M, &alpha, A, + CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK(cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(deepgalois::Context::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float* x, float* y) { - CUBLAS_CHECK(cublasScopy(Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(Context::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK(cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index e6d149a540..88d486f369 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -1,18 +1,18 @@ -#include "node.h" -#include "cutils.h" +#include "deepgalois/layers/node.h" +#include "deepgalois/cutils.h" -void edge::alloc_gpu() { +void deepgalois::edge::alloc_gpu() { CUDA_CHECK( cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); CUDA_CHECK( cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t* dst) { +void deepgalois::edge::merge_grads_gpu(float_t* dst) { CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } -void edge::clear_grads_gpu() { +void deepgalois::edge::clear_grads_gpu() { CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } diff --git 
a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index bf279e4e37..7628c3aeba 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -1,6 +1,6 @@ -#include "optimizer.h" -#include "cutils.h" -#include "math_functions.hh" +#include "deepgalois/optimizer.h" +#include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" __global__ void update_kernel(const int n, float_t alpha, float_t b1, float_t b2, float_t b1_t, float_t b2_t, @@ -14,7 +14,7 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } } -void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { +void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); float_t* cache = get_gpu<0>(n, W); From c9001a727b93d2598898d79c0e233b7582b2cd27 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 11:47:36 -0600 Subject: [PATCH 094/660] aggregator moved to layers (only used htere) --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/{ => layers}/aggregator.h | 0 libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 5 +++-- libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/src/{ => layers}/aggregator.cpp | 2 +- libdeepgalois/src/{ => layers}/aggregator.cu | 0 6 files changed, 6 insertions(+), 5 deletions(-) rename libdeepgalois/include/deepgalois/{ => layers}/aggregator.h (100%) rename libdeepgalois/src/{ => layers}/aggregator.cpp (96%) rename libdeepgalois/src/{ => layers}/aggregator.cu (100%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1ce41abc73..813992d433 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -46,8 +46,8 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp + src/layers/aggregator.cpp src/math_functions.cpp - src/aggregator.cpp src/optimizer.cpp src/context.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h similarity index 100% rename from libdeepgalois/include/deepgalois/aggregator.h rename to libdeepgalois/include/deepgalois/layers/aggregator.h diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index ed681bdf30..f5b7906f73 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,9 +1,10 @@ #pragma once #include "layer.h" -#include "deepgalois/aggregator.h" +#include "deepgalois/layers/aggregator.h" /** - * GraphConv Layer; based on DGL implementation + * GraphConv Layer; based on DGL implementation + follows TinyDNN layer + * convention * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html * * Parameters diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index f30ad03b7b..da6e866d6b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -1,6 +1,6 @@ #pragma once /** - * Code from on below link. Modified under Galois. + * Code from on below link. Modified under Galois's license. 
* * https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/layers/layer.h * diff --git a/libdeepgalois/src/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp similarity index 96% rename from libdeepgalois/src/aggregator.cpp rename to libdeepgalois/src/layers/aggregator.cpp index 360300dba3..5c7586e9a4 100644 --- a/libdeepgalois/src/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,4 +1,4 @@ -#include "deepgalois/aggregator.h" +#include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/src/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu similarity index 100% rename from libdeepgalois/src/aggregator.cu rename to libdeepgalois/src/layers/aggregator.cu From 263ca21ba14138dc8c19d8d15b62d3fd2cfdb712 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:22:26 -0600 Subject: [PATCH 095/660] aggregator comments; vadd moved to deepgalois::math --- .../include/deepgalois/layers/aggregator.h | 2 ++ .../include/deepgalois/math_functions.hh | 12 +++++++ libdeepgalois/src/layers/aggregator.cpp | 31 +++++++++++++------ libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- libdeepgalois/src/math_functions.cpp | 8 +++++ 5 files changed, 45 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index bdc8c5aa5e..806f81a3e0 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,6 +2,8 @@ #include "deepgalois/types.h" #include "deepgalois/gtypes.h" namespace deepgalois { +//! For each node in the graph, add the embeddings of all of its neighbors +//! together (using norm_factor if specified) void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index b5c51203f8..101d5125be 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -20,8 +20,18 @@ extern "C" { const float negative_slope = 0; +namespace deepgalois { +namespace math { + +//! add two same size vectors into out void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add +//! add 2 arrays for n elements void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); + +} // deepgalois +} // math + + void vsub(const vec_t& a, const vec_t& b, vec_t& out); void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); @@ -50,7 +60,9 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima +//! clear entire vector void clear(vec_t& in); +//! 
clear n elements of a vector void clear(size_t n, float_t* in); void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 5c7586e9a4..398da276d2 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -3,26 +3,39 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - galois::do_all(galois::iterate(g.begin(), g.end()), - [&](const auto& src) { + galois::do_all(galois::iterate(g), + [&](const GNode src) { + // zero out this node's out values clear(len, &out[src * len]); - float_t a = 0.0, b = 0.0; - if (norm) - a = norm_factor[src]; + float_t a = 0.0; + float_t b = 0.0; + + // get normalization factor if needed + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings for (const auto e : g.edges(src)) { const auto dst = g.getEdgeDst(e); + if (norm) { + // normalize b as well b = a * norm_factor[dst]; vec_t neighbor(len); + // scale the neighbor's data using the normalization + // factor mul_scalar(len, b, &in[dst * len], &neighbor[0]); - vadd(len, &out[src * len], &neighbor[0], + // use scaled data to update + deepgalois::math::vadd(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] } else - vadd(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] + // add embeddings from neighbors together + deepgalois::math::vadd(len, &out[src * len], + &in[dst * len], + &out[src * len]); // out[src] += in[dst] } }, - galois::chunk_size(), galois::steal(), + galois::chunk_size(), + galois::steal(), + galois::no_stats(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b496f52d57..442478b220 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -21,7 +21,7 @@ void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& vec_t b(out.size(), 0); mvmul(Q, self, a); mvmul(W, neighbors, b); - vadd(a, b, out); // out = W*self + Q*neighbors + deepgalois::math::vadd(a, b, out); // out = W*self + Q*neighbors } graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 979f5ce9d7..5b5ee78031 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -7,6 +7,9 @@ extern "C" { //#include } +namespace deepgalois { +namespace math { + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { @@ -41,6 +44,11 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +} // deepgalois +} // math + + + // vector subtract void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) From 8bbb5aa5673ba7ed87b823ec497b0590504cc8e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:25:49 -0600 Subject: [PATCH 096/660] cmakelist change for previous commit aggregator change --- libdeepgalois/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 813992d433..7f481cb385 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -28,11 
+28,11 @@ else() link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES + src/layers/aggregator.cu src/math_functions.cu - src/aggregator.cu - src/optimizer.cu - src/context.cu - src/node.cu + src/optimizer.cu + src/context.cu + src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) From 1f03f4dcdc786be43b31f03fc3ccd1ade0b1fafe Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:26:04 -0600 Subject: [PATCH 097/660] mulscalar and clear moved to deepgalois::math --- .../include/deepgalois/math_functions.hh | 14 ++++--- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/math_functions.cpp | 41 ++++++++++--------- 3 files changed, 31 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 101d5125be..8ac1eb653b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,14 @@ namespace math { void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add //! add 2 arrays for n elements void vadd(size_t n, const float_t* a, const float_t* b, float_t* out); +//! multiply vector by scalar +void mul_scalar(const float_t alpha, vec_t& Y); +//! multiply n elements of vector by scalar +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +//! clear entire vector +void clear(vec_t& in); +//! clear n elements of a vector +void clear(size_t n, float_t* in); } // deepgalois } // math @@ -37,8 +45,6 @@ void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); -void mul_scalar(const float_t alpha, vec_t& Y); -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); void div_scalar(const float_t alpha, vec_t& Y); float_t dot(const vec_t& x, const vec_t& y); void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); @@ -60,10 +66,6 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -//! clear entire vector -void clear(vec_t& in); -//! 
clear n elements of a vector -void clear(size_t n, float_t* in); void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU void d_relu(const vec_t& in_diff, const vec_t& data, diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 398da276d2..33d0033638 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou galois::do_all(galois::iterate(g), [&](const GNode src) { // zero out this node's out values - clear(len, &out[src * len]); + deepgalois::math::clear(len, &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -23,7 +23,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou vec_t neighbor(len); // scale the neighbor's data using the normalization // factor - mul_scalar(len, b, &in[dst * len], &neighbor[0]); + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update deepgalois::math::vadd(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 5b5ee78031..b0206dea90 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -44,6 +44,27 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +void clear(vec_t& in) { + for (size_t i = 0; i < in.size(); i++) + in[i] = 0; +} + +void clear(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) + in[i] = 0; +} + +// vector multiply scalar +void mul_scalar(const float_t alpha, vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; +} + +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; +} } // deepgalois } // math @@ -81,17 +102,6 @@ void sub_scalar(const float_t alpha, vec_t& Y) { Y[i] -= alpha; } -// vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; -} - -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; -} // vector divide scalar void div_scalar(const float_t alpha, vec_t& Y) { @@ -299,15 +309,6 @@ int argmax(const size_t n, const float_t* x) { return max_ind; } -void clear(vec_t& in) { - for (size_t i = 0; i < in.size(); i++) - in[i] = 0; -} - -void clear(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) - in[i] = 0; -} void relu(const vec_t& in, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) { From 23fd8672ea036d302d5413b6cfa1ef04dca6b82c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 12:34:49 -0600 Subject: [PATCH 098/660] removed chunk size specification from aggregator (for now?) 
---
 libdeepgalois/src/layers/aggregator.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp
index 33d0033638..d32ab2c598 100644
--- a/libdeepgalois/src/layers/aggregator.cpp
+++ b/libdeepgalois/src/layers/aggregator.cpp
@@ -34,7 +34,6 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou
                                          &out[src * len]); // out[src] += in[dst]
                    }
                  },
-                 galois::chunk_size(),
                  galois::steal(),
                  galois::no_stats(),
                  galois::loopname("update_all"));

From a91e585431f46006d5b6f5e6d9489b9596917916 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Tue, 3 Mar 2020 12:35:06 -0600
Subject: [PATCH 099/660] qualified net_phase with deepgalois namespace

---
 libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 4 ++--
 libdeepgalois/include/deepgalois/layers/layer.h | 2 +-
 libdeepgalois/include/deepgalois/net.h | 2 +-
 libdeepgalois/src/layers/graph_conv_layer.cpp | 4 ++--
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
index f5b7906f73..92bf1587fc 100644
--- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
+++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h
@@ -33,7 +33,7 @@ class graph_conv_layer : public layer {
   ~graph_conv_layer() {}
   void init();
   std::string layer_type() const override { return std::string("graph_conv"); }
-  void set_netphase(net_phase ctx) override { phase_ = ctx; }
+  void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; }
   // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data);
   // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data,
   //                               vec_t &out_grad, vec_t &in_grad);
@@ -57,7 +57,7 @@ class graph_conv_layer : public layer {
   bool dropout_; // whether to use dropout at first
   const float_t dropout_rate_;
   float_t scale_;
-  net_phase phase_;
+  deepgalois::net_phase phase_;
   size_t x;
   size_t y;
   size_t z;
diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h
index da6e866d6b..941a1aa9b3 100644
--- a/libdeepgalois/include/deepgalois/layers/layer.h
+++ b/libdeepgalois/include/deepgalois/layers/layer.h
@@ -56,7 +56,7 @@ class layer : public deepgalois::node {
   }
   virtual ~layer() = default;
   virtual std::string layer_type() const = 0;
-  virtual void set_netphase(net_phase phase) {}
+  virtual void set_netphase(deepgalois::net_phase phase) {}
 //!
save context virtual void set_context(deepgalois::Context* ctx) { context = ctx; } virtual acc_t get_masked_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0e18f39e1c..c7c574510e 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -34,7 +34,7 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); } - void set_netphases(net_phase phase) { + void set_netphases(deepgalois::net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 442478b220..8665674ead 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -68,7 +68,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { dropout(y, scale_, dropout_rate_, &in_data[i * y], @@ -133,7 +133,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { assert(y <= 128); // currently only support feature length <= 128 init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) { dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); From 3c8bd7b771462749a202726cd9d511bdddd32043 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:11:00 -0600 Subject: [PATCH 100/660] comments/rearrange layer/net --- libdeepgalois/include/deepgalois/layers/layer.h | 9 +++++---- libdeepgalois/include/deepgalois/net.h | 11 ++++++++--- 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 941a1aa9b3..10a60c7f89 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -59,6 +59,7 @@ class layer : public deepgalois::node { virtual void set_netphase(deepgalois::net_phase phase) {} //! save context virtual void set_context(deepgalois::Context* ctx) { context = ctx; } + //! return layer loss virtual acc_t get_masked_loss() { return acc_t(0); } // main functions for layer work @@ -78,9 +79,9 @@ class layer : public deepgalois::node { mask_t* get_device_masks() { return d_masks_; } //! 
debug print function void print_layer_info() { - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", + input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); } virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { @@ -124,7 +125,7 @@ class layer : public deepgalois::node { next()->get_gradient(), prev()->get_gradient()); } - //! use optimizer to update weights given gradient + //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { // vec_t diff; // prev()->merge_grads(&diff); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index c7c574510e..67f7f10eae 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -34,15 +34,18 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->set_context(context); } + //! set netphases for all layers in this network void set_netphases(deepgalois::net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } + //! print all layers void print_layers_info() { for (size_t i = 0; i < num_layers; i++) layers[i]->print_layer_info(); } + //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, bool bias = false, bool dropout = true, float_t dropout_rate = 0.5) { @@ -58,6 +61,7 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } + //! Add an output layer to the network void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); @@ -68,15 +72,16 @@ class Net { connect(layers[layer_id - 1], layers[layer_id]); } - // forward propagation: [begin, end) is the range of samples used. + //! forward propagation: [begin, end) is the range of samples used. + //! calls "forward" on the layers of the network and returns the loss of the + //! 
final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) - layers[i]->forward(); + for (size_t i = 0; i < num_layers; i++) layers[i]->forward(); return layers[num_layers - 1]->get_masked_loss(); } From b5848b4fa869e11ff9feef008a92e99db8d0650b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:11:23 -0600 Subject: [PATCH 101/660] matmul1d1d moved to deepgalois::math --- .../deepgalois/layers/graph_conv_layer.h | 3 - .../include/deepgalois/math_functions.hh | 7 ++- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++-- libdeepgalois/src/math_functions.cpp | 60 ++++++++++--------- 4 files changed, 42 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 92bf1587fc..3fe9ddc31d 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -34,9 +34,6 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - // virtual void forward_propagation(const vec_t &in_data, vec_t &out_data); - // virtual void back_propagation(const vec_t &in_data, const vec_t &out_data, - // vec_t &out_grad, vec_t &in_grad); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8ac1eb653b..410078ce99 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -36,6 +36,10 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, + float_t* C); // matrix multiply + } // deepgalois } // math @@ -55,9 +59,6 @@ void copy2D1D(const tensor_t& in, vec_t& out); void copy1D1D(const vec_t& in, vec_t& out); void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); void transpose2D(const tensor_t& in, tensor_t& out); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8665674ead..f535a3812a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -74,12 +74,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); - matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z } else { - matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z } - aggregate(z, context->graph_cpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); if (act_) { galois::do_all( @@ -107,7 +107,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix - matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y + deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same update_all(y, context->graph_cpu, in_temp, in_grad, true, @@ -124,7 +124,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // calculate weight gradients transpose(x, y, in_data, trans_data); // y*x - matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } #else @@ -137,7 +137,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); if (act_) relu_gpu(x * z, out_data, out_data); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index b0206dea90..255e6483ce 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -44,6 +44,18 @@ void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { } #endif +// vector multiply scalar +void mul_scalar(const float_t alpha, 
vec_t& Y) { + for (size_t i = 0; i < Y.size(); ++i) + Y[i] *= alpha; +} + +void mul_scalar(size_t n, const float_t alpha, const float_t* in, + float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = alpha * in[i]; +} + void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; @@ -54,17 +66,27 @@ void clear(size_t n, float_t* in) { in[i] = 0; } -// vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); } -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; +// num rows in A, C; num columns in B, C; num columns in A, rows in B +void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, + const float_t* A, const float_t* B, float_t* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); + const CBLAS_TRANSPOSE TransA = CblasNoTrans; + const CBLAS_TRANSPOSE TransB = CblasNoTrans; + sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); + Tmatmul.stop(); } + + } // deepgalois } // math @@ -174,14 +196,6 @@ void copy1D1D(size_t len, const float_t* in, float_t* out) { std::copy(in, in + len, out); } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? 
N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); -} void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { // A: x*z; B: z*y; C: x*y @@ -202,16 +216,6 @@ void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { } } -// num rows in A, C; num columns in B, C; num columns in A, rows in B -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, float_t* C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); -} void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C) { @@ -222,7 +226,7 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, assert(C.size() == dim_x * dim_y); vec_t A1D(dim_x * dim_z); copy2D1D(A, A1D); - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); + deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); } void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { @@ -239,7 +243,7 @@ void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { std::copy(A[i].begin(), A[i].end(), ptr); ptr += dim_z; } - matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); + deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); for (size_t i = 0; i < dim_x; i++) { for (size_t j = 0; j < dim_y; ++j) { C[i][j] = C1D[i * dim_y + j]; From e9e3153be737a7b56cc2d10a7a94acaac5b63784 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 13:18:41 -0600 Subject: [PATCH 102/660] dropout functions moved to deepgalios::math + commented --- .../include/deepgalois/math_functions.hh | 26 +++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 4 +- libdeepgalois/src/math_functions.cpp | 90 ++++++++++--------- 3 files changed, 63 insertions(+), 57 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 410078ce99..09fd3d753a 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -36,6 +36,21 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); +// dropout functions apply a random scale to in vector +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, vec_t& out); // dropout +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& mask, float_t* out); +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* mask, float_t* out); +// dropout calls that use existing scales in masks instead of generating them +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& mask, + vec_t& out_diff); // dropout derivative +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* mask, float_t* out_diff); + + void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply @@ -71,17 +86,6 @@ void relu(const vec_t& in, vec_t& out); // ReLU void relu(size_t n, const float_t* in, float_t* out); // ReLU void d_relu(const vec_t& in_diff, const vec_t& data, vec_t& out_diff); // ReLU derivative -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out); -void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, - vec_t& out_diff); // dropout derivative -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff); void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f535a3812a..c071a1cd0d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,7 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ if (dropout_ && phase_ == deepgalois::net_phase::train) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - dropout(y, scale_, dropout_rate_, &in_data[i * y], + deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z @@ -115,7 +115,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - d_dropout(y, scale_, &in_grad[i * y], + deepgalois::math::d_dropout(y, scale_, &in_grad[i * y], &dropout_mask[i * y], &in_grad[i * y]); }, galois::chunk_size(), galois::steal(), galois::loopname("d_dropout")); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 255e6483ce..96e9552b56 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,6 +10,16 @@ extern "C" { namespace deepgalois { namespace math { +//! 
wrapper function to call cblas_sgemm +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C) { + int lda = (TransA == CblasNoTrans) ? K : M; + int ldb = (TransB == CblasNoTrans) ? N : K; + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); +} + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { @@ -66,13 +76,42 @@ void clear(size_t n, float_t* in) { in[i] = 0; } -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C) { - int lda = (TransA == CblasNoTrans) ? K : M; - int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& masks, vec_t& out) { + assert(masks.size() == out.size()); + // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers + for (size_t i = 0; i < in.size(); ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * masks[i] * scale; +} + +void dropout(const float scale, const float dropout_rate, const vec_t& in, + std::vector& masks, float_t* out) { + for (size_t i = 0; i < in.size(); ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < in.size(); ++i) + out[i] = in[i] * masks[i] * scale; +} + +void dropout(size_t n, const float scale, const float dropout_rate, + const float_t* in, unsigned* masks, float_t* out) { + for (size_t i = 0; i < n; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < n; ++i) + out[i] = in[i] * masks[i] * scale; +} + +void d_dropout(const float scale, const vec_t& in_diff, + std::vector& masks, vec_t& out_diff) { + for (size_t i = 0; i < in_diff.size(); ++i) + out_diff[i] = in_diff[i] * masks[i] * scale; +} + +void d_dropout(size_t n, const float scale, const float_t* in_diff, + unsigned* masks, float_t* out_diff) { + for (size_t i = 0; i < n; ++i) + out_diff[i] = in_diff[i] * masks[i] * scale; } // num rows in A, C; num columns in B, C; num columns in A, rows in B @@ -86,7 +125,6 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, Tmatmul.stop(); } - } // deepgalois } // math @@ -352,43 +390,7 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, vec_t& out) { - assert(masks.size() == out.size()); - // rng_bernoulli(1. 
- dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, float_t* out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - for (size_t i = 0; i < n; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) - out[i] = in[i] * masks[i] * scale; -} - -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& masks, vec_t& out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; -} -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* masks, float_t* out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; -} float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } From 88f6c009b687a56ac578d4affc83c904b84669fc Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 16:41:23 -0600 Subject: [PATCH 103/660] relu/copy1d1d moved to deepgalois::math, some comments in graphconv --- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 26 ++++++---- libdeepgalois/include/deepgalois/net.h | 5 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 15 +++--- libdeepgalois/src/math_functions.cpp | 52 ++++++++++--------- 5 files changed, 58 insertions(+), 42 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 3fe9ddc31d..518dede084 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -58,7 +58,7 @@ class graph_conv_layer : public layer { size_t x; size_t y; size_t z; - float_t* out_temp; + float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 09fd3d753a..c9ecbe6dea 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -18,11 +18,12 @@ extern "C" { // TODO namespace -const float negative_slope = 0; namespace deepgalois { namespace math { +const float negative_slope = 0; + //! add two same size vectors into out void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add //! add 2 arrays for n elements @@ -36,20 +37,33 @@ void clear(vec_t& in); //! 
clear n elements of a vector void clear(size_t n, float_t* in); -// dropout functions apply a random scale to in vector +// dropout functions randomly remove weights void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, vec_t& out); // dropout void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& mask, float_t* out); void dropout(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* mask, float_t* out); -// dropout calls that use existing scales in masks instead of generating them +// dropout calls that use existing dropouts in masks instead of generating them; +// derivative void d_dropout(const float scale, const vec_t& in_diff, std::vector& mask, vec_t& out_diff); // dropout derivative void d_dropout(size_t n, const float scale, const float_t* in_diff, unsigned* mask, float_t* out_diff); +//! relu = keep if positive +void relu(const vec_t& in, vec_t& out); +//! relu = keep if positive; first n units +void relu(size_t n, const float_t* in, float_t* out); +//! relu derivative; generally, 1 if x > 0, 0 otherwise +void d_relu(const vec_t& in_diff, const vec_t& data, + vec_t& out_diff); // ReLU derivative + +//! copy vector from in -> out +void copy1D1D(const vec_t& in, vec_t& out); +//! copy vector from in -> out; first len elements +void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, @@ -71,8 +85,6 @@ void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, tensor_t& C); void copy2D1D(const tensor_t& in, vec_t& out); -void copy1D1D(const vec_t& in, vec_t& out); -void copy1D1D(size_t len, const float_t* in, float_t* out); void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); @@ -82,10 +94,6 @@ void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -void relu(const vec_t& in, vec_t& out); // ReLU -void relu(size_t n, const float_t* in, float_t* out); // ReLU -void d_relu(const vec_t& in_diff, const vec_t& data, - vec_t& out_diff); // ReLU derivative void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 67f7f10eae..47a48dea78 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -81,7 +81,10 @@ class Net { // layer0: from N x D to N x 16 // layer1: from N x 16 to N x E // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) layers[i]->forward(); + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } return layers[num_layers - 1]->get_masked_loss(); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c071a1cd0d..9060850c02 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -4,14 +4,14 @@ 
namespace deepgalois { #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - update_all(len, g, in, out, true, context->norm_factor); + deepgalois::update_all(len, g, in, out, true, context->norm_factor); } #else void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); #else - update_all(len, g, in, out, true, context->d_norm_factor); + deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); #endif } #endif @@ -81,10 +81,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + // run relu activation on output if specified if (act_) { galois::do_all( galois::iterate((size_t)0, x), - [&](const auto& i) { relu(z, &out_data[i * z], &out_data[i * z]); }, + [&](const auto& i) { deepgalois::math::relu(z, &out_data[i * z], + &out_data[i * z]); }, galois::loopname("relu")); } } @@ -94,6 +96,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { if (act_) { + // note; assumption here is that out_grad contains 1s or 0s via relu? galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? @@ -101,7 +104,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, ? out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); } else { - copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying + deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } if (level_ != 0) { // no need to calculate in_grad for the first layer @@ -110,8 +113,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - update_all(y, context->graph_cpu, in_temp, in_grad, true, - context->norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, + context->norm_factor); // x*x; x*y -> x*y if (dropout_) { galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 96e9552b56..42ce73b689 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -114,6 +114,33 @@ void d_dropout(size_t n, const float scale, const float_t* in_diff, out_diff[i] = in_diff[i] * masks[i] * scale; } +void relu(const vec_t& in, vec_t& out) { + for (size_t i = 0; i < out.size(); ++i) { + out[i] = std::max(in[i], (float_t)0) + + negative_slope * std::min(in[i], (float_t)0); + } +} + +void relu(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) + out[i] = std::max(in[i], float_t(0)); +} + +void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { + for (size_t i = 0; i < out_diff.size(); ++i) { + out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + + negative_slope * (fv[i] <= (float_t)0)); + } +} + +void copy1D1D(const vec_t& in, vec_t& out) { + std::copy(in.begin(), in.end(), &out[0]); +} + +void copy1D1D(size_t len, const float_t* in, float_t* out) { + std::copy(in, in + len, out); +} + // num rows 
in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { @@ -226,13 +253,6 @@ void copy2D1D(const tensor_t& in, vec_t& out) { } } -void copy1D1D(const vec_t& in, vec_t& out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy1D1D(size_t len, const float_t* in, float_t* out) { - std::copy(in, in + len, out); -} void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { @@ -352,24 +372,6 @@ int argmax(const size_t n, const float_t* x) { } -void relu(const vec_t& in, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + - negative_slope * std::min(in[i], (float_t)0); - } -} - -void relu(size_t n, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = std::max(in[i], float_t(0)); -} - -void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + - negative_slope * (fv[i] <= (float_t)0)); - } -} void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff From 3d156aaaaf4cf0af996b852b24ae735afef9a149 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 17:21:47 -0600 Subject: [PATCH 104/660] transpose to math TODO make parallel or remove completely and use sgemm --- .../include/deepgalois/math_functions.hh | 7 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 8 +++-- libdeepgalois/src/math_functions.cpp | 33 ++++++++++--------- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index c9ecbe6dea..cc277093f3 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -69,6 +69,11 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply +//! transposes a matrix (vector) +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); +//! 
transposes a matrix (malloc'd array) +void transpose(size_t x, size_t y, const float_t* in, float_t* out); + } // deepgalois } // math @@ -90,8 +95,6 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, vec_t& C); void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); -void transpose(size_t x, size_t y, const float_t* in, float_t* out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima void softmax(const vec_t& input, vec_t& output); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9060850c02..2ce46756c3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -107,9 +107,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } + // at this point, out_temp has the derivative of activation + + // this calculates feature gradients if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); - transpose(y, z, W, trans_W); // derivative of matmul needs transposed matrix + // derivative of matmul needs transposed matrix + deepgalois::math::transpose(y, z, W, trans_W); deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same @@ -126,7 +130,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // calculate weight gradients - transpose(x, y, in_data, trans_data); // y*x + deepgalois::math::transpose(x, y, in_data, trans_data); // y*x deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 42ce73b689..af4e62c90c 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -152,6 +152,24 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, Tmatmul.stop(); } +// TODO make parallel +void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + +// TODO make parallel +void transpose(size_t x, size_t y, const float_t* in, float_t* out) { + for (size_t i = 0; i < y; i++) { + for (size_t j = 0; j < x; j++) { + out[i * x + j] = in[j * y + i]; + } + } +} + } // deepgalois } // math @@ -331,21 +349,6 @@ void transpose2D1D(const tensor_t& in, vec_t& out) { } } -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} - -void transpose(size_t x, size_t y, const float_t* in, float_t* out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} int argmax(const size_t n, const vec_t& x) { float_t max = x[0]; From 7f3a386da149ae14e40336505f750ff7d53ed38c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Mar 2020 19:55:19 -0600 Subject: [PATCH 105/660] comments for training flow; clean up; namespace scoping --- .../deepgalois/layers/graph_conv_layer.h | 
4 +++ .../include/deepgalois/layers/layer.h | 19 +++----------- .../include/deepgalois/layers/node.h | 3 +-- libdeepgalois/include/deepgalois/net.h | 3 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 25 +++++++++++-------- .../src/layers/softmax_loss_layer.cpp | 1 + libdeepgalois/src/net.cpp | 20 ++++++++++++--- 7 files changed, 42 insertions(+), 33 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 518dede084..0a43cf0095 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -34,7 +34,11 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + //! Uses weights contained in this layer to update in_data (results from previous) + //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); + //! Uses gradients from layer after this one to update both own weight gradients + //! as well as gradients for the features (in_grad) virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 10a60c7f89..bb009cd57a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -134,7 +134,7 @@ class layer : public deepgalois::node { // parallelize only when target size is big enough to mitigate thread // spawning overhead. bool parallel = (W.size() >= 512); - opt->update(weight_grad, W, parallel); // W += grad + opt->update(layer::weight_grad, layer::W, parallel); // W += grad #else //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad @@ -166,20 +166,9 @@ class layer : public deepgalois::node { }; -// head: layer i+1, tail: layer i -inline void connect(layer* head, layer* tail, size_t head_index = 0, - size_t tail_index = 0) { - // auto out_shape = head->out_shape()[head_index]; - // auto in_shape = tail->in_shape()[tail_index]; - // head->setup(false); - // if (in_shape.size() == 0) { - // tail->set_in_shape(out_shape); - // in_shape = out_shape; - //} - // if (out_shape.size() != in_shape.size()) - // connection_mismatch(*head, *tail); - // if (!head->next_[head_index]) - // throw nn_error("output edge must not be null"); +//! 
Connects tail to head's edge and sets that edge's target to tail +//inline void connect(layer* head, layer* tail) { +inline void connect(layer* head, layer* tail) { tail->prev_ = head->next_; tail->prev_->add_next_node(tail); } diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index fcb20513c0..9b43167656 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -34,8 +34,7 @@ class node : public std::enable_shared_from_this { protected: node() = delete; - friend void connect(layer* head, layer* tail, size_t head_index, - size_t tail_index); + friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; }; diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 47a48dea78..dfc4f3d0d7 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -57,8 +57,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - if (layer_id > 0) - connect(layers[layer_id - 1], layers[layer_id]); + if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } //! Add an output layer to the network diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2ce46756c3..f40f9ad591 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -48,7 +48,7 @@ void graph_conv_layer::init() { #ifdef CPU_ONLY rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, weight_grad); + zero_init_matrix(y, z, layer::weight_grad); if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; @@ -56,7 +56,7 @@ void graph_conv_layer::init() { // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x #else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, d_weight_grad); + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); #endif t_alloc.Stop(); std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; @@ -74,11 +74,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], &dropout_mask[i * y], &in_temp[i * y]); }, galois::loopname("dropout")); - deepgalois::math::matmul1D1D(x, z, y, in_temp, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z } else { - deepgalois::math::matmul1D1D(x, z, y, in_data, &W[0], out_temp); // x*y; y*z; x*z + deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z } + // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); // run relu activation on output if specified @@ -100,6 +101,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? + // check if original data greater than 0; if so keep grad out_temp[i * z + j] = out_data[i * z + j] > float_t(0) ? 
out_grad[i * z + j] : float_t(0); }, galois::loopname("d_relu")); @@ -107,13 +109,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying } - // at this point, out_temp has the derivative of activation + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights - // this calculates feature gradients + // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer vec_t trans_W(z * y); // derivative of matmul needs transposed matrix - deepgalois::math::transpose(y, z, W, trans_W); + deepgalois::math::transpose(y, z, layer::W, trans_W); deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same @@ -129,9 +132,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } } - // calculate weight gradients + // calculate weight gradients by using previous layer's transpose multiplied + // by gradients from last back prop step deepgalois::math::transpose(x, y, in_data, trans_data); // y*x - deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &weight_grad[0]); // y*x; x*z; y*z + // updates THIS layer's weight gradients to update them + deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z } #else @@ -163,7 +168,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } #endif diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f16ba58fbe..eda3de054d 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -36,6 +36,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 2221b3daad..7407e99d9f 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -49,8 +49,9 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); + Timer t_epoch; - // run epoches + // run epochs for (unsigned i = 0; i < num_epochs; i++) { std::cout << "Epoch " << std::setw(2) << i << std::fixed << std::setprecision(3) << ":"; @@ -59,18 +60,29 @@ void Net::train(optimizer* opt, bool need_validate) { // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; + + // forward: after this phase, layer edges will contain intermediate features + // for use during backprop Tfw.start(); train_loss = - fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + 
Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward
     train_acc = masked_accuracy(train_begin, train_end, train_count,
                                 &train_mask[0]); // predict
     Tfw.stop();
+
+    // backward: use intermediate features + ground truth to update layers
+    // with feature gradients which are then used to calculate weight gradients
     Tbw.start();
-    bprop(); // back propogation
+    Net::bprop();
     Tbw.stop();
+
+    // gradient update: use gradients stored on each layer to update model for
+    // next epoch
     Tupdate.start();
-    update_weights(opt); // update parameters
+    Net::update_weights(opt); // update parameters
     Tupdate.stop();
+
+    // validation / testing
     set_netphases(net_phase::test);
     std::cout << " train_loss = " << std::setw(5) << train_loss
               << " train_acc = " << std::setw(5) << train_acc;

From 916e51e39bd5c3205b1ba580b6c5141c0bdb2b82 Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Wed, 4 Mar 2020 08:49:19 -0600
Subject: [PATCH 106/660] fix header

---
 libdeepgalois/src/layers/aggregator.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu
index cd89cd92b1..166ea4b4bb 100644
--- a/libdeepgalois/src/layers/aggregator.cu
+++ b/libdeepgalois/src/layers/aggregator.cu
@@ -2,7 +2,7 @@
 #include "ggcuda.h"
 #include "cub/cub.cuh"
 #include "deepgalois/cutils.h"
-#include "deepgalois/aggregator.h"
+#include "deepgalois/layers/aggregator.h"
 #include "deepgalois/math_functions.hh"
 // TODO: use warp

From 6f1a3e25697ae8ae373b86bf85934b2aa3c68b2a Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Wed, 4 Mar 2020 10:23:48 -0600
Subject: [PATCH 107/660] fix gpu

---
 libdeepgalois/CMakeLists.txt                  | 32 ++++---
 .../include/deepgalois/layers/aggregator.h    |  7 +-
 .../deepgalois/layers/graph_conv_layer.h      | 15 ++--
 .../include/deepgalois/layers/layer.h         |  7 +-
 .../include/deepgalois/math_functions.hh      |  3 +-
 libdeepgalois/src/layers/aggregator.cpp       | 60 ++++++-------
 libdeepgalois/src/layers/aggregator.cu        |  4 +-
 libdeepgalois/src/layers/graph_conv_layer.cpp | 89 +++++--------------
 libdeepgalois/src/layers/graph_conv_layer.cu  | 50 +++++++++++
 libdeepgalois/src/layers/layer.cpp            | 12 +++
 libdeepgalois/src/math_functions.cpp          |  4 +-
 11 files changed, 149 insertions(+), 134 deletions(-)
 create mode 100644 libdeepgalois/src/layers/graph_conv_layer.cu
 create mode 100644 libdeepgalois/src/layers/layer.cpp

diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt
index 7f481cb385..9f797fc655 100644
--- a/libdeepgalois/CMakeLists.txt
+++ b/libdeepgalois/CMakeLists.txt
@@ -1,11 +1,11 @@
 cmake_minimum_required(VERSION 2.8)
-SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include)
-SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib)
+SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include)
+SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib)
 set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers
 set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers
-SET(CUDA_INC /org/centers/cdgc/cuda/cuda-8.0/include)
-SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-8.0/lib64/)
+SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include)
+SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/)
 include_directories(${OPENBLAS_INC})
 include_directories(${CMAKE_SOURCE_DIR}/libgalois/include)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include)
@@ -16,26 +16,33 @@ include_directories("${MGPU_ROOT}/src")
 link_directories(${OPENBLAS_LIB})
 link_directories(${CMAKE_SOURCE_DIR}/libgalois)
-set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() + #set( CMAKE_VERBOSE_MAKEFILE on ) find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_PROPAGATE_HOST_FLAGS OFF) set(CUDA_HOST_COMPILER g++) - #list(APPEND CUDA_NVCC_FLAGS "-gencode arch=compute_60,code=sm_60") + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_60,code=sm_60) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) + #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) + #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES - src/layers/aggregator.cu + src/layers/graph_conv_layer.cu + #src/layers/softmax_loss_layer.cu + src/layers/aggregator.cu src/math_functions.cu - src/optimizer.cu - src/context.cu - src/node.cu + src/optimizer.cu + src/context.cu + src/node.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) - target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcurand) + target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) #cuda_compile(MF_O src/math_functions.cu) @@ -47,6 +54,7 @@ set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/layers/aggregator.cpp + src/layers/layer.cpp src/math_functions.cpp src/optimizer.cpp src/context.cpp @@ -63,7 +71,7 @@ endif() target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) -target_link_libraries(dg_cpu -lcudart -lcublas -lcurand) +target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 806f81a3e0..1d6a1acebb 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,13 +1,14 @@ #pragma once #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" -namespace deepgalois { //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) +#ifdef CPU_ONLY +#include "deepgalois/gtypes.h" +namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } -#ifndef CPU_ONLY +#else #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 0a43cf0095..b2b80b69e8 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -23,15 +23,15 @@ namespace deepgalois { class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, - float_t dropout_rate, std::vector in_dims, - std::vector out_dims); + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, + bool dropout, float_t dropout_rate, + std::vector in_dims, std::vector out_dims); graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, - out_dims) {} + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); + void init_gpu(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } //! Uses weights contained in this layer to update in_data (results from previous) @@ -45,11 +45,10 @@ class graph_conv_layer : public layer { #ifdef CPU_ONLY virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, - float_t* out); + virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); #endif // user-defined combine function - virtual void combine(const vec_t& self, const vec_t& neighbors, vec_t& out); + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); private: bool act_; // whether to use activation function at the end diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index bb009cd57a..19bb176f90 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -25,7 +25,6 @@ #include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" -#include "deepgalois/gtypes.h" #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" @@ -78,11 +77,7 @@ class layer : public deepgalois::node { mask_t* get_device_masks() { return d_masks_; } //! 
debug print function - void print_layer_info() { - galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", - input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); - } + void print_layer_info(); virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { begin_ = sample_begin; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index cc277093f3..61eceda3f2 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -85,7 +85,8 @@ void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); void div_scalar(const float_t alpha, vec_t& Y); float_t dot(const vec_t& x, const vec_t& y); -void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +//void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, tensor_t& C); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index d32ab2c598..581c5f564c 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,40 +1,38 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +#ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - galois::do_all(galois::iterate(g), - [&](const GNode src) { - // zero out this node's out values - deepgalois::math::clear(len, &out[src * len]); - float_t a = 0.0; - float_t b = 0.0; + galois::do_all(galois::iterate(g), [&](const GNode src) { + // zero out this node's out values + deepgalois::math::clear(len, &out[src * len]); + float_t a = 0.0; + float_t b = 0.0; - // get normalization factor if needed - if (norm) a = norm_factor[src]; + // get normalization factor if needed + if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - vec_t neighbor(len); - // scale the neighbor's data using the normalization - // factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] - } else - // add embeddings from neighbors together - deepgalois::math::vadd(len, &out[src * len], - &in[dst * len], - &out[src * len]); // out[src] += in[dst] - } - }, - galois::steal(), - galois::no_stats(), - galois::loopname("update_all")); + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + vec_t neighbor(len); + // scale the neighbor's data using the normalization + // factor + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update + deepgalois::math::vadd(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + // add embeddings from neighbors together + deepgalois::math::vadd(len, &out[src * len], + &in[dst * len], + &out[src * 
len]); // out[src] += in[dst] + } + }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); } +#endif diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 166ea4b4bb..06eac8bb75 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -59,7 +59,7 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, } } -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); @@ -68,7 +68,7 @@ void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, CudaTest("solving update_all kernel failed"); } -void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f40f9ad591..8e7b9f4eb5 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,28 +2,6 @@ namespace deepgalois { -#ifdef CPU_ONLY -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, true, context->norm_factor); -} -#else -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - #ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); - #else - deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); - #endif -} -#endif - -void graph_conv_layer::combine(const vec_t& self, const vec_t& neighbors, vec_t& out) { - vec_t a(out.size(), 0); - vec_t b(out.size(), 0); - mvmul(Q, self, a); - mvmul(W, neighbors, b); - deepgalois::math::vadd(a, b, out); // out = W*self + Q*neighbors -} - graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, @@ -36,16 +14,29 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); +#ifdef CPU_ONLY init(); +#else + init_gpu(); +#endif assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); } -void graph_conv_layer::init() { - //std::cout << name_ << ": allocating memory for params and temp data... 
"; - Timer t_alloc; - t_alloc.Start(); #ifdef CPU_ONLY +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + deepgalois::update_all(len, g, in, out, true, context->norm_factor); +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { + vec_t a(dim_y, 0); + vec_t b(dim_y, 0); + mvmul(dim_x, dim_y, Q, self, a); + mvmul(dim_x, dim_y, W, neighbors, b); + deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors +} + +void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); @@ -55,25 +46,18 @@ void graph_conv_layer::init() { out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py trans_data = new float_t[y * x]; // y*x -#else - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); -#endif - t_alloc.Stop(); - std::cout << "Done, allocation time: " << t_alloc.Millisecs() << " ms\n"; } -#ifdef CPU_ONLY // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, galois::loopname("dropout")); + galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { + deepgalois::math::dropout(y, scale_, dropout_rate_, &in_data[i * y], + &dropout_mask[i * y], &in_temp[i * y]); + }, galois::loopname("dropout")); deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z } else { deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z @@ -139,37 +123,6 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z } -#else -// GPU forward: compute output features -void graph_conv_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - assert(y <= 128); // currently only support feature length <= 128 - init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) { - dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); - if (act_) relu_gpu(x * z, out_data, out_data); -} - -// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); - else copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); -#ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, 
context->d_norm_factor); -#else - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); -#endif - if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); - } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); -} #endif } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu new file mode 100644 index 0000000000..dc6e0e72db --- /dev/null +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -0,0 +1,50 @@ +#include "deepgalois/layers/graph_conv_layer.h" + +namespace deepgalois { + +void graph_conv_layer::init_gpu() { + gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); +} + +void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + #ifdef USE_CUSPARSE + deepgalois::update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + #else + deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); + #endif +} + +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { +} + +// GPU forward: compute output features +void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + //assert(y <= 128); // currently only support feature length <= 128 + init_const_gpu(x*z, 0.0, out_temp); + if (dropout_ && phase_ == deepgalois::net_phase::train) { + dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + if (act_) relu_gpu(x * z, out_data, out_data); +} + +// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); + else copy_gpu(x * z, out_grad, out_temp); + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); +#ifdef USE_CUSPARSE + update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#else + update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); +#endif + if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); +} + +} diff --git a/libdeepgalois/src/layers/layer.cpp b/libdeepgalois/src/layers/layer.cpp new file mode 100644 index 0000000000..6abb1ffb6a --- /dev/null +++ b/libdeepgalois/src/layers/layer.cpp @@ -0,0 +1,12 @@ +#include "deepgalois/layers/layer.h" +#include "galois/Galois.h" + +namespace deepgalois { + +void layer::print_layer_info() { + galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", + input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); +} + +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index af4e62c90c..12a2907500 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -231,9 +231,7 @@ float_t dot(size_t n, const float_t* 
x, const float_t* y) { } // matrix-vector multiply -void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector) { - size_t m = out_vector.size(); - size_t n = in_vector.size(); +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { for (size_t i = 0; i < m; ++i) { for (size_t j = 0; j < n; ++j) { out_vector[i] += matrix[i * n + j] * in_vector[j]; From b07c624f21671bd04ecdbffd4a93530f4f508d9a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 10:33:20 -0600 Subject: [PATCH 108/660] fix cpu --- libdeepgalois/CMakeLists.txt | 6 +++--- libdeepgalois/src/layers/graph_conv_layer.cpp | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9f797fc655..34e094ce14 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,7 +1,7 @@ cmake_minimum_required(VERSION 2.8) -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS/build/lib) +SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) +SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) @@ -16,7 +16,7 @@ include_directories("${MGPU_ROOT}/src") link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU OFF CACHE BOOL "Build DeepGalois without CUDA support") +set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(USE_CPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 8e7b9f4eb5..d174b716ac 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -28,11 +28,11 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ deepgalois::update_all(len, g, in, out, true, context->norm_factor); } -void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { - vec_t a(dim_y, 0); - vec_t b(dim_y, 0); - mvmul(dim_x, dim_y, Q, self, a); - mvmul(dim_x, dim_y, W, neighbors, b); +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { + float_t *a = new float_t[len]; + float_t *b = new float_t[len]; + mvmul(n, len, &Q[0], self, a); + mvmul(n, len, &W[0], neighbors, b); deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors } From 6612216a0845bb39e86e3aa38265c7ece5e4aab3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 13:55:19 -0600 Subject: [PATCH 109/660] use sgemm_cpu --- .../include/deepgalois/math_functions.hh | 70 +++++++++--------- libdeepgalois/src/layers/aggregator.cpp | 10 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 73 +++++-------------- libdeepgalois/src/math_functions.cpp | 67 +++++++++-------- 4 files changed, 90 insertions(+), 130 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 61eceda3f2..26639f6f55 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -21,63 +21,61 @@ extern "C" { namespace deepgalois { namespace 
math {
-
-const float negative_slope = 0;
-
-//! add two same size vectors into out
-void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add
 //! add 2 arrays for n elements
-void vadd(size_t n, const float_t* a, const float_t* b, float_t* out);
-//! multiply vector by scalar
-void mul_scalar(const float_t alpha, vec_t& Y);
+void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out);
 //! multiply n elements of vector by scalar
 void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out);
-//! clear entire vector
-void clear(vec_t& in);
 //! clear n elements of a vector
-void clear(size_t n, float_t* in);
+void clear_cpu(size_t n, float_t* in);
+// dropout functions randomly remove weights
+void dropout_cpu(size_t n, const float scale, const float dropout_rate,
+                 const float_t* in, unsigned* mask, float_t* out);
+// dropout derivative: use existing dropouts in masks instead of generating them
+void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff,
+                   unsigned* mask, float_t* out_diff);
+//! ReLU = keep if positive
+void relu_cpu(size_t n, const float_t* in, float_t* out);
+//! ReLU derivative; generally, 1 if data > 0, 0 otherwise
+void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out);
+//! copy vector from in -> out; first len elements
+void copy_cpu(size_t len, const float_t* in, float_t* out);
+// single-precision dense matrix multiply
+void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB,
+               const int M, const int N, const int K, const float alpha,
+               const float* A, const float* B, const float beta, float* C);
+// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse
+void csrmm_cpu(const int M, const int N, const int K, const int nnz,
+               const float alpha, const float* A_nonzeros,
+               const int* A_idx_ptr, const int* A_nonzero_idx,
+               const float* B, const float beta, float* C);
+} // math
+} // deepgalois
+//! clear entire vector
+void clear(vec_t& in);
+//! multiply vector by scalar
+void mul_scalar(const float_t alpha, vec_t& Y);
+//! add two same size vectors into out
+void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add
 // dropout functions randomly remove weights
 void dropout(const float scale, const float dropout_rate, const vec_t& in,
              std::vector& mask, vec_t& out); // dropout
 void dropout(const float scale, const float dropout_rate, const vec_t& in,
              std::vector& mask, float_t* out);
-void dropout(size_t n, const float scale, const float dropout_rate,
-             const float_t* in, unsigned* mask, float_t* out);
-// dropout calls that use existing dropouts in masks instead of generating them;
-// derivative
 void d_dropout(const float scale, const vec_t& in_diff,
-               std::vector& mask,
-               vec_t& out_diff); // dropout derivative
-void d_dropout(size_t n, const float scale, const float_t* in_diff,
-               unsigned* mask, float_t* out_diff);
-
-//! relu = keep if positive
+               std::vector& mask, vec_t& out_diff);
+//! ReLU = keep if positive
 void relu(const vec_t& in, vec_t& out);
-//! relu = keep if positive; first n units
-void relu(size_t n, const float_t* in, float_t* out);
-//! relu derivative; generally, 1 if x > 0, 0 otherwise
-void d_relu(const vec_t& in_diff, const vec_t& data,
-            vec_t& out_diff); // ReLU derivative
-
 //! copy vector from in -> out
 void copy1D1D(const vec_t& in, vec_t& out);
-//! copy vector from in -> out; first len elements
-void copy1D1D(size_t len, const float_t* in, float_t* out);
-
+//!
matrix multiply void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply - //! transposes a matrix (vector) void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); - -} // deepgalois -} // math - - void vsub(const vec_t& a, const vec_t& b, vec_t& out); void vmul(const vec_t& a, const vec_t& b, vec_t& out); void vdiv(const vec_t& a, const vec_t& b, vec_t& out); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 581c5f564c..3fffb86054 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { galois::do_all(galois::iterate(g), [&](const GNode src) { // zero out this node's out values - deepgalois::math::clear(len, &out[src * len]); + deepgalois::math::clear_cpu(len, &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -21,16 +21,14 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // normalize b as well b = a * norm_factor[dst]; vec_t neighbor(len); - // scale the neighbor's data using the normalization - // factor + // scale the neighbor's data using the normalization factor deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update - deepgalois::math::vadd(len, &out[src * len], &neighbor[0], + deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] } else // add embeddings from neighbors together - deepgalois::math::vadd(len, &out[src * len], - &in[dst * len], + deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); // out[src] += in[dst] } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d174b716ac..01c313d97d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -33,18 +33,16 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t *b = new float_t[len]; mvmul(n, len, &Q[0], self, a); mvmul(n, len, &W[0], neighbors, b); - deepgalois::math::vadd(len, a, b, out); // out = W*self + Q*neighbors + deepgalois::math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) - dropout_mask = new unsigned[x * y]; + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; - out_temp = new float_t[x * z]; // same as pre_sup in original GCN code: - // https://github.com/chenxuhao/gcn/blob/master/gcn/layers.py + out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x } @@ -54,75 +52,42 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - galois::do_all(galois::iterate((size_t)0, x), [&](const auto& i) { - deepgalois::math::dropout(y, scale_, dropout_rate_, 
&in_data[i * y], - &dropout_mask[i * y], &in_temp[i * y]); - }, galois::loopname("dropout")); - deepgalois::math::matmul1D1D(x, z, y, in_temp, &layer::W[0], out_temp); // x*y; y*z; x*z - } else { - deepgalois::math::matmul1D1D(x, z, y, in_data, &layer::W[0], out_temp); // x*y; y*z; x*z - } + deepgalois::math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); // run relu activation on output if specified - if (act_) { - galois::do_all( - galois::iterate((size_t)0, x), - [&](const auto& i) { deepgalois::math::relu(z, &out_data[i * z], - &out_data[i * z]); }, - galois::loopname("relu")); - } + if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) { - // note; assumption here is that out_grad contains 1s or 0s via relu? - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - for (size_t j = 0; j < z; ++j) // TODO: use in_data or out_data? - // check if original data greater than 0; if so keep grad - out_temp[i * z + j] = out_data[i * z + j] > float_t(0) - ? out_grad[i * z + j] : float_t(0); - }, galois::loopname("d_relu")); - } else { - deepgalois::math::copy1D1D(x * z, out_grad, out_temp); // TODO: avoid copying - } + // note; assumption here is that out_grad contains 1s or 0s via relu? 
+ if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_temp); + else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights - // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - vec_t trans_W(z * y); // derivative of matmul needs transposed matrix - deepgalois::math::transpose(y, z, layer::W, trans_W); - deepgalois::math::matmul1D1D(x, y, z, out_temp, &trans_W[0], in_temp); // x*z; z*y -> x*y - // sgemm_cpu(x, y, z, 1.0, out_temp, trans_W, 0.0, in_temp); // x*z; z*y -> + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, - context->norm_factor); // x*x; x*y -> x*y - if (dropout_) { - galois::do_all(galois::iterate((size_t)0, x), - [&](const auto& i) { - deepgalois::math::d_dropout(y, scale_, &in_grad[i * y], - &dropout_mask[i * y], &in_grad[i * y]); - }, galois::chunk_size(), galois::steal(), - galois::loopname("d_dropout")); - } + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } - // calculate weight gradients by using previous layer's transpose multiplied - // by gradients from last back prop step - deepgalois::math::transpose(x, y, in_data, trans_data); // y*x - // updates THIS layer's weight gradients to update them - deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z + // calculate weight gradients using input data + // multiplied by gradients from last back prop step + //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x + //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } - #endif - } // namespace + diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 12a2907500..700a4ce688 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,5 +1,6 @@ #include "deepgalois/math_functions.hh" #include "galois/Timer.h" +#include "galois/Galois.h" #include extern "C" { @@ -14,10 +15,13 @@ namespace math { void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C) { + galois::StatTimer Tmatmul("MatMul"); + Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? 
N : K; cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); + Tmatmul.stop(); } // vector add @@ -34,23 +38,20 @@ void vadd(const vec_t& a, const vec_t& b, vec_t& out) { out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps( - &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) - out[i] = a[i] + b[i]; + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } #else void vadd(const vec_t& a, const vec_t& b, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; } -void vadd(size_t n, const float_t* a, const float_t* b, float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = a[i] + b[i]; +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { + for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; } #endif @@ -71,9 +72,9 @@ void clear(vec_t& in) { in[i] = 0; } -void clear(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) - in[i] = 0; +void clear_cpu(size_t n, float_t* in) { + for (size_t i = 0; i < n; i++) in[i] = 0; + // memset(in, 0, n*sizeof(float_t)); } void dropout(const float scale, const float dropout_rate, const vec_t& in, @@ -94,12 +95,12 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, out[i] = in[i] * masks[i] * scale; } -void dropout(size_t n, const float scale, const float dropout_rate, +void dropout_cpu(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { - for (size_t i = 0; i < n; ++i) + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { masks[i] = deepgalois::bernoulli(dropout_rate); - for (size_t i = 0; i < n; ++i) out[i] = in[i] * masks[i] * scale; + }, galois::loopname("dropout")); } void d_dropout(const float scale, const vec_t& in_diff, @@ -108,48 +109,46 @@ void d_dropout(const float scale, const vec_t& in_diff, out_diff[i] = in_diff[i] * masks[i] * scale; } -void d_dropout(size_t n, const float scale, const float_t* in_diff, - unsigned* masks, float_t* out_diff) { - for (size_t i = 0; i < n; ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout_cpu(size_t n, const float scale, const float_t* in, + unsigned* masks, float_t* out) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] * masks[i] * scale; + }, galois::loopname("d_dropout")); } void relu(const vec_t& in, vec_t& out) { for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0) + - negative_slope * std::min(in[i], (float_t)0); + out[i] = std::max(in[i], (float_t)0); } } -void relu(size_t n, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) +void relu_cpu(size_t n, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); + }, galois::loopname("relu")); } -void d_relu(const vec_t& in_diff, const vec_t& fv, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) { - out_diff[i] = in_diff[i] * ((fv[i] > (float_t)0) + - negative_slope * (fv[i] <= (float_t)0)); - } +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, 
float_t* out) { + // TODO: vectorize + // check if original data greater than 0; if so keep grad + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, galois::loopname("d_relu")); } void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } -void copy1D1D(size_t len, const float_t* in, float_t* out) { +void copy_cpu(size_t len, const float_t* in, float_t* out) { std::copy(in, in + len, out); } // num rows in A, C; num columns in B, C; num columns in A, rows in B void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - galois::StatTimer Tmatmul("MatMul"); - Tmatmul.start(); - const CBLAS_TRANSPOSE TransA = CblasNoTrans; - const CBLAS_TRANSPOSE TransB = CblasNoTrans; - sgemm_cpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); - Tmatmul.stop(); + sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } // TODO make parallel From 032827f23b3138662ef730c5670ea8c35dd50261 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 14:53:24 -0600 Subject: [PATCH 110/660] add csrmm_cpu impl --- .../include/deepgalois/layers/aggregator.h | 4 +- libdeepgalois/src/layers/aggregator.cpp | 57 ++++++++++--------- libdeepgalois/src/layers/aggregator.cu | 2 +- libdeepgalois/src/math_functions.cpp | 21 +++++++ 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 1d6a1acebb..ffdd3935a8 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -7,13 +7,15 @@ namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); +void update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, const float_t* norm_factor); } #else #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, +void update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } #endif diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 3fffb86054..94752742ed 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,33 +4,38 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + // zero out the output data + deepgalois::math::clear_cpu(g.size()*len, out); galois::do_all(galois::iterate(g), [&](const GNode src) { - // zero out this node's out values - deepgalois::math::clear_cpu(len, &out[src * len]); - float_t a = 0.0; - float_t b = 0.0; - - // get normalization factor if needed - if (norm) a = norm_factor[src]; - - // gather neighbors' embeddings - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) a = norm_factor[src]; + // gather neighbors' embeddings + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + vec_t neighbor(len); + // scale the 
neighbor's data using the normalization factor + deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update + deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], + &out[src * len]); // out[src] += in[dst] + } else + // add embeddings from neighbors together + deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], + &out[src * len]); // out[src] += in[dst] + } + }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); +} - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - vec_t neighbor(len); - // scale the neighbor's data using the normalization factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] - } else - // add embeddings from neighbors together - deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] - } - }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor) { + unsigned n = g.size(); + deepgalois::math::clear_cpu(n*len, out); + //csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + // (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); } #endif diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 06eac8bb75..ee5fe56b4d 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -68,7 +68,7 @@ void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* CudaTest("solving update_all kernel failed"); } -void deepgalois::update_all_cusparse(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.nnodes; CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 700a4ce688..d3f7d0fca0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -8,6 +8,12 @@ extern "C" { //#include } +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet";\ + exit(1); \ + } while(0); + namespace deepgalois { namespace math { @@ -24,6 +30,21 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float* C) { +#ifdef USE_MKL + const char *matdescra = "GXXCX";//6 bytes + const char transa = 'N'; + mkl_scsrmm (&transa, &M , &N, &K, &alpha , matdescra, + A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, + B, &N, &beta , C, &N); +#else + NOT_IMPLEMENTED; +#endif +} + // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd(const vec_t& a, const vec_t& b, vec_t& out) { From d832912238e873ead1c16238d07f76f2f2ae8497 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 15:25:43 -0600 Subject: [PATCH 111/660] fix agg --- libdeepgalois/src/layers/aggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp 
b/libdeepgalois/src/layers/aggregator.cpp index 94752742ed..6d7c7f6cbe 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,8 +5,8 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { // zero out the output data - deepgalois::math::clear_cpu(g.size()*len, out); galois::do_all(galois::iterate(g), [&](const GNode src) { + deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed From 2f03a623639a8b0aac97a2baa57aae9e4270e055 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 18:55:43 -0600 Subject: [PATCH 112/660] modify norm_factor --- libdeepgalois/include/deepgalois/context.h | 4 -- .../deepgalois/layers/graph_conv_layer.h | 3 +- libdeepgalois/src/context.cpp | 16 ----- libdeepgalois/src/context.cu | 49 --------------- libdeepgalois/src/layers/graph_conv_layer.cpp | 20 ++++-- libdeepgalois/src/layers/graph_conv_layer.cu | 61 ++++++++++++++++--- libdeepgalois/src/net.cpp | 1 - 7 files changed, 70 insertions(+), 84 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 2f769dc917..c906661d76 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,8 +44,6 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } - void norm_factor_counting(); - void norm_factor_counting_gpu(); size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -54,8 +52,6 @@ class Context { label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device - float_t* norm_factor; // normalization constant based on graph structure - float_t* d_norm_factor; // norm_factor on device #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index b2b80b69e8..d4935b85db 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -31,7 +31,7 @@ class graph_conv_layer : public layer { : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); - void init_gpu(); + void norm_factor_counting(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } //! 
Uses weights contained in this layer to update in_data (results from previous) @@ -65,6 +65,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y + float_t* norm_factor; // normalization constant based on graph structure, TODO: make it static // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 5e2ccf4c02..54ff169c37 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -91,22 +91,6 @@ void Context::add_selfloop(Graph &og, Graph &g) { float_t* Context::get_in_ptr() { return &h_feats[0]; } #endif -void Context::norm_factor_counting() { -#ifdef CPU_ONLY - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), - graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -#else - norm_factor_counting_gpu(); -#endif -} - // labels contain the ground truth (e.g. vertex classes) for each example // (num_examples x 1). Note that labels is not one-hot encoded vector and it can // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 29bec6f008..3ea78b0912 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,57 +22,8 @@ int64_t cluster_seedgen(void) { return seed; } -// computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) - norm_fac[i] = 0.0; - else - norm_fac[i] = 1.0 / temp; - } -} - -// TODO: make sure self-loop added for each vertex -// computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(src, n) { - float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); - float_t d_dst = float_t(graph.getOutDegree(dst)); - assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); - norm_fac[e] = d_src * d_dst; - } - } -} - namespace deepgalois { -void Context::norm_factor_counting_gpu() { - assert(graph_gpu.nnodes == n); - std::cout << "Pre-computing normalization factor (n=" << n << ")\n"; -#ifdef USE_CUSPARSE - int nnz = graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, d_norm_factor); - norm_factor_counting_edge<<>>( - n, graph_gpu, d_norm_factor); -#else - CUDA_CHECK(cudaMalloc((void**)&d_norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( - n, graph_gpu, d_norm_factor); -#endif - CudaTest("solving norm_factor_counting kernel failed"); -} - cublasHandle_t Context::cublas_handle_ = 0; cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 
01c313d97d..96ddf2339d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -14,18 +14,15 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); -#ifdef CPU_ONLY init(); -#else - init_gpu(); -#endif assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); + if (norm_) norm_factor_counting(); // pre-compute normalizing factor } #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, true, context->norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_factor); } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { @@ -46,6 +43,17 @@ void graph_conv_layer::init() { trans_data = new float_t[y * x]; // y*x } +void graph_conv_layer::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(context->graph_cpu.edge_begin(v), context->graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z @@ -78,7 +86,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, context->norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index dc6e0e72db..210dd8e54d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,22 +1,70 @@ #include "deepgalois/layers/graph_conv_layer.h" +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, + float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + namespace deepgalois { -void graph_conv_layer::init_gpu() { +void 
graph_conv_layer::init() { gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + deepgalois::update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #else - deepgalois::update_all(len, g, in, out, true, context->d_norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_factor); #endif } void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } +void graph_conv_layer::norm_factor_counting() { + std::cout << "debug\n"; + int n = x;//context->graph_gpu.nnodes; + std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; +#ifdef USE_CUSPARSE + int nnz = context->graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, norm_factor); + norm_factor_counting_edge<<>>( + n, context->graph_gpu, norm_factor); +#else + CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_node<<>>( + n, context->graph_gpu, norm_factor); +#endif + CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Done\n"; +} + // GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { //assert(y <= 128); // currently only support feature length <= 128 @@ -38,13 +86,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); #ifdef USE_CUSPARSE - update_all_cusparse(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, true, context->d_norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } - -} +} // namespace diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7407e99d9f..031541e060 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,7 +7,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); num_classes = context->read_labels(dataset_str); - context->norm_factor_counting(); // pre-compute normalizing factor num_epochs = epochs; //std::cout << "Reading label masks ... 
"; From 5b887666eda70b827afb418d7c9a9eccc8173d8e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 4 Mar 2020 19:41:51 -0600 Subject: [PATCH 113/660] fix bug --- libdeepgalois/include/deepgalois/context.h | 2 + .../deepgalois/layers/graph_conv_layer.h | 4 +- libdeepgalois/src/context.cpp | 17 +++--- libdeepgalois/src/context.cu | 50 ++++++++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 14 +---- libdeepgalois/src/layers/graph_conv_layer.cu | 52 +------------------ libdeepgalois/src/net.cpp | 1 + 7 files changed, 69 insertions(+), 71 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index c906661d76..a622a0f0f7 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,6 +44,7 @@ class Context { void DeviceQuery() {} bool CheckDevice(const int device_id) { return true; } int FindDevice(const int start_id = 0) { return 0; } + void norm_factor_counting(); size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -52,6 +53,7 @@ class Context { label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device + float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY Graph graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index d4935b85db..b3f9d16d2b 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -31,9 +31,9 @@ class graph_conv_layer : public layer { : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void init(); - void norm_factor_counting(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + void set_context(deepgalois::Context* ctx) { context = ctx; norm_factor = ctx->norm_factor; } //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -65,7 +65,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y - float_t* norm_factor; // normalization constant based on graph structure, TODO: make it static + float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 54ff169c37..8b5917b70c 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,18 +7,12 @@ Context::Context() : mode_(Context::CPU), solver_count_(1), solver_rank_(0), multiprocess_(false) {} Context::~Context() {} -#endif size_t Context::read_graph(std::string dataset_str, bool selfloop) { -#ifdef CPU_ONLY n = read_graph_cpu(dataset_str, "gr", selfloop); -#else - n = read_graph_gpu(dataset_str, selfloop); -#endif return n; } -#ifdef CPU_ONLY size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -60,6 +54,17 @@ void Context::genGraph(LGraph& lg, Graph& g) { } } +void Context::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} + void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 3ea78b0912..5ddbdc3dd8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -22,6 +22,34 @@ int64_t cluster_seedgen(void) { return seed; } +// computing normalization factor for each vertex +__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(i, n) { + float_t temp = sqrt(float_t(graph.getOutDegree(i))); + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; + } +} + +// TODO: make sure self-loop added for each vertex +// computing normalization factor for each edge +__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { + CUDA_KERNEL_LOOP(src, n) { + float_t d_src = float_t(graph.getOutDegree(src)); + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + index_type start = graph.edge_begin(src); + index_type end = graph.edge_end(src); + for (index_type e = start; e != end; e++) { + index_type dst = graph.getEdgeDst(e); + float_t d_dst = float_t(graph.getOutDegree(dst)); + assert(d_dst != 0.0); + d_dst = 1.0 / sqrt(d_dst); + norm_fac[e] = d_src * d_dst; + } + } +} + namespace deepgalois { cublasHandle_t Context::cublas_handle_ = 0; @@ -52,6 +80,28 @@ Context::~Context() { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); } +size_t Context::read_graph(std::string dataset_str, bool selfloop) { + n = read_graph_gpu(dataset_str, selfloop); + return n; +} + +void Context::norm_factor_counting() { + std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; +#ifdef USE_CUSPARSE + int nnz = graph_gpu.nedges; + CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); + init_const_kernel<<>>(nnz, 0.0, norm_factor); + norm_factor_counting_edge<<>>( + n, graph_gpu, norm_factor); +#else + CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); + norm_factor_counting_node<<>>( + n, graph_gpu, norm_factor); +#endif + CudaTest("solving norm_factor_counting kernel failed"); + std::cout << "Done\n"; +} + void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 96ddf2339d..189f396cf8 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -17,7 +17,6 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, init(); assert(dropout_rate_ < 1.); scale_ = 1. / (1. - dropout_rate_); - if (norm_) norm_factor_counting(); // pre-compute normalizing factor } #ifdef CPU_ONLY @@ -43,17 +42,6 @@ void graph_conv_layer::init() { trans_data = new float_t[y * x]; // y*x } -void graph_conv_layer::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(context->graph_cpu.edge_begin(v), context->graph_cpu.edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -} - // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // input: x*y; W: y*z; output: x*z @@ -86,7 +74,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, true, norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, norm_, norm_factor); // x*x; x*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 210dd8e54d..69630b50f9 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,34 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" -// computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, - float_t* norm_fac) { - CUDA_KERNEL_LOOP(i, n) { - float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; - } -} - -// TODO: make sure self-loop added for each vertex -// computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { - CUDA_KERNEL_LOOP(src, n) { - float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); - float_t d_dst = 
float_t(graph.getOutDegree(dst)); - assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); - norm_fac[e] = d_src * d_dst; - } - } -} - namespace deepgalois { void graph_conv_layer::init() { @@ -46,25 +17,6 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } -void graph_conv_layer::norm_factor_counting() { - std::cout << "debug\n"; - int n = x;//context->graph_gpu.nnodes; - std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; -#ifdef USE_CUSPARSE - int nnz = context->graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, norm_factor); - norm_factor_counting_edge<<>>( - n, context->graph_gpu, norm_factor); -#else - CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( - n, context->graph_gpu, norm_factor); -#endif - CudaTest("solving norm_factor_counting kernel failed"); - std::cout << "Done\n"; -} - // GPU forward: compute output features void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { //assert(y <= 128); // currently only support feature length <= 128 @@ -86,9 +38,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); #ifdef USE_CUSPARSE - update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); + update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, true, norm_factor); + update_all(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); #endif if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 031541e060..30f0e86488 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -111,6 +111,7 @@ void Net::construct_layers() { append_conv_layer(1); // hidden1 layer append_out_layer(2); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data + context->norm_factor_counting(); set_contexts(); } From cee94061ab951b7ec7996fbaeb07027708c1b877 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Mar 2020 18:10:21 -0600 Subject: [PATCH 114/660] gluon gradients wrapper (WIP: need mirror setup) --- .../deepgalois/layers/GluonGradients.h | 112 ++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/layers/GluonGradients.h diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h new file mode 100644 index 0000000000..4131fd22d4 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -0,0 +1,112 @@ +#ifndef __GLUON_GRADIENTS__ +#define __GLUON_GRADIENTS__ + +#include "deepgalois/types.h" + +/** + * Wraps the weight gradients and provides an interface for Gluon to + * synchronize them during distributed execution. + */ +class GluonGradients { +private: + //! Data type used for gradients + using GradientType = float_t; + //! type that's being used by the gradient vector + using GradientVecType = vec_t; + + GradientVecType& _gradients; + size_t _numWeights; + size_t _numOwned; + + //! 
my nodes whose's masters are on other hosts; global ids + std::vector> mirrorNodes; + // TODO save mirror ranges here as well + +public: + /** + * Save weight gradients + number of them (i.e. size). + * Then setup mirror metadata for Gluon to use during setup. + */ + GluonGradients(GradientVecType& gradients, size_t numWeights) + : _gradients(gradients), _numWeights(numWeights) { + } + + //! Size is number of weights + size_t size() const { + return _numWeights; + } + + //! Global size is number of weights + size_t globalSize() const { + return _numWeights; + } + + //! Return the weights owned by this host + size_t numMasters const { + return _numOwned; + } + + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t nodeID) const { + return nodeID; + } + + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t nodeID) const { + return nodeID; + } + + //! Return local weight w + GradientType getData(uint32_t w) { + return _gradients[w]; + } + + std::vector> getMirrorRanges() const { + // TODO + } + + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { + return mirrorNodes; + } + + //! clears the vector + // TODO return to this when we start distributing on GPUs + void deallocate() { + _gradients.clear(); + } + + // Essentially no-op functions follow + + //! no nodes with edges + size_t getNumNodesWithEdges() { + return 0; + } + + //! No edges; not a vertex cut + bool is_vertex_cut() const { + return false; + } + + //! no edges, return 0 + unsigned edge_begin(uint32_t dummy) { + return 0; + } + + //! no edges, return 0 + unsigned edge_end(uint32_t dummy) { + return 0; + } + + //! no edges, return 0 + unsigned getEdgeDst(uint32_t dummy) { + return 0; + } + + //! 
no edges, return 0 + unsigned getEdgeData(uint32_t dummy) { + return 0; + } +}; + +#endif // end header guard From 798edcb907771557aa04ac059bc5b5d9091a1bf1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Mar 2020 18:15:44 -0600 Subject: [PATCH 115/660] removing redundant includes --- libdeepgalois/include/deepgalois/layers/layer.h | 13 ------------- libdeepgalois/include/deepgalois/lgraph.h | 4 ---- 2 files changed, 17 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 19bb176f90..b2e06d5d61 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,19 +9,6 @@ * Reused/revised under 3-BSD */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 7a86960338..029d12d44b 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -4,10 +4,6 @@ // defines the Learning Graph (LGraph) data structure #include #include -#include -#include -#include -#include namespace deepgalois { From d04129b541e1fcab955832863e77bdd36fc8965b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 5 Mar 2020 19:33:05 -0600 Subject: [PATCH 116/660] fix matmul&aggregate order --- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 189f396cf8..0457936b84 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -64,17 +64,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note; assumption here is that out_grad contains 1s or 0s via relu? 
- if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_temp); - else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); + //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_temp); // x*z; z*y -> // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(y, context->graph_cpu, in_temp, in_grad, norm_, norm_factor); // x*x; x*y -> x*y + deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // derivative of matmul needs transposed matrix + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } @@ -82,7 +82,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // multiplied by gradients from last back prop step //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } #endif } // namespace From 063046ee79ab3ee1445c4158eb72f2237660a665 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 06:59:52 -0600 Subject: [PATCH 117/660] update cmake --- libdeepgalois/CMakeLists.txt | 6 ++---- lonestargnn/CMakeLists.txt | 4 ++-- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 34e094ce14..da85c18185 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -17,7 +17,7 @@ link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() #set( CMAKE_VERBOSE_MAKEFILE on ) @@ -45,8 +45,6 @@ else() target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) set_target_properties(dg_gpu PROPERTIES COMPILE_FLAGS "-DTHRUST_DEVICE_SYSTEM=THRUST_DEVICE_SYSTEM_CUDA") set_target_properties(dg_gpu PROPERTIES CUDA_SEPERABLE_COMPILATION ON) - #cuda_compile(MF_O src/math_functions.cu) - #cuda_compile(AGG_O src/aggregator.cu) endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") @@ -63,7 +61,7 @@ set(sources ) add_library(dg_cpu STATIC ${sources}) -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) target_link_libraries(dg_cpu galois_shmem gllvm) else() target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index e48887e261..3f6cb7672f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -7,7 +7,7 @@ 
include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) include_directories(${CUDA_INC}) -if(NOT USE_CPU) +if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() @@ -15,7 +15,7 @@ SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) include_directories(${OPENBLAS_INC}) link_directories(${OPENBLAS_LIB}) -if(USE_CPU) +if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() From a2b5d625ddb2e88bb65133430074ff91ef269d5f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 07:25:54 -0600 Subject: [PATCH 118/660] fix cmake --- CMakeLists.txt | 19 ------------------- lonestargnn/gcn/CMakeLists.txt | 2 +- 2 files changed, 1 insertion(+), 20 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 79555a0b31..f1b0489c10 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -502,27 +502,8 @@ if(USE_PANGOLIN) add_subdirectory(lonestarmine) endif(USE_PANGOLIN) if(USE_DEEPGALOIS) - SET(CUDA_SEPARABLE_COMPILATION ON) - find_package(CUDA REQUIRED) - set(CUDA_PROPAGATE_HOST_FLAGS off) - set(CUDA_SEPARABLE_COMPILATION on) - set(CUDA_HOST_COMPILER g++) - string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) - string(REPLACE "," ";" GENCODES ${GENCODES}) - foreach(GENCODE ${GENCODES}) - set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) - endforeach() - list(APPEND CUDA_NVCC_FLAGS "-std=c++11") - cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") add_subdirectory(libdeepgalois) add_subdirectory(lonestargnn) - set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers - cuda_include_directories("${CUB_ROOT}") - link_directories(${CMAKE_SOURCE_DIR}/cub) - set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers - cuda_include_directories("${MGPU_ROOT}/src") - link_directories(${CMAKE_SOURCE_DIR}/moderngpu/src) - add_subdirectory(libgpu) endif(USE_DEEPGALOIS) if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 3d25bb3966..c3fb95c07f 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,6 +1,6 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) -if(NOT USE_CPU) +if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn dg_gpu) target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) endif() From 3d7e20e94b9a81a22c0369523bad7cc941b8b9f7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 10:37:52 -0600 Subject: [PATCH 119/660] add softmax_loss_layer.cu --- libdeepgalois/CMakeLists.txt | 2 +- .../include/deepgalois/math_functions.hh | 2 - .../src/layers/softmax_loss_layer.cpp | 25 +------- .../src/layers/softmax_loss_layer.cu | 62 +++++++++++++++++++ libdeepgalois/src/math_functions.cu | 29 +-------- libdeepgalois/src/net.cu | 45 ++++++++++++++ 6 files changed, 111 insertions(+), 54 deletions(-) create mode 100644 libdeepgalois/src/layers/softmax_loss_layer.cu create mode 100644 libdeepgalois/src/net.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index da85c18185..1d53f24bd5 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -34,7 +34,7 @@ else() link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES src/layers/graph_conv_layer.cu - #src/layers/softmax_loss_layer.cu + src/layers/softmax_loss_layer.cu 
src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 26639f6f55..06dd72c528 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -139,8 +139,6 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss); acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index eda3de054d..4146dcd17f 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -2,19 +2,16 @@ namespace deepgalois { +#ifdef CPU_ONLY softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); -#ifdef CPU_ONLY loss = new float_t[in_dims[0]]; // error for each sample -#else - float_malloc_device(in_dims[0], loss); -#endif } -#ifdef CPU_ONLY + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, @@ -69,24 +66,6 @@ acc_t softmax_loss_layer::get_masked_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#else // GPU implementation -void softmax_loss_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - init_const_gpu(input_dims[0], 0.0, loss); - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); -} - -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); -} - -acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); -} #endif } // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu new file mode 100644 index 0000000000..e9216b1ae2 --- /dev/null +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -0,0 +1,62 @@ +#include "deepgalois/layers/softmax_loss_layer.h" +#include "gg.h" +#include "ggcuda.h" + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + // total += loss[begin+i]; + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, + float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + 
CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} + +namespace deepgalois { + +softmax_loss_layer::softmax_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + float_malloc_device(in_dims[0], loss); +} + +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); +} + +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, + context->d_labels, out_data, in_grad); +} + +acc_t softmax_loss_layer::get_masked_loss() { + return masked_avg_loss(begin_, end_, count_, d_masks_, loss); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9131bf9509..e899f16226 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -357,33 +357,6 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy kernel failed"); } -__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) - // total += loss[begin+i]; - total.reduce(loss[begin + i]); - } - total.thread_exit>(local_loss); -} - -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss) { - assert(count > 0); - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>( - begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; -} - // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { float_t max = x[0]; @@ -425,7 +398,7 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); masked_accuracy_kernel<<>>( num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_avg_loss kernel failed"); + CudaTest("solving masked_accuracy kernel failed"); cudaDeviceSynchronize(); return *(total_accuracy.cpu_rd_ptr()) / count; } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu new file mode 100644 index 0000000000..28cd021df9 --- /dev/null +++ b/libdeepgalois/src/net.cu @@ -0,0 +1,45 @@ +#include "deepgalois/net.h" +#include "gg.h" +#include "ggcuda.h" + +__global__ void masked_accuracy_kernel(int num_classes, int begin, + int end, mask_t* masks, + float_t* preds, label_t* labels, + HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage + local_accuracy; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); + if (pred == labels[begin + i]) + total.reduce(1.0); + } + } + total.thread_exit>(local_accuracy); +} + +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, + 
int count, mask_t* masks, float_t* preds, + label_t* labels) { + assert(count > 0); + HGAccumulator accuracy_accum; + Shared total_accuracy = Shared(1); + *(total_accuracy.cpu_wr_ptr()) = 0; + accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); + masked_accuracy_kernel<<>>( + num_classes, begin, end, masks, preds, labels, accuracy_accum); + CudaTest("solving masked_accuracy kernel failed"); + cudaDeviceSynchronize(); + return *(total_accuracy.cpu_rd_ptr()) / count; +} + +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks) { + return masked_accuracy_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); +} + From 2393762b5f1ebcbe1e23968bda6731a636b2112b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 10:54:35 -0600 Subject: [PATCH 120/660] add net.cu --- libdeepgalois/CMakeLists.txt | 1 + .../include/deepgalois/math_functions.hh | 3 -- libdeepgalois/src/math_functions.cu | 45 ------------------- libdeepgalois/src/net.cpp | 10 +---- libdeepgalois/src/net.cu | 22 +++++++-- 5 files changed, 21 insertions(+), 60 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1d53f24bd5..2cf03d281f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,6 +40,7 @@ else() src/optimizer.cu src/context.cu src/node.cu + src/net.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas -lcusparse -lcurand) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 06dd72c528..158455f73b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -139,9 +139,6 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* masks, float_t* preds, - label_t* labels); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& loss); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e899f16226..531480091d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -357,48 +357,3 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy kernel failed"); } -// the arguments of the maxima -__device__ int argmax_device(const int n, const float_t* x) { - float_t max = x[0]; - int max_ind = 0; - for (int i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - -__global__ void masked_accuracy_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage - local_accuracy; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, - preds + (begin + i) * num_classes); - if (pred == labels[begin + i]) - total.reduce(1.0); - } - } - total.thread_exit>(local_accuracy); -} - -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* 
masks, float_t* preds, - label_t* labels) { - assert(count > 0); - HGAccumulator accuracy_accum; - Shared total_accuracy = Shared(1); - *(total_accuracy.cpu_wr_ptr()) = 0; - accuracy_accum.rv = total_accuracy.gpu_wr_ptr(); - masked_accuracy_kernel<<>>( - num_classes, begin, end, masks, preds, labels, accuracy_accum); - CudaTest("solving masked_accuracy kernel failed"); - cudaDeviceSynchronize(); - return *(total_accuracy.cpu_rd_ptr()) / count; -} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 30f0e86488..a194bd43d7 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -115,9 +115,8 @@ void Net::construct_layers() { set_contexts(); } -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { #ifdef CPU_ONLY +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { AccumF accuracy_all; accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { @@ -130,12 +129,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); return accuracy_all.reduce() / (acc_t)count; -#else - return masked_accuracy_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); -#endif } +#endif } // namespace deepgalois diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 28cd021df9..947967d07c 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -2,6 +2,19 @@ #include "gg.h" #include "ggcuda.h" +// the arguments of the maxima +__device__ int argmax_device(const int n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (int i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + __global__ void masked_accuracy_kernel(int num_classes, int begin, int end, mask_t* masks, float_t* preds, label_t* labels, @@ -20,9 +33,9 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, total.thread_exit>(local_accuracy); } -acc_t masked_accuracy_gpu(int num_classes, int begin, int end, - int count, mask_t* masks, float_t* preds, - label_t* labels) { +//acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); +acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { assert(count > 0); HGAccumulator accuracy_accum; Shared total_accuracy = Shared(1); @@ -35,6 +48,7 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, return *(total_accuracy.cpu_rd_ptr()) / count; } +namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { return masked_accuracy_gpu(num_classes, begin, end, count, @@ -42,4 +56,4 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->d_labels); } - +} From 92d1687d757494a36f47213328d0d64fbee0ee71 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 11:15:19 -0600 Subject: [PATCH 121/660] fix aggregate --- libdeepgalois/src/layers/graph_conv_layer.cpp | 9 ++++----- libdeepgalois/src/layers/graph_conv_layer.cu | 13 +++++++------ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 0457936b84..3800e6d2ad 100644 --- 
a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -67,12 +67,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + // x*y NOTE: since graph is symmetric, the derivative is the same + deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer - // x*y NOTE: since graph is symmetric, the derivative is the same - deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z // derivative of matmul needs transposed matrix deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); @@ -80,9 +81,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // calculate weight gradients using input data // multiplied by gradients from last back prop step - //deepgalois::math::transpose(x, y, in_data, trans_data); // x*y -> y*x - //deepgalois::math::matmul1D1D(y, z, x, trans_data, out_temp, &layer::weight_grad[0]); // y*x; x*z; y*z - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } #endif } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 69630b50f9..5717d37af8 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -33,17 +33,18 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_temp); - else copy_gpu(x * z, out_grad, out_temp); - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_temp); + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); #ifdef USE_CUSPARSE - update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + update_all_csrmm(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); #else - update_all(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + update_all(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); #endif + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } + } // namespace + From fab9f9eeadddcfb9780b09f7ba3dcd98d0b284ba Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 13:44:33 -0600 Subject: [PATCH 122/660] linking dist libs with dg_cpu, reorg cmakelists --- libdeepgalois/CMakeLists.txt | 42 
+++++++++++++++++++++++++----------- 1 file changed, 30 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2cf03d281f..ae00edabc0 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,26 +1,29 @@ cmake_minimum_required(VERSION 2.8) +# open blas SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers -set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) include_directories(${OPENBLAS_INC}) +link_directories(${OPENBLAS_LIB}) + +# galois base libs include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) -include_directories(${CUDA_INC}) -include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) -include_directories("${CUB_ROOT}") -include_directories("${MGPU_ROOT}/src") -link_directories(${OPENBLAS_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -set(USE_CPU ON CACHE BOOL "Build DeepGalois without CUDA support") if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") else() - #set( CMAKE_VERBOSE_MAKEFILE on ) + # hetero path + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + include_directories("${CUB_ROOT}") + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + include_directories("${MGPU_ROOT}/src") + + SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) + include_directories(${CUDA_INC}) + include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + find_package(CUDA REQUIRED) set(CUDA_SEPARABLE_COMPILATION ON) set(CUDA_PROPAGATE_HOST_FLAGS OFF) @@ -30,8 +33,10 @@ else() set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) + SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) link_directories(${CUDA_LIB}) link_directories(${CMAKE_SOURCE_DIR}/libgpu) + set(CUDA_SOURCES src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu @@ -70,7 +75,10 @@ endif() target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) -target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) + +if(ENABLE_HETERO_GALOIS) + target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) +endif() target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include @@ -78,6 +86,16 @@ target_include_directories(dg_cpu PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include ) +# dist galois setup/linking to dg_cpu +if(ENABLE_DIST_GALOIS) + target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_cpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) +endif() + set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On From cf6b1541be59f2406c7701c2707e106ebb3e457f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 6 Mar 2020 15:59:47 -0600 Subject: [PATCH 123/660] add avx2 for mul_scalar --- libdeepgalois/src/math_functions.cpp | 41 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git 
a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index d3f7d0fca0..cd21a6b1a0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -45,48 +45,47 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } +const size_t vec_len = 8; // vector add #if defined(__AVX__) || defined(__AVX2__) -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - // for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; - size_t n = out.size(); - size_t vec_len = 8; - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps( - &out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) - out[i] = a[i] + b[i]; -} - void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { - size_t vec_len = 8; const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; } + +void vadd(const vec_t& a, const vec_t& b, vec_t& out) { + size_t n = out.size(); + vadd_cpu(n, &a[0], &b[0], &out[0]); +} #else void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = a[i] + b[i]; + for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; } void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; } #endif +#if defined(__AVX__) || defined(__AVX2__) +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { + const size_t alignedN = n - n % vec_len; + const __m256 scal = _mm256_set1_ps(alpha); + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); + for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; +} +#else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] *= alpha; + for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; } -void mul_scalar(size_t n, const float_t alpha, const float_t* in, - float_t* out) { - for (size_t i = 0; i < n; ++i) - out[i] = alpha * in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } +#endif void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) From b963c932210790e5074a83d606362c1ac729a213 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:12:48 -0600 Subject: [PATCH 124/660] finishing up mirror setup for gluon gradients --- .../deepgalois/layers/GluonGradients.h | 95 +++++++++++++++++-- 1 file changed, 86 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 4131fd22d4..578226552e 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -1,8 +1,12 @@ #ifndef __GLUON_GRADIENTS__ #define __GLUON_GRADIENTS__ +#include "galois/gstl.h" +#include "galois/runtime/Network.h" #include "deepgalois/types.h" +namespace deepgalois { + /** * Wraps the weight gradients and provides an interface for Gluon to * synchronize them during distributed execution. 
@@ -15,13 +19,25 @@ class GluonGradients { using GradientVecType = vec_t; GradientVecType& _gradients; + //! number of weight gradients size_t _numWeights; + //! number of gradients this host is responsible for size_t _numOwned; - //! my nodes whose's masters are on other hosts; global ids - std::vector> mirrorNodes; - // TODO save mirror ranges here as well + //! My host ID + unsigned _myHost; + //! Total num hosts in system + unsigned _totalHosts; + + //! first node I own + unsigned _beginMaster; + //! last node I own (contiguous chunk) + unsigned _endMaster; + //! my nodes whose's masters are on other hosts; global ids + std::vector> _mirrorNodes; + //! nodes that are mirrors on this host + std::vector> _mirrorRanges; public: /** * Save weight gradients + number of them (i.e. size). @@ -29,6 +45,53 @@ class GluonGradients { */ GluonGradients(GradientVecType& gradients, size_t numWeights) : _gradients(gradients), _numWeights(numWeights) { + _myHost = galois::runtime::getSystemNetworkInterface().ID; + _totalHosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + _mirrorNodes.resize(_totalHosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < _totalHosts; h++) { + std::pair curRange = + galois::block_range((size_t)0, _numWeights, h, _totalHosts); + + if (h != _myHost) { + // setup mirrors for the host h which is just the list of IDs + size_t curW = curRange.first; + size_t lastW = curRange.second; + size_t numW = lastW - curW; + + // set mirrors for host h + _mirrorNodes[h].reserve(numW); + for (; curW < lastW; curW++) { + _mirrorNodes[h].push_back(curW); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + _beginMaster = curRange.first; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; + + // first range is 0 to begin master + if (_beginMaster > 0) { + galois::gInfo("[", _myHost, "] Mirror range ", 0, " to ", + _beginMaster); + _mirrorRanges.emplace_back(0, _beginMaster); + } + + // second range is endMaster to end + if (_endMaster < _numWeights) { + galois::gInfo("[", _myHost, "] Mirror range ", _endMaster, " to ", + _numWeights); + _mirrorRanges.emplace_back(_endMaster, _numWeights); + } + } + } + + galois::gInfo("[", _myHost, "] This host owns ", _beginMaster, " to ", + _endMaster); } //! Size is number of weights @@ -42,10 +105,20 @@ class GluonGradients { } //! Return the weights owned by this host - size_t numMasters const { + size_t numMasters() const { return _numOwned; } + //! Return host ID + unsigned myHostID() const { + return _myHost; + } + + //! Return num hosts in the system + unsigned numHosts() const { + return _totalHosts; + } + //! GID is same as LID since all hosts have all weights uint32_t getGID(const uint32_t nodeID) const { return nodeID; @@ -57,21 +130,23 @@ class GluonGradients { } //! Return local weight w - GradientType getData(uint32_t w) { + GradientType getData(uint32_t w) const { return _gradients[w]; } - std::vector> getMirrorRanges() const { - // TODO + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return _mirrorRanges; } //! Return mirror nodes for each host from this host's point of view std::vector>& getMirrorNodes() { - return mirrorNodes; + return _mirrorNodes; } //! 
clears the vector - // TODO return to this when we start distributing on GPUs + // TODO return to this when we start distributing on GPUs; wrapper + // end probably shouldn't be managing this MAYBE void deallocate() { _gradients.clear(); } @@ -109,4 +184,6 @@ class GluonGradients { } }; +} + #endif // end header guard From 6775ca2eefc82c151beb84535fa7ce9de4e847c6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:13:01 -0600 Subject: [PATCH 125/660] initialize gluon substrate in conv layer --- libdeepgalois/include/deepgalois/layers/layer.h | 12 +++++++++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b2e06d5d61..c394ac7bbf 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,12 +9,16 @@ * Reused/revised under 3-BSD */ -#include "deepgalois/layers/node.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/context.h" #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" +#include "deepgalois/layers/node.h" +#ifdef GALOIS_USE_DIST +#include "deepgalois/layers/GluonGradients.h" +#include "galois/graphs/GluonSubstrate.h" +#endif namespace deepgalois { @@ -145,6 +149,12 @@ class layer : public deepgalois::node { mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 deepgalois::Context* context; + +#ifdef GALOIS_USE_DIST + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif }; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3800e6d2ad..48051e1ab7 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -36,6 +36,17 @@ void graph_conv_layer::init() { rand_init_matrix(y, z, W); // randomly initialize trainable parameters // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); + +#ifdef GALOIS_USE_DIST + // setup gluon + layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, + y * z); + layer::syncSub = + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); +#endif + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; From 9d2b456f319cc5c71e8d9ed6ef4d1cf798f9d7a2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 16:49:26 -0600 Subject: [PATCH 126/660] LonestarGnn start uses DistSys as necessary --- lonestargnn/gcn/gcn.cpp | 5 +++++ lonestargnn/lonestargnn.h | 13 +++++++++++++ 2 files changed, 18 insertions(+) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 3357fd904e..086b6701de 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -7,7 +7,12 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { +#ifndef GALOIS_USE_DIST galois::SharedMemSys G; +#else + galois::DistMemSys G; +#endif + LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train // read network, features, ground truth, initialize metadata diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/lonestargnn.h index a04905b5cb..e932738636 100644 --- 
a/lonestargnn/lonestargnn.h +++ b/lonestargnn/lonestargnn.h @@ -10,6 +10,10 @@ #include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include +#ifdef GALOIS_USE_DIST +#include "galois/DistGalois.h" +#include "galois/runtime/Network.h" +#endif namespace cll = llvm::cl; static cll::opt @@ -80,6 +84,11 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); numThreads = galois::setActiveThreads(numThreads); galois::runtime::setStatFile(statFile); + +#ifdef GALOIS_USE_DIST + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.ID == 0) { +#endif LonestarGnnPrintVersion(); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; @@ -99,6 +108,10 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); +#ifdef GALOIS_USE_DIST + } +#endif + char name[256]; gethostname(name, 256); galois::runtime::reportParam("(NULL)", "Hostname", name); From 1816956da3eed26543b93462a9db6fecd5cf18a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 17:37:54 -0600 Subject: [PATCH 127/660] dummy sync structures for weight gradients --- .../deepgalois/layers/GluonGradients.h | 2 +- .../deepgalois/layers/GradientSyncStructs.h | 51 +++++++++++++++++++ .../include/deepgalois/layers/layer.h | 3 +- 3 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 578226552e..1643a62027 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -130,7 +130,7 @@ class GluonGradients { } //! Return local weight w - GradientType getData(uint32_t w) const { + GradientType& getData(uint32_t w) const { return _gradients[w]; } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h new file mode 100644 index 0000000000..e38eb5192a --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -0,0 +1,51 @@ +#ifndef __GRAD_SYNC_STRUCT__ +#define __GRAD_SYNC_STRUCT__ + +#include "deepgalois/types.h" + +struct GradientSync { + using ValTy = float_t; + + static ValTy extract(uint32_t node_id, float_t& weight) { + return weight; + } + + static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { + // TODO merge function here + // for now make sure the weights are close enough + if (std::abs(weight - y) > 0.00001) { + galois::gInfo("weight ", node_id, " not consistent with one received"); + } + + return true; + } + + //! reset weight to 0 + static void reset(uint32_t node_id, float_t &weight) { + weight = 0; + } + + //! 
save weight + static void setVal(uint32_t node_id, float_t &weight, ValTy y) { + weight = y; + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, + DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + +}; + +// TODO bitset; might have to do it manually +//GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +#endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c394ac7bbf..a91f495915 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -16,8 +16,9 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" #ifdef GALOIS_USE_DIST -#include "deepgalois/layers/GluonGradients.h" #include "galois/graphs/GluonSubstrate.h" +#include "deepgalois/layers/GluonGradients.h" +#include "deepgalois/layers/GradientSyncStructs.h" #endif namespace deepgalois { From 67b3b1fc211e6c99ee29a9f2229def9bb5bcb3c9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Mar 2020 18:03:55 -0600 Subject: [PATCH 128/660] sync of weight gradients called + working --- .../include/deepgalois/layers/GradientSyncStructs.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 13 ++++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index e38eb5192a..df88352bcf 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -43,7 +43,6 @@ struct GradientSync { return false; } static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } - }; // TODO bitset; might have to do it manually diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 48051e1ab7..ceeae8605d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -86,13 +86,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - if (dropout_) deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); + deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, + out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + if (dropout_) { + deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, + in_grad); + } } // calculate weight gradients using input data // multiplied by gradients from last back prop step - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, + out_temp, 0.0, 
&layer::weight_grad[0]); // y*x; x*z; y*z + layer::syncSub->sync("GradientSync"); + //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } #endif } // namespace From 65e65d75c0bdecab7f25909aff287930dee3c426 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 12:30:29 -0500 Subject: [PATCH 129/660] clean up CMake for deepgalois; fixed gpu issue --- libdeepgalois/CMakeLists.txt | 22 ++++++++++------------ 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index ae00edabc0..bdc0f97942 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -66,20 +66,9 @@ set(sources src/net.cpp ) add_library(dg_cpu STATIC ${sources}) - -if(NOT ENABLE_HETERO_GALOIS) - target_link_libraries(dg_cpu galois_shmem gllvm) -else() - target_link_libraries(dg_cpu galois_shmem gllvm galois_gpu) -endif() - +target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu -lopenblas) - -if(ENABLE_HETERO_GALOIS) - target_link_libraries(dg_cpu -lcudart -lcublas -lcusparse -lcurand) -endif() - target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include @@ -94,6 +83,15 @@ if(ENABLE_DIST_GALOIS) ${CMAKE_SOURCE_DIR}/libcusp/include ${CMAKE_SOURCE_DIR}/libgluon/include ) + + if(ENABLE_HETERO_GALOIS) + target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) + target_include_directories(dg_gpu PUBLIC + ${CMAKE_SOURCE_DIR}/libdist/include + ${CMAKE_SOURCE_DIR}/libcusp/include + ${CMAKE_SOURCE_DIR}/libgluon/include + ) + endif() endif() set_target_properties(dg_cpu PROPERTIES From 02d8f2082bcdd1b0be0dec51ce4f2f6049ef12a1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 9 Mar 2020 13:53:01 -0500 Subject: [PATCH 130/660] add cusparse flag --- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/context.cpp | 4 ++-- libdeepgalois/src/context.cu | 3 ++- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 118f04bd04..1a32d5a47d 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -26,4 +26,5 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: #define BLOCK_SIZE 256 #define WARP_SIZE 32 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +#define USE_CUSPARSE #endif diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8b5917b70c..dc959f5876 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -70,7 +70,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { g.constructNodes(); /* for (size_t src = 0; src < og.size(); src++) { - g.getData(src) = 1; + //g.getData(src) = 1; auto begin = og.edge_begin(src); auto end = og.edge_end(src); g.fixEndEdge(src, end+src+1); @@ -90,7 +90,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { } else g.constructEdge(e+src+1, dst, 0); } } - */ + //*/ } float_t* Context::get_in_ptr() { return &h_feats[0]; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 5ddbdc3dd8..564c0aaa08 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -3,6 +3,7 @@ #include #include #include "deepgalois/context.h" +#include "deepgalois/math_functions.hh" // random seeding int64_t cluster_seedgen(void) { @@ -90,7 +91,7 @@ 
void Context::norm_factor_counting() { #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_kernel<<>>(nnz, 0.0, norm_factor); + init_const_gpu(nnz, 0.0, norm_factor); norm_factor_counting_edge<<>>( n, graph_gpu, norm_factor); #else diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 5717d37af8..b2a9209bd4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,7 +8,7 @@ void graph_conv_layer::init() { void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(y, context->graph_gpu, in_temp, in_grad, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else deepgalois::update_all(len, g, in, out, norm_, norm_factor); #endif From 64bb52ded981880cb5e178cc20fc2f737317ba48 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 9 Mar 2020 15:30:20 -0500 Subject: [PATCH 131/660] fix bug in adding selfloop --- libdeepgalois/src/context.cpp | 4 ++++ libdeepgalois/src/context.cu | 3 +++ libgpu/include/graph_gpu.h | 4 ++++ 3 files changed, 11 insertions(+) diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index dc959f5876..79bd0be985 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -75,6 +75,10 @@ void Context::add_selfloop(Graph &og, Graph &g) { auto end = og.edge_end(src); g.fixEndEdge(src, end+src+1); bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } for (auto e = begin; e != end; e++) { auto dst = og.getEdgeDst(e); if (!self_inserted) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 564c0aaa08..d727904107 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -36,6 +36,7 @@ __global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_f // computing normalization factor for each edge __global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { + assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); @@ -43,6 +44,8 @@ __global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_f index_type end = graph.edge_end(src); for (index_type e = start; e != end; e++) { index_type dst = graph.getEdgeDst(e); + if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); d_dst = 1.0 / sqrt(d_dst); diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index da420ea416..3f2c88a308 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -123,6 +123,10 @@ struct CSRGraph { index_type start = row_start[i]; index_type end = row_start[i+1]; bool selfloop_inserted = false; + if (start == end) { + new_edge_dst[start+i] = i; + continue; + } for (index_type e = start; e != end; e++) { index_type dst = edge_dst[e]; if (!selfloop_inserted) { From ae75bd890398396f067ddbe995ca2ada02e2cf3c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 12:53:26 -0500 Subject: [PATCH 132/660] seed argument to rand_init_matrix --- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 4 ++-- 1 
file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index b3f9d16d2b..4904b13905 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -68,9 +68,9 @@ class graph_conv_layer : public layer { float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); - std::default_random_engine rng; + std::default_random_engine rng(seed); std::uniform_real_distribution dist(-init_range, init_range); matrix.resize(dim_x * dim_y); for (size_t i = 0; i < dim_x; ++i) { From 5129f80aa2e9f53bce2598f0247eacbe4724b7e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 13:42:22 -0500 Subject: [PATCH 133/660] made it so dist execution explicitly uses same seed for weight matrix --- libdeepgalois/src/layers/graph_conv_layer.cpp | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index ceeae8605d..3a0cf8ad4e 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -33,10 +33,6 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const } void graph_conv_layer::init() { - rand_init_matrix(y, z, W); // randomly initialize trainable parameters - // rand_init_matrix(y, z, Q); - zero_init_matrix(y, z, layer::weight_grad); - #ifdef GALOIS_USE_DIST // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, @@ -47,6 +43,16 @@ void graph_conv_layer::init() { layer::gradientGraph->numHosts(), false); #endif +#ifdef GALOIS_USE_DIST + // make sure seed consistent across all hosts for weight matrix + rand_init_matrix(y, z, W, 1); +#else + rand_init_matrix(y, z, W); +#endif + + // rand_init_matrix(y, z, Q); + zero_init_matrix(y, z, layer::weight_grad); + if (dropout_) dropout_mask = new unsigned[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; From 3526cf2022f6c44beb8ec081fab6c0c4cf558f3e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 14:31:45 -0500 Subject: [PATCH 134/660] softmax math functions to namespace; TODOs placed for sync --- .../include/deepgalois/math_functions.hh | 21 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 3 + .../src/layers/softmax_loss_layer.cpp | 35 ++- libdeepgalois/src/math_functions.cpp | 226 ++++++++---------- 4 files changed, 140 insertions(+), 145 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 158455f73b..8f73ed609e 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -25,6 +25,8 @@ namespace math { void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); //! multiply n elements of vector by scalar void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +//! do dot product of 2 vectors +float_t dot(const vec_t& x, const vec_t& y); //! 
clear n elements of a vector void clear_cpu(size_t n, float_t* in); // dropout functions randomly remove weights @@ -37,6 +39,15 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +void softmax(const vec_t& input, vec_t& output); +void softmax(size_t n, const float_t* input, float_t* output); +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); +float_t cross_entropy(const vec_t& y, const vec_t& p); +float_t cross_entropy(size_t n, const float_t* y, const float_t* p); +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); //! copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); // single-precision dense matrix multiply @@ -82,7 +93,6 @@ void vdiv(const vec_t& a, const vec_t& b, vec_t& out); void add_scalar(const float_t alpha, vec_t& Y); void sub_scalar(const float_t alpha, vec_t& Y); void div_scalar(const float_t alpha, vec_t& Y); -float_t dot(const vec_t& x, const vec_t& y); //void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); @@ -96,15 +106,6 @@ void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima int argmax(const size_t n, const float_t* x); // the arguments of the maxima -void softmax(const vec_t& input, vec_t& output); -void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp); -float_t cross_entropy(const vec_t& y, const vec_t& p); -float_t cross_entropy(size_t n, const float_t* y, const float_t* p); -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3a0cf8ad4e..1c631a9d21 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,6 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + // TODO sync required here // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); @@ -85,7 +86,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // x*y NOTE: since graph is symmetric, the derivative is the same + // this is the aggregate call deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + // TODO sync required here // at 
this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4146dcd17f..9b64a0d353 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -20,33 +20,42 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // y is a one hot encoded vector for the labels - std::vector y(output_dims[1], 0.0); // ground truth - y[context->get_label(i)] = 1.0; // one-hot - loss[i] = cross_entropy(len, &y[0], &out_data[len*i]); + // output is normalized input for this layer + math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-fw")); + + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) - size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), + size_t len = layer::input_dims[1]; + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked vec_t norm_grad(len); - std::vector y(len, 0.0); // ground truth - y[context->get_label(i)] = 1.0; - d_cross_entropy(len, &y[0], &out_data[len * i], &norm_grad[0]); - d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); } }, galois::chunk_size(), galois::steal(), galois::loopname("softmax-loss-bw")); + + // no weight sync required: this is all local graph information } acc_t softmax_loss_layer::get_masked_loss() { @@ -55,7 +64,7 @@ acc_t softmax_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(begin_, end_), + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i]) { total_loss += loss[i]; diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index cd21a6b1a0..6b383e4b78 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -87,6 +87,21 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } #endif +// dot product +float_t dot(const vec_t& x, const vec_t& y) { + float_t sum = 0; + for (size_t i = 0; i < x.size(); ++i) + sum += x[i] * y[i]; + return sum; +} + 
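// ---------------------------------------------------------------------------
// [Editor's illustration, not part of the patch above] A minimal, self-contained
// C++ sketch of the softmax + cross-entropy math that PATCH 134 relocates into
// the deepgalois::math namespace, mirroring the per-vertex logic used by
// softmax_loss_layer.cpp (normalize with softmax, build a one-hot ground-truth
// vector, compute cross-entropy). All names here (softmax_demo,
// cross_entropy_demo, logits, onehot) are hypothetical and exist only for
// illustration; they are not part of libdeepgalois.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Numerically stable softmax: subtract the max element before exponentiating.
static void softmax_demo(const std::vector<float>& in, std::vector<float>& out) {
  const float m = *std::max_element(in.begin(), in.end());
  float denom  = 0.0f;
  out.resize(in.size());
  for (size_t i = 0; i < in.size(); i++) {
    out[i] = std::exp(in[i] - m);
    denom += out[i];
  }
  for (size_t i = 0; i < in.size(); i++)
    out[i] /= denom;
}

// Cross-entropy against a one-hot ground-truth vector, guarding against
// log(0) with a 1e-10 constant, matching the intent of the patched code.
static float cross_entropy_demo(const std::vector<float>& y,
                                const std::vector<float>& p) {
  float loss = 0.0f;
  for (size_t i = 0; i < y.size(); i++)
    if (y[i] != 0.0f)
      loss -= y[i] * std::log(std::max(p[i], 1e-10f));
  return loss;
}

int main() {
  std::vector<float> logits = {2.0f, 0.5f, -1.0f}; // per-class scores for one vertex
  std::vector<float> probs;
  softmax_demo(logits, probs);                     // normalize to a distribution
  std::vector<float> onehot = {1.0f, 0.0f, 0.0f};  // ground truth: class 0
  std::printf("loss = %f\n", cross_entropy_demo(onehot, probs));
  return 0;
}
// ---------------------------------------------------------------------------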
+float_t dot(size_t n, const float_t* x, const float_t* y) { + float_t sum = 0; + for (size_t i = 0; i < n; ++i) + sum += x[i] * y[i]; + return sum; +} + void clear(vec_t& in) { for (size_t i = 0; i < in.size(); i++) in[i] = 0; @@ -157,6 +172,95 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) }, galois::loopname("d_relu")); } +void softmax(const vec_t& input, vec_t& output) { + const float_t max = *std::max_element(input.begin(), input.end()); + float_t denominator(0); + for (size_t i = 0; i < input.size(); i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < input.size(); i++) + output[i] /= denominator; +} + +void softmax(size_t n, const float_t* input, float_t* output) { + const float_t max = *std::max_element(input, input + n); + float_t denominator(0); + for (size_t i = 0; i < n; i++) { + output[i] = std::exp(input[i] - max); + denominator += output[i]; + } + for (size_t i = 0; i < n; i++) + output[i] /= denominator; +} + +void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { + auto n = y.size(); + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + // float_t delta_ij = i == j? 1 : 0; + // df[i] += p[j] * (delta_ij - p[i]); + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + // dy = dp * (gradient of softmax) + dy[i] = dot(dp, df); + } +} + +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp) { + vec_t df(n, 0); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; + } + dy[i] = dot(n, dp, &df[0]); + } +} + +// cross-entropy loss function for multi-class classification +// y: ground truth +// p: predicted probability +float_t cross_entropy(const vec_t& y, const vec_t& p) { + auto n = y.size(); + assert(n > 0); + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else loss -= y[i] * std::log(p[i]); + } + return loss; +} + +float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { + float_t loss = 0.0; + for (size_t i = 0; i < n; i++) { + if (y[i] == float_t(0)) + continue; + if (p[i] == float_t(0)) + loss -= y[i] * std::log(float_t(1e-10)); + else + loss -= y[i] * std::log(p[i]); + } + return loss; +} + +void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { + auto n = y.size(); + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} + +void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { + for (size_t i = 0; i < n; i++) { + d[i] = -y[i] / (p[i] + float_t(1e-10)); + } +} void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } @@ -188,7 +292,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } } - } // deepgalois } // math @@ -234,20 +337,6 @@ void div_scalar(const float_t alpha, vec_t& Y) { Y[i] /= alpha; } -// dot product -float_t dot(const vec_t& x, const vec_t& y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - -float_t dot(size_t n, const float_t* x, const float_t* y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; -} // matrix-vector multiply void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { @@ -424,112 
+513,5 @@ void sigmoid(vec_t& fv) { } } -// Softmax function takes an N-dimensional vector (X) of real number, -// and transforms it into a vector of real number in range (0,1) which add -// upto 1. To make softmax func numerically stable, we simply normalize the -// values in the vector, by multiplying the numerator and denominator with a -// constant C, where log(C)=-max(X) -// exps = np.exp(X - np.max(X)) -// exps / np.sum(exps) -void softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - -void softmax(size_t n, const float_t* input, float_t* output) { - const float_t max = *std::max_element(input, input + n); - float_t denominator(0); - for (size_t i = 0; i < n; i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < n; i++) - output[i] /= denominator; -} - -void log_softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) - denominator += std::exp(input[i] - max); - for (size_t i = 0; i < input.size(); i++) - output[i] = input[i] - max - denominator; -} - -// Due to the desirable property of softmax function outputting a probability -// distribution, we often use it as the final layer in neural networks. For this -// we need to calculate the derivative or gradient, and pass it back to the -// previous layer during backpropagation. -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - // float_t delta_ij = i == j? 1 : 0; - // df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp) { - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - df[j] = (j == i) ? 
p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - dy[i] = dot(n, dp, &df[0]); - } -} -// cross-entropy loss function for multi-class classification -// y: ground truth -// p: predicted probability -float_t cross_entropy(const vec_t& y, const vec_t& p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - -float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else - loss -= y[i] * std::log(p[i]); - } - return loss; -} -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { - auto n = y.size(); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} - -void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} From 5fdcc11eb9dc75ac41da511291f60b07c3ab3b4e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 17:36:15 -0500 Subject: [PATCH 135/660] removed unnecessary Caffe things from context; updated license note --- libdeepgalois/include/deepgalois/context.h | 21 +--------- libdeepgalois/licensenote.txt | 49 ++++++++++++++++++++++ libdeepgalois/src/context.cpp | 8 ++-- 3 files changed, 56 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index a622a0f0f7..be0154e33f 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -1,6 +1,6 @@ #pragma once /** - * TODO if used from somewhere, get copyright/licences + * Based on common.hpp file of the Caffe deep learning library. 
*/ #include @@ -20,16 +20,7 @@ class Context { public: Context(); ~Context(); - enum Brew { CPU, GPU }; - Brew mode() { return mode_; } - void set_mode(Brew mode) { mode_ = mode; } - int solver_count() { return solver_count_; } - void set_solver_count(int val) { solver_count_ = val; } - int solver_rank() { return solver_rank_; } - void set_solver_rank(int val) { solver_rank_ = val; } - bool multiprocess() { return multiprocess_; } - void set_multiprocess(bool val) { multiprocess_ = val; } - bool root_solver() { return solver_rank_ == 0; } + size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); @@ -40,10 +31,6 @@ class Context { size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features - void SetDevice(const int device_id); - void DeviceQuery() {} - bool CheckDevice(const int device_id) { return true; } - int FindDevice(const int start_id = 0) { return 0; } void norm_factor_counting(); size_t n; // number of samples: N @@ -76,9 +63,5 @@ class Context { static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif - Brew mode_; - int solver_count_; - int solver_rank_; - bool multiprocess_; }; } // end deepgalois namespace diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index 224adbc701..cf1aeb6caf 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -8,3 +8,52 @@ https://github.com/tiny-dnn/tiny-dnn/tree/master/tiny_dnn under BSD-3 DGL structure as well from what I can tell + +================================================================================ +Caffe License +================================================================================ + +COPYRIGHT + +All contributions by the University of California: +Copyright (c) 2014-2017 The Regents of the University of California (Regents) +All rights reserved. + +All other contributions: +Copyright (c) 2014-2017, the respective contributors +All rights reserved. + +Caffe uses a shared copyright model: each contributor holds copyright over +their contributions to Caffe. The project versioning records all such +contribution and copyright details. If a contributor wants to further mark +their specific copyright on a particular contribution, they should indicate +their copyright solely in the commit message of the change when it is +committed. + +LICENSE + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR +ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +CONTRIBUTION AGREEMENT + +By contributing to the BVLC/caffe repository through pull-request, comment, +or otherwise, the contributor releases their content to the +license and copyright terms herein. diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 79bd0be985..2717567d28 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,11 +1,13 @@ +/** + * Based on common.hpp file of the Caffe deep learning library. + */ + #include "deepgalois/context.h" namespace deepgalois { #ifdef CPU_ONLY -Context::Context() - : mode_(Context::CPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) {} +Context::Context() {} Context::~Context() {} size_t Context::read_graph(std::string dataset_str, bool selfloop) { From 88097af6cbdf77bb60beded033d4924d42f6f594 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:08:18 -0500 Subject: [PATCH 136/660] lonestargnn.h moved to include directry --- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/{ => include}/lonestargnn.h | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename lonestargnn/{ => include}/lonestargnn.h (100%) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 3f6cb7672f..90711e2212 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) -include_directories(${CMAKE_SOURCE_DIR}/lonestargnn) +include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) diff --git a/lonestargnn/lonestargnn.h b/lonestargnn/include/lonestargnn.h similarity index 100% rename from lonestargnn/lonestargnn.h rename to lonestargnn/include/lonestargnn.h From 7fb67db34d01aa1fe720b37acdeea7b858fa283e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:09:41 -0500 Subject: [PATCH 137/660] net: caffe header notif --- libdeepgalois/include/deepgalois/net.h | 5 +++-- libdeepgalois/src/net.cpp | 4 ++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index dfc4f3d0d7..0daf730c42 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,5 +1,6 @@ -// TODO if this code was based on something, get copyright/license and put here - +/** + * Based on the net.hpp file from Caffe deep learning framework. + */ #ifndef _MODEL_H_ #define _MODEL_H_ diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a194bd43d7..d594e789e8 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -1,3 +1,7 @@ +/** + * Based on the net.hpp file from Caffe deep learning framework. 
+ */ + #include "deepgalois/net.h" namespace deepgalois { From 87297fc88ab58063018bf0144fd4a67d452326eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:27:45 -0500 Subject: [PATCH 138/660] copy distgraphload to lonestargnn TODO need to remove some args and merge with current gnn cmdline args --- lonestargnn/CMakeLists.txt | 9 + lonestargnn/gcn/CMakeLists.txt | 3 + lonestargnn/gcn/gcn.cpp | 8 + lonestargnn/include/DistributedGraphLoader.h | 441 +++++++++++++++++++ lonestargnn/src/DistributedGraphLoader.cpp | 87 ++++ 5 files changed, 548 insertions(+) create mode 100644 lonestargnn/include/DistributedGraphLoader.h create mode 100644 lonestargnn/src/DistributedGraphLoader.cpp diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 90711e2212..a06dd1907b 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -2,6 +2,7 @@ include_directories(BEFORE ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include ) + include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) @@ -19,4 +20,12 @@ if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() +if(ENABLE_DIST_GALOIS) + add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) + target_include_directories(distgraphloader PUBLIC + include + ) + target_link_libraries(distgraphloader galois_cusp) +endif() + add_subdirectory(gcn) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index c3fb95c07f..48c7156dcc 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,5 +1,8 @@ app(gcn gcn.cpp) target_link_libraries(gcn dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gcn distgraphloader) +endif() if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn dg_gpu) target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 086b6701de..f2e3f3f1eb 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -1,6 +1,9 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; @@ -21,6 +24,11 @@ int main(int argc, char** argv) { // the user network.print_layers_info(); +#ifdef GALOIS_USE_DIST + std::vector dummy; + galois::graphs::constructSymmetricGraph(dummy); +#endif + // tracks peak memory usage deepgalois::ResourceManager rm; diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h new file mode 100644 index 0000000000..43d27d9669 --- /dev/null +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -0,0 +1,441 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting parallelism. + * The code is being released under the terms of the 3-Clause BSD License (a + * copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. 
NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedGraphLoader.h + * + * Contains definitions for the common distributed graph loading functionality + * of Galois. + * + * @todo Refactoring a bunch of this code is likely very possible to do + */ +#ifndef D_GRAPH_LOADER +#define D_GRAPH_LOADER + +#include "galois/graphs/CuSPPartitioner.h" + +/******************************************************************************* + * Supported partitioning schemes + ******************************************************************************/ +namespace galois { +namespace graphs { + +//! enums of partitioning schemes supported +enum PARTITIONING_SCHEME { + OEC, //!< outgoing edge cut + IEC, //!< incoming edge cut + HOVC, //!< outgoing hybrid vertex cut + HIVC, //!< incoming hybrid vertex cut + CART_VCUT, //!< cartesian vertex cut + CART_VCUT_IEC, //!< cartesian vertex cut using iec + //CEC, //!< custom edge cut + GINGER_O, //!< Ginger, outgoing + GINGER_I, //!< Ginger, incoming + FENNEL_O, //!< Fennel, oec + FENNEL_I, //!< Fennel, iec + SUGAR_O //!< Sugar, oec +}; + +/** + * Turns a PARTITIONING_SCHEME enum to a string + * + * @param e partitioning scheme enum + * @return string version of e + */ +inline const char* EnumToString(PARTITIONING_SCHEME e) { + switch (e) { + case OEC: + return "oec"; + case IEC: + return "iec"; + case HOVC: + return "hovc"; + case HIVC: + return "hivc"; + case CART_VCUT: + return "cvc"; + case CART_VCUT_IEC: + return "cvc_iec"; + //case CEC: + // return "cec"; + case GINGER_O: + return "ginger-oec"; + case GINGER_I: + return "ginger-iec"; + case FENNEL_O: + return "fennel-oec"; + case FENNEL_I: + return "fennel-iec"; + case SUGAR_O: + return "sugar-oec"; + default: + GALOIS_DIE("Unsupported partition"); + } +} +} // end namespace graphs +} // end namespace galois + +/******************************************************************************* + * Graph-loading-related command line arguments + ******************************************************************************/ +namespace cll = llvm::cl; + +//! input graph file +extern cll::opt inputFile; +//! input graph file, but transposed +extern cll::opt inputFileTranspose; +//! symmetric input graph file +extern cll::opt inputFileSymmetric; +//! partitioning scheme to use +extern cll::opt partitionScheme; +////! path to vertex id map for custom edge cut +//extern cll::opt vertexIDMapFileName; +//! true if you want to read graph structure from a file +extern cll::opt readFromFile; +//! path to local graph structure to read +extern cll::opt localGraphFileName; +//! if true, the local graph structure will be saved to disk after partitioning +extern cll::opt saveLocalGraph; +//! 
file specifying blocking of masters +extern cll::opt mastersFile; + +// @todo command line argument for read balancing across hosts + +namespace galois { +namespace graphs { + +/******************************************************************************* + * Graph-loading functions + ******************************************************************************/ + +/** + * Loads a symmetric graph file (i.e. directed graph with edges in both + * directions) + * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template +DistGraph* +constructSymmetricGraph(std::vector& scaleFactor) { + if (!inputFileSymmetric) { + GALOIS_DIE("Calling constructSymmetricGraph without inputFileSymmetric " + "flag"); + } + + switch (partitionScheme) { + case OEC: + case IEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, + mastersFile + ); + case HOVC: + case HIVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case CART_VCUT: + case CART_VCUT_IEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + //case CEC: + // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, + // scaleFactor, vertexIDMapFileName, false); + + case GINGER_O: + case GINGER_I: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case FENNEL_O: + case FENNEL_I: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + ); + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +/** + * Loads a graph file with the purpose of iterating over the out edges + * of the graph. 
+ * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @tparam iterateOut says if you want to iterate over out edges or not; if + * false, will iterate over in edgse + * @tparam enable_if this function will only be enabled if iterateOut is true + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template ::type* = nullptr> +DistGraph* +constructGraph(std::vector& scaleFactor) { + // 1 host = no concept of cut; just load from edgeCut, no transpose + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.Num == 1) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + } + + switch (partitionScheme) { + case OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose, + mastersFile + ); + case IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose, + mastersFile + ); + } else { + GALOIS_DIE("Error: attempting incoming edge cut without transpose " + "graph"); + break; + } + + case HOVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case HIVC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting incoming hybrid cut without transpose " + "graph"); + break; + } + + case CART_VCUT: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + + case CART_VCUT_IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting cvc incoming cut without " + "transpose graph"); + break; + } + + //case CEC: + // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, + // scaleFactor, vertexIDMapFileName, false); + + case GINGER_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case GINGER_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Ginger without transpose graph"); + break; + } + + case FENNEL_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + case FENNEL_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); + break; + } + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose + ); + + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +/** + * Loads a graph file with the purpose of iterating over the in edges + * of the graph. 
+ * + * @tparam NodeData node data to store in graph + * @tparam EdgeData edge data to store in graph + * @tparam iterateOut says if you want to iterate over out edges or not; if + * false, will iterate over in edges + * @tparam enable_if this function will only be enabled if iterateOut is false + * (i.e. iterate over in-edges) + * @param scaleFactor How to split nodes among hosts + * @returns a pointer to a newly allocated DistGraph based on the command line + * loaded based on command line arguments + */ +template ::type* = nullptr> +DistGraph* +constructGraph(std::vector& scaleFactor) { + auto& net = galois::runtime::getSystemNetworkInterface(); + + // 1 host = no concept of cut; just load from edgeCut + if (net.Num == 1) { + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + fprintf(stderr, "WARNING: Loading transpose graph through in-memory " + "transpose to iterate over in-edges: pass in transpose " + "graph with -graphTranspose to avoid unnecessary " + "overhead.\n"); + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + } + } + + switch (partitionScheme) { + case OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose, + mastersFile + ); + case IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose, + mastersFile + ); + } else { + GALOIS_DIE("Error: attempting incoming edge cut without transpose " + "graph"); + break; + } + + case HOVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case HIVC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: (hivc) iterate over in-edges without transpose graph"); + break; + } + + case CART_VCUT: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case CART_VCUT_IEC: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: (cvc) iterate over in-edges without transpose graph"); + break; + } + + //case CEC: + // if (inputFileTranspose.size()) { + // return new Graph_customEdgeCut(inputFileTranspose, "", net.ID, + // net.Num, scaleFactor, vertexIDMapFileName, + // false); + // } else { + // GALOIS_DIE("Error: (cec) iterate over in-edges without transpose graph"); + // break; + // } + + case GINGER_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case GINGER_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Ginger without transpose graph"); + break; + } + + case FENNEL_O: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + case FENNEL_I: + if (inputFileTranspose.size()) { + return cuspPartitionGraph( + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose + ); + } else { + GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); + break; + } + + case SUGAR_O: + return cuspPartitionGraph( + inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose + ); + + default: + GALOIS_DIE("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +} // end namespace graphs +} // end namespace galois +#endif diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp new file mode 100644 index 0000000000..f2e336028e --- /dev/null +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -0,0 +1,87 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting parallelism. + * The code is being released under the terms of the 3-Clause BSD License (a + * copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. + */ + +/** + * @file DistributedGraphLoader.cpp + * + * Contains definitions for command line arguments related to distributed + * graph loading. + */ + +#include "DistributedGraphLoader.h" + +using namespace galois::graphs; + +namespace cll = llvm::cl; + +cll::opt inputFile(cll::Positional, cll::desc(""), + cll::Required); +cll::opt inputFileTranspose("graphTranspose", + cll::desc(""), + cll::init("")); +cll::opt + inputFileSymmetric("symmetricGraph", + cll::desc("Set this flag if graph is symmetric"), + cll::init(false)); + +cll::opt partitionScheme( + "partition", cll::desc("Type of partitioning."), + cll::values( + clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(HOVC, "hovc", "Outgoing Hybrid Vertex-Cut"), + clEnumValN(HIVC, "hivc", "Incoming Hybrid Vertex-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), + //clEnumValN(CEC, "cec", "Custom edge cut from vertexID mapping"), + clEnumValN(GINGER_O, "ginger-o", "ginger, outgiong edges, using CuSP"), + clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), + clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), + clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), + clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), + clEnumValEnd), + cll::init(OEC)); + +//cll::opt +// vertexIDMapFileName("vertexIDMapFileName", +// cll::desc(""), +// cll::init(""), cll::Hidden); + +cll::opt readFromFile("readFromFile", + cll::desc("Set this flag if graph is to be " + "constructed from file (file must be " + "created by Abelian CSR)"), + cll::init(false), cll::Hidden); + +cll::opt + localGraphFileName("localGraphFileName", + cll::desc("Name of the local file to construct " + "local graph (file must be created by " + "Abelian CSR)"), + 
cll::init("local_graph"), cll::Hidden); + +cll::opt saveLocalGraph("saveLocalGraph", + cll::desc("Set to save the local CSR graph"), + cll::init(false), cll::Hidden); + +cll::opt mastersFile("mastersFile", + cll::desc("File specifying masters blocking"), + cll::init(""), cll::Hidden); From c58b33c2b2666faba9170f1f87e303764f1ebd19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 9 Mar 2020 18:42:03 -0500 Subject: [PATCH 139/660] constructSymGraph now integrated: partitioning of input possible --- lonestargnn/include/DistributedGraphLoader.h | 298 +------------------ lonestargnn/src/DistributedGraphLoader.cpp | 38 --- 2 files changed, 14 insertions(+), 322 deletions(-) diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index 43d27d9669..b7da4faa54 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -23,12 +23,15 @@ * Contains definitions for the common distributed graph loading functionality * of Galois. * + * Version for GNNs which only support symmetric graphs at this point in time. + * * @todo Refactoring a bunch of this code is likely very possible to do */ -#ifndef D_GRAPH_LOADER -#define D_GRAPH_LOADER +#ifndef D_GRAPH_LOADER_SYM +#define D_GRAPH_LOADER_SYM #include "galois/graphs/CuSPPartitioner.h" +#include "deepgalois/utils.h" /******************************************************************************* * Supported partitioning schemes @@ -44,7 +47,6 @@ enum PARTITIONING_SCHEME { HIVC, //!< incoming hybrid vertex cut CART_VCUT, //!< cartesian vertex cut CART_VCUT_IEC, //!< cartesian vertex cut using iec - //CEC, //!< custom edge cut GINGER_O, //!< Ginger, outgoing GINGER_I, //!< Ginger, incoming FENNEL_O, //!< Fennel, oec @@ -72,8 +74,6 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "cvc"; case CART_VCUT_IEC: return "cvc_iec"; - //case CEC: - // return "cec"; case GINGER_O: return "ginger-oec"; case GINGER_I: @@ -97,23 +97,9 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { namespace cll = llvm::cl; //! input graph file -extern cll::opt inputFile; -//! input graph file, but transposed -extern cll::opt inputFileTranspose; -//! symmetric input graph file -extern cll::opt inputFileSymmetric; +extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; -////! path to vertex id map for custom edge cut -//extern cll::opt vertexIDMapFileName; -//! true if you want to read graph structure from a file -extern cll::opt readFromFile; -//! path to local graph structure to read -extern cll::opt localGraphFileName; -//! if true, the local graph structure will be saved to disk after partitioning -extern cll::opt saveLocalGraph; -//! 
file specifying blocking of masters -extern cll::opt mastersFile; // @todo command line argument for read balancing across hosts @@ -137,299 +123,43 @@ namespace graphs { template DistGraph* constructSymmetricGraph(std::vector& scaleFactor) { - if (!inputFileSymmetric) { - GALOIS_DIE("Calling constructSymmetricGraph without inputFileSymmetric " - "flag"); - } + std::string inputFile = deepgalois::path + dataset + ".csgr"; + galois::gInfo("File to read is ", inputFile); switch (partitionScheme) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, - mastersFile + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); - //case CEC: - // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, - // scaleFactor, vertexIDMapFileName, false); - case GINGER_O: case GINGER_I: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case FENNEL_O: case FENNEL_I: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); case SUGAR_O: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose - ); - default: - GALOIS_DIE("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -/** - * Loads a graph file with the purpose of iterating over the out edges - * of the graph. 
- * - * @tparam NodeData node data to store in graph - * @tparam EdgeData edge data to store in graph - * @tparam iterateOut says if you want to iterate over out edges or not; if - * false, will iterate over in edgse - * @tparam enable_if this function will only be enabled if iterateOut is true - * @param scaleFactor How to split nodes among hosts - * @returns a pointer to a newly allocated DistGraph based on the command line - * loaded based on command line arguments - */ -template ::type* = nullptr> -DistGraph* -constructGraph(std::vector& scaleFactor) { - // 1 host = no concept of cut; just load from edgeCut, no transpose - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.Num == 1) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - } - - switch (partitionScheme) { - case OEC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose, - mastersFile + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" ); - case IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose, - mastersFile - ); - } else { - GALOIS_DIE("Error: attempting incoming edge cut without transpose " - "graph"); - break; - } - - case HOVC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case HIVC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting incoming hybrid cut without transpose " - "graph"); - break; - } - - case CART_VCUT: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - - case CART_VCUT_IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting cvc incoming cut without " - "transpose graph"); - break; - } - - //case CEC: - // return new Graph_customEdgeCut(inputFile, "", net.ID, net.Num, - // scaleFactor, vertexIDMapFileName, false); - - case GINGER_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case GINGER_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Ginger without transpose graph"); - break; - } - - case FENNEL_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - case FENNEL_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); - break; - } - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, inputFileTranspose - ); - - default: - GALOIS_DIE("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -/** - * Loads a graph file with the purpose of iterating over the in edges - * of the graph. 
- * - * @tparam NodeData node data to store in graph - * @tparam EdgeData edge data to store in graph - * @tparam iterateOut says if you want to iterate over out edges or not; if - * false, will iterate over in edges - * @tparam enable_if this function will only be enabled if iterateOut is false - * (i.e. iterate over in-edges) - * @param scaleFactor How to split nodes among hosts - * @returns a pointer to a newly allocated DistGraph based on the command line - * loaded based on command line arguments - */ -template ::type* = nullptr> -DistGraph* -constructGraph(std::vector& scaleFactor) { - auto& net = galois::runtime::getSystemNetworkInterface(); - - // 1 host = no concept of cut; just load from edgeCut - if (net.Num == 1) { - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - fprintf(stderr, "WARNING: Loading transpose graph through in-memory " - "transpose to iterate over in-edges: pass in transpose " - "graph with -graphTranspose to avoid unnecessary " - "overhead.\n"); - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - } - } - - switch (partitionScheme) { - case OEC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose, - mastersFile - ); - case IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose, - mastersFile - ); - } else { - GALOIS_DIE("Error: attempting incoming edge cut without transpose " - "graph"); - break; - } - - case HOVC: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case HIVC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: (hivc) iterate over in-edges without transpose graph"); - break; - } - - case CART_VCUT: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case CART_VCUT_IEC: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: (cvc) iterate over in-edges without transpose graph"); - break; - } - - //case CEC: - // if (inputFileTranspose.size()) { - // return new Graph_customEdgeCut(inputFileTranspose, "", net.ID, - // net.Num, scaleFactor, vertexIDMapFileName, - // false); - // } else { - // GALOIS_DIE("Error: (cec) iterate over in-edges without transpose graph"); - // break; - // } - - case GINGER_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case GINGER_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Ginger without transpose graph"); - break; - } - - case FENNEL_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - case FENNEL_I: - if (inputFileTranspose.size()) { - return cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, inputFileTranspose - ); - } else { - GALOIS_DIE("Error: attempting Fennel incoming without transpose graph"); - break; - } - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, false, inputFileTranspose - ); - default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp index f2e336028e..dbdf24ab90 100644 --- a/lonestargnn/src/DistributedGraphLoader.cpp +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -30,16 +30,6 @@ using namespace galois::graphs; namespace cll = llvm::cl; -cll::opt inputFile(cll::Positional, cll::desc(""), - cll::Required); -cll::opt inputFileTranspose("graphTranspose", - cll::desc(""), - cll::init("")); -cll::opt - inputFileSymmetric("symmetricGraph", - cll::desc("Set this flag if graph is symmetric"), - cll::init(false)); - cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), cll::values( @@ -57,31 +47,3 @@ cll::opt partitionScheme( clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), clEnumValEnd), cll::init(OEC)); - -//cll::opt -// vertexIDMapFileName("vertexIDMapFileName", -// cll::desc(""), -// cll::init(""), cll::Hidden); - -cll::opt readFromFile("readFromFile", - cll::desc("Set this flag if graph is to be " - "constructed from file (file must be " - "created by Abelian CSR)"), - cll::init(false), cll::Hidden); - -cll::opt - localGraphFileName("localGraphFileName", - cll::desc("Name of the local file to construct " - "local graph (file must be created by " - "Abelian CSR)"), - cll::init("local_graph"), cll::Hidden); - -cll::opt saveLocalGraph("saveLocalGraph", - cll::desc("Set to save the local CSR graph"), - cll::init(false), cll::Hidden); - -cll::opt mastersFile("mastersFile", - cll::desc("File specifying masters blocking"), - cll::init(""), cll::Hidden); From 408090268a484583651b67dbffa084928511484c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 07:45:12 -0500 Subject: [PATCH 140/660] remove print --- libgpu/include/graph_gpu.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 3f2c88a308..e2057bf7af 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -116,8 +116,8 @@ struct CSRGraph { printf("]\n"); } void add_selfloop() { - print_neighbors(nnodes-1); - print_neighbors(0); + //print_neighbors(nnodes-1); + //print_neighbors(0); index_type *new_edge_dst = new index_type[nnodes+nedges]; for (index_type i = 0; i < nnodes; i++) { index_type start = row_start[i]; @@ -147,8 +147,8 @@ struct CSRGraph { edge_dst = new_edge_dst; nedges += nnodes; printf("nnodes = %d, nedges = %d\n", nnodes, nedges); - print_neighbors(nnodes-1); - print_neighbors(0); + //print_neighbors(nnodes-1); + //print_neighbors(0); } __device__ __host__ index_type getEdgeDst(unsigned edge) { From 4ff0d384936e1368498e6ef8b290fa6a4eaaa36b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 09:25:34 -0500 Subject: [PATCH 141/660] fix GPU compiling --- .../include/deepgalois/math_functions.hh | 5 ++-- libdeepgalois/src/context.cu | 23 +++++++------------ libdeepgalois/src/layers/aggregator.cu | 5 +++- libdeepgalois/src/math_functions.cu | 12 ++++++---- 4 files changed, 22 insertions(+), 23 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8f73ed609e..593ef03c5c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -131,7 +131,7 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t 
dim_y, const size_t dim_z, void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* C); + const float* B, const float beta, float* trans_C, float* C); void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data); @@ -142,7 +142,8 @@ void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); -void float_malloc_device(int n, float_t*& loss); +void float_malloc_device(int n, float_t*& ptr); +void float_free_device(float_t*& ptr); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index d727904107..dfb0e3cc5e 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -61,9 +61,7 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() - : mode_(Context::GPU), solver_count_(1), solver_rank_(0), - multiprocess_(false) { +Context::Context() { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -105,24 +103,19 @@ void Context::norm_factor_counting() { CudaTest("solving norm_factor_counting kernel failed"); std::cout << "Done\n"; } - +/* void Context::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); - if (current_device == device_id) - return; + if (current_device == device_id) return; CUDA_CHECK(cudaSetDevice(device_id)); - if (cublas_handle_) - CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) - CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); + if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK( - curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK( - curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } - +*/ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index ee5fe56b4d..1f739eef58 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -74,5 +74,8 @@ void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, fl CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); - csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + float *temp; + float_malloc_device(n*len, temp); 
// TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + float_free_device(temp); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 531480091d..e723ba289f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -53,8 +53,12 @@ bool is_allocated_device(float_t* data) { return false; } -void float_malloc_device(int n, float_t*& loss) { - CUDA_CHECK(cudaMalloc((void**)&loss, n * sizeof(float_t))); +void float_malloc_device(int n, float_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); +} + +void float_free_device(float_t*& ptr) { + CUDA_CHECK(cudaFree(ptr)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { @@ -186,9 +190,7 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, - const float* B, const float beta, float* C) { - float *transpose_C; - CUDA_CHECK(cudaMalloc((void**)&transpose_C, N * K * sizeof(float))); + const float* B, const float beta, float *transpose_C, float* C) { CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, From ba38d5158ced6bd458a6a1956e730e4132aa84bb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 14:19:19 -0500 Subject: [PATCH 142/660] graph_cpu is now a pointer Done for compatibility with Dist template graphs later --- libdeepgalois/include/deepgalois/context.h | 2 +- libdeepgalois/src/context.cpp | 17 ++++++++++------- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 ++++-- 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index be0154e33f..b765515e50 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -43,7 +43,7 @@ class Context { float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY - Graph graph_cpu; // the input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); #else diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 2717567d28..cf481aa040 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -15,6 +15,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } +#ifndef GALOIS_USE_DIST size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -23,7 +24,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo printf("Reading .el file: %s\n", filename.c_str()); LGraph lgraph; lgraph.read_edgelist(filename.c_str(), true); // symmetrize - genGraph(lgraph, graph_cpu); + genGraph(lgraph, *graph_cpu); lgraph.clean(); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; @@ -31,16 +32,17 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo if (selfloop) { Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); - 
add_selfloop(graph_temp, graph_cpu); - } else galois::graphs::readGraph(graph_cpu, filename); + add_selfloop(graph_temp, *graph_cpu); + } else galois::graphs::readGraph(*graph_cpu, filename); +// TODO dist version of self loop } else { printf("Unkown file format\n"); exit(1); } Tread.stop(); - std::cout << "num_vertices " << graph_cpu.size() << " num_edges " - << graph_cpu.sizeEdges() << "\n"; - return graph_cpu.size(); + std::cout << "num_vertices " << graph_cpu->size() << " num_edges " + << graph_cpu->sizeEdges() << "\n"; + return graph_cpu->size(); } void Context::genGraph(LGraph& lg, Graph& g) { @@ -55,12 +57,13 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } +#endif void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), [&](auto v) { - auto degree = std::distance(graph_cpu.edge_begin(v), graph_cpu.edge_end(v)); + auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1c631a9d21..86ab1abd2f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -70,7 +70,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology - graph_conv_layer::aggregate(z, context->graph_cpu, out_temp, out_data); + graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); // TODO sync required here // run relu activation on output if specified @@ -87,7 +87,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, context->graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z // TODO sync required here // at this point, out_temp has the derivative of data from last step to @@ -107,8 +107,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // multiplied by gradients from last back prop step deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z +#ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); +#endif } #endif } // namespace From 85665b9fa429bb71e50143a203ed9f14cdbc7a0f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 15:56:21 -0500 Subject: [PATCH 143/660] gcn app update to load distgraph --- lonestargnn/gcn/gcn.cpp | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index f2e3f3f1eb..3642959b95 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,16 +18,18 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train + +#ifdef GALOIS_USE_DIST + std::vector dummy; + Graph* testing = galois::graphs::constructSymmetricGraph(dummy); +#endif + // read network, features, ground truth, 
initialize metadata network.init(dataset, epochs, hidden1, add_selfloop); network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); -#ifdef GALOIS_USE_DIST - std::vector dummy; - galois::graphs::constructSymmetricGraph(dummy); -#endif // tracks peak memory usage deepgalois::ResourceManager rm; From f14825ae6178deb4d0192016f60fcc152acb7bc2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 15:57:23 -0500 Subject: [PATCH 144/660] dist context placeholders --- .../include/deepgalois/DistContext.h | 39 +++++++++++++++++++ libdeepgalois/src/DistContext.cpp | 29 ++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/DistContext.h create mode 100644 libdeepgalois/src/DistContext.cpp diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h new file mode 100644 index 0000000000..f22d7c221a --- /dev/null +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -0,0 +1,39 @@ +#ifndef __DG_DIST_CONTEXT__ +#define __DG_DIST_CONTEXT__ +/** + * Based on common.hpp file of the Caffe deep learning library. + */ +#include "deepgalois/types.h" +#include "deepgalois/utils.h" +#include "deepgalois/gtypes.h" + +namespace deepgalois { + +class DistContext { + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + std::vector labels; // labels for classification: N x 1 + vec_t h_feats; // input features: N x D + +public: + DistContext(); + ~DistContext(); + + size_t saveGraph(Graph* dGraph); + size_t read_labels(std::string dataset_str); + size_t read_features(std::string dataset_str); + void norm_factor_counting(); + + // TODO why are these public + float_t* norm_factor; // normalization constant based on graph structure + Graph* graph_cpu; // the input graph, |V| = N + + label_t get_label(size_t i) { return labels[i]; } + size_t read_graph_cpu(std::string dataset_str); + float_t* get_in_ptr(); +}; + +} // end deepgalois namespace + +#endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp new file mode 100644 index 0000000000..22ecdbb1c6 --- /dev/null +++ b/libdeepgalois/src/DistContext.cpp @@ -0,0 +1,29 @@ +#include "deepgalois/DistContext.h" + +namespace deepgalois { +DistContext::DistContext() {} +DistContext::~DistContext() {} + +size_t DistContext::saveGraph(Graph* dGraph) { + // TODO + return 0; +} +size_t DistContext::read_labels(std::string dataset_str) { + // TODO + return 0; +} +size_t DistContext::read_features(std::string dataset_str) { + // TODO + return 0; +} + +float_t* DistContext::get_in_ptr() { + // TODO + return nullptr; +} + +void DistContext::norm_factor_counting() { + // TODO +} + +} // deepgalois From ae361bef40770467062a2eeb535d8f090fc6708a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 16:02:55 -0500 Subject: [PATCH 145/660] if dist_galois, use DistContext; all placeholder at the moment TODO implement new functions/new flow --- libdeepgalois/CMakeLists.txt | 16 +++++++++++++ libdeepgalois/include/deepgalois/gtypes.h | 7 ++++++ .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 14 +++++++++-- libdeepgalois/include/deepgalois/net.h | 12 +++++++++- libdeepgalois/src/context.cpp | 24 +++++++++---------- libdeepgalois/src/layers/aggregator.cpp | 8 ++++++- libdeepgalois/src/net.cpp | 9 ++++++- 8 files changed, 73 insertions(+), 19 deletions(-) diff --git 
a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index bdc0f97942..b625c317e3 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -54,6 +54,20 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if(ENABLE_DIST_GALOIS) +# do not link regular context.cpp; TODO do this conditional in cleaner way +set(sources + src/layers/graph_conv_layer.cpp + src/layers/softmax_loss_layer.cpp + src/layers/aggregator.cpp + src/layers/layer.cpp + src/math_functions.cpp + src/optimizer.cpp + src/DistContext.cpp + src/node.cpp + src/net.cpp +) +else() set(sources src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp @@ -65,6 +79,8 @@ set(sources src/node.cpp src/net.cpp ) +endif() + add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 5278660692..c30c72f730 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,12 +1,16 @@ #pragma once #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +#ifdef GALOIS_USE_DIST +#include "galois/graphs/NewGeneric.h" +#endif // TODO namespace typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; +#ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; @@ -14,5 +18,8 @@ typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; #endif +#else +using Graph = galois::graphs::DistGraph; +#endif typedef Graph::GraphNode GNode; diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 4904b13905..66749a8572 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -33,7 +33,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(deepgalois::Context* ctx) { context = ctx; norm_factor = ctx->norm_factor; } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->norm_factor; } //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a91f495915..116ab43aa1 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,7 +11,11 @@ #include "deepgalois/types.h" #include "deepgalois/utils.h" +#ifndef GALOIS_USE_DIST #include "deepgalois/context.h" +#else +#include "deepgalois/DistContext.h" +#endif #include "deepgalois/optimizer.h" #include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" @@ -38,6 +42,12 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: +#ifndef GALOIS_USE_DIST + using ContextType = deepgalois::Context; +#else + using ContextType = deepgalois::DistContext; +#endif + layer(unsigned level, std::vector in_dims, std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), @@ -49,7 +59,7 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void set_netphase(deepgalois::net_phase phase) {} //! save context - virtual void set_context(deepgalois::Context* ctx) { context = ctx; } + virtual void set_context(ContextType* ctx) { context = ctx; } //! return layer loss virtual acc_t get_masked_loss() { return acc_t(0); } @@ -149,7 +159,7 @@ class layer : public deepgalois::node { mask_t* masks_; // masks to show which samples are valid mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 - deepgalois::Context* context; + ContextType* context; #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0daf730c42..4f481d1d0a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,10 +8,16 @@ #include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" -#include "deepgalois/context.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/optimizer.h" +#ifndef GALOIS_USE_DIST +#include "deepgalois/context.h" +#else +#include "deepgalois/DistContext.h" +#endif + + #define NUM_CONV_LAYERS 2 @@ -113,7 +119,11 @@ class Net { } protected: +#ifndef GALOIS_USE_DIST deepgalois::Context* context; +#else + deepgalois::DistContext* context; +#endif size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index cf481aa040..9206b1cc1a 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -15,7 +15,6 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -#ifndef GALOIS_USE_DIST size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); @@ -57,18 +56,6 @@ void Context::genGraph(LGraph& lg, Graph& g) { g.constructEdge(offset, lg.get_dest(offset), 0); } } -#endif - -void Context::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else 
norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); -} void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); @@ -103,6 +90,17 @@ void Context::add_selfloop(Graph &og, Graph &g) { } float_t* Context::get_in_ptr() { return &h_feats[0]; } + +void Context::norm_factor_counting() { + norm_factor = new float_t[n]; + galois::do_all(galois::iterate((size_t)0, n), + [&](auto v) { + auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); +} #endif // labels contain the ground truth (e.g. vertex classes) for each example diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 6d7c7f6cbe..4e86f148e8 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,7 +5,13 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { // zero out the output data - galois::do_all(galois::iterate(g), [&](const GNode src) { + #ifndef GALOIS_USE_DIST + galois::do_all(g, + #else + auto& rangeObj = g.allNodesRange(); + galois::do_all(galois::iterate(rangeObj), + #endif + [&](const GNode src) { deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d594e789e8..53baa60c13 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,9 +7,16 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { + #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - // read graph, get num nodes num_samples = context->read_graph(dataset_str, selfloop); + #else + context = new deepgalois::DistContext(); + // TODO self loop? 
+ // TODO num samples + #endif + + // read graph, get num nodes num_classes = context->read_labels(dataset_str); num_epochs = epochs; From 7e9c10d7caf1244b85815dcb290054acf91e235a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 16:58:29 -0500 Subject: [PATCH 146/660] Dist net: read masks, test sets, saving dGraph setup TODO labels and features in dist setting --- .../include/deepgalois/DistContext.h | 7 ++- libdeepgalois/include/deepgalois/gtypes.h | 7 ++- libdeepgalois/include/deepgalois/net.h | 15 ++++-- libdeepgalois/include/deepgalois/utils.h | 46 ++++++++++++++++++- libdeepgalois/src/DistContext.cpp | 42 +++++++++++++++-- libdeepgalois/src/net.cpp | 44 ++++++++++++++---- lonestargnn/gcn/gcn.cpp | 26 +++++++++-- 7 files changed, 162 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index f22d7c221a..c7317dd7d2 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -20,7 +20,7 @@ class DistContext { DistContext(); ~DistContext(); - size_t saveGraph(Graph* dGraph); + void saveGraph(Graph* dGraph); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str); void norm_factor_counting(); @@ -29,7 +29,10 @@ class DistContext { float_t* norm_factor; // normalization constant based on graph structure Graph* graph_cpu; // the input graph, |V| = N - label_t get_label(size_t i) { return labels[i]; } + label_t get_label(size_t i) { + // TODO global id only or lid only or both? + return labels[i]; + } size_t read_graph_cpu(std::string dataset_str); float_t* get_in_ptr(); }; diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index c30c72f730..5dc08fc99e 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,6 +1,9 @@ -#pragma once +#ifndef __DG_GTYPES__ +#define __DG_GTYPES__ + #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" +#include "deepgalois/types.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/NewGeneric.h" #endif @@ -23,3 +26,5 @@ using Graph = galois::graphs::DistGraph; #endif typedef Graph::GraphNode GNode; + +#endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 4f481d1d0a..f905d2a595 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -30,7 +30,12 @@ namespace deepgalois { class Net { public: Net() {} + #ifndef GALOIS_USE_DIST void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); + #else + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, Graph* dGraph); + #endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -96,20 +101,24 @@ class Net { // back propogation void bprop() { - for (size_t i = num_layers; i != 0; i--) + for (size_t i = num_layers; i != 0; i--) { layers[i - 1]->backward(); + } } // update trainable weights after back-propagation void update_weights(optimizer* opt) { - for (size_t i = 0; i < num_layers; i++) - if (layers[i]->trainable()) + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { layers[i]->update_weight(opt); + } + } } // evaluate, i.e. 
inference or predict double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 51c0bb5c95..ad33285879 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,6 +7,9 @@ #include #include #include +#ifdef GALOIS_USE_DIST +#include "deepgalois/gtypes.h" +#endif namespace deepgalois { @@ -100,7 +103,9 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } -//! Get masks from datafile where first line tells range of + +#ifndef GALOIS_USE_DIST +//! Get masks from datafile where first line tells range of //! set to create mask from inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, @@ -134,5 +139,44 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, in.close(); return sample_count; } +#else +//! Get masks from datafile where first line tells range of +//! set to create mask from; needs graph object due to local IDs +inline size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks, Graph* dGraph) { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} +#endif } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 22ecdbb1c6..a6b85965fe 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -4,14 +4,46 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::saveGraph(Graph* dGraph) { - // TODO - return 0; +void DistContext::saveGraph(Graph* dGraph) { + graph_cpu = dGraph; } size_t DistContext::read_labels(std::string dataset_str) { - // TODO - return 0; + Graph* dGraph = DistContext::graph_cpu; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading labels...\n"); + + //std::string filename = path + dataset_str + "-labels.txt"; + //std::ifstream in; + //std::string line; + //in.open(filename, std::ios::in); + //size_t m; + //// read file header + //an >> m >> num_classes >> std::ws; + //assert(m == dGraph->globalSize()); + //// size of labels is only # local nodes + //labels.resize(dGraph.size(), 0); + + //unsigned v = 0; + //while (std::getline(in, line)) { + // std::istringstream label_stream(line); + // unsigned x; + // for (size_t idx = 0; idx < num_classes; ++idx) { + // label_stream >> x; + // if (x != 0) { + // labels[v] = idx; + // break; + // } + // } + // 
v++; + //} + //in.close(); + + //// print the number of vertex classes + //std::cout << "Done, unique label counts: " << num_classes + // << ", time: " << t_read.Millisecs() << " ms\n"; + //return num_classes; } + size_t DistContext::read_features(std::string dataset_str) { // TODO return 0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 53baa60c13..704951f59e 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -6,15 +6,22 @@ namespace deepgalois { +#ifndef GALOIS_USE_DIST void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { - #ifndef GALOIS_USE_DIST +#else +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, Graph* dGraph) { +#endif +#ifndef GALOIS_USE_DIST context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); - #else +#else context = new deepgalois::DistContext(); + num_samples = dGraph->size(); + context->saveGraph(dGraph); // TODO self loop? // TODO num samples - #endif +#endif // read graph, get num nodes num_classes = context->read_labels(dataset_str); @@ -28,14 +35,35 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool train_begin = 0, train_count = 153431, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - for (size_t i = train_begin; i < train_end; i++) - train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) - val_mask[i] = 1; + // TODO do all can be used below +#ifndef GALOIS_USE_DIST + for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; +#else + // find local ID from global ID, set if it exists + for (size_t i = train_begin; i < train_end; i++) { + if (dGraph->isLocal(i)) { + train_mask[dGraph->getLID(i)] = 1; + } + } + for (size_t i = val_begin; i < val_end; i++) { + if (dGraph->isLocal(i)) { + val_mask[dGraph->getLID(i)] = 1; + } + } +#endif } else { +#ifndef GALOIS_USE_DIST train_count = read_masks(dataset_str, "train", train_begin, train_end, train_mask); val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); +#else + train_count = + read_masks(dataset_str, "train", train_begin, train_end, train_mask, + dGraph); + val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask, + dGraph); +#endif } //std::cout << "Done\n"; @@ -132,7 +160,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks accuracy_all.reset(); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { - int preds = argmax(num_classes, + int preds = argmax(num_classes, &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 3642959b95..d688258cd3 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -15,22 +15,24 @@ int main(int argc, char** argv) { #else galois::DistMemSys G; #endif - LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train #ifdef GALOIS_USE_DIST - std::vector dummy; - Graph* testing = galois::graphs::constructSymmetricGraph(dummy); + std::vector dummyVec; + Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif +#ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata network.init(dataset, epochs, 
hidden1, add_selfloop); +#else + network.init(dataset, epochs, hidden1, add_selfloop, dGraph); +#endif network.construct_layers(); // default setting for now; can be customized by // the user network.print_layers_info(); - // tracks peak memory usage deepgalois::ResourceManager rm; @@ -54,10 +56,24 @@ int main(int argc, char** argv) { test_begin = 177262; test_count = 55703; test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST for (size_t i = test_begin; i < test_end; i++) test_mask[i] = 1; - } else +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_mask[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); +#else + test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, + test_mask, dGraph); +#endif + } galois::StatTimer Ttest("Test"); Ttest.start(); double test_time = network.evaluate(test_begin, test_end, test_count, From eada9951fb134c82b0769495236be23e55d340c5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:01:39 -0500 Subject: [PATCH 147/660] do_all aggregator fix for shared mem deepgalois --- libdeepgalois/src/layers/aggregator.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4e86f148e8..40a8fdcf8f 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,7 +6,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { // zero out the output data #ifndef GALOIS_USE_DIST - galois::do_all(g, + galois::do_all(galois::iterate(g), #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), From 32f1c47e147ba9955a9a7196e493718024748c42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:18:34 -0500 Subject: [PATCH 148/660] cmake changes for DEEPGALOIS + HETERO DistContext currently does not support gpus, so have to separate it out for now --- CMakeLists.txt | 39 +++++++++++++++++++++++++++++++++++---- 1 file changed, 35 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f1b0489c10..4be9753f54 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -182,7 +182,36 @@ add_definitions(-DGALOIS_COPYRIGHT_YEAR=${GALOIS_COPYRIGHT_YEAR}) # Distributed-heterogeneous features if(ENABLE_HETERO_GALOIS) - set(ENABLE_DIST_GALOIS ON) + if (NOT USE_DEEPGALOIS) + # do not turn on DIST_GALOIS by default if DEEP_GALOIS is enabled + # with HETERO galois + set(ENABLE_DIST_GALOIS ON) + endif() + if (USE_DEEPGALOIS) + SET(CUDA_SEPARABLE_COMPILATION ON) + find_package(CUDA REQUIRED) + set(CUDA_PROPAGATE_HOST_FLAGS off) + set(CUDA_SEPARABLE_COMPILATION on) + set(CUDA_HOST_COMPILER g++) + + string(REPLACE "." 
"" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) + endforeach() + + cuda_include_directories("${CMAKE_SOURCE_DIR}/libgpu/include") + + # MGPU v1.1 + set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers + cuda_include_directories("${MGPU_ROOT}/src") + + # CUB v1.6.4 + set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers + cuda_include_directories("${CUB_ROOT}") + + #find_package(OpenCL REQUIRED) + endif() endif() if(ENABLE_DIST_GALOIS) add_definitions(-DGALOIS_USE_DIST) @@ -509,10 +538,12 @@ if(ENABLE_DIST_GALOIS) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) - if(ENABLE_HETERO_GALOIS) - add_subdirectory(libgpu) - endif(ENABLE_HETERO_GALOIS) endif(ENABLE_DIST_GALOIS) + +if(ENABLE_HETERO_GALOIS) + add_subdirectory(libgpu) +endif(ENABLE_HETERO_GALOIS) + add_subdirectory(tools) add_subdirectory(scripts) From 110793810dd57ab217b6921597fd8e7dab199c60 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:28:00 -0500 Subject: [PATCH 149/660] naive distirubted label reading for deepgalois complete --- libdeepgalois/src/DistContext.cpp | 76 ++++++++++++++++++------------- 1 file changed, 44 insertions(+), 32 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index a6b85965fe..4847079376 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -10,38 +10,50 @@ void DistContext::saveGraph(Graph* dGraph) { size_t DistContext::read_labels(std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "] Reading labels...\n"); - - //std::string filename = path + dataset_str + "-labels.txt"; - //std::ifstream in; - //std::string line; - //in.open(filename, std::ios::in); - //size_t m; - //// read file header - //an >> m >> num_classes >> std::ws; - //assert(m == dGraph->globalSize()); - //// size of labels is only # local nodes - //labels.resize(dGraph.size(), 0); - - //unsigned v = 0; - //while (std::getline(in, line)) { - // std::istringstream label_stream(line); - // unsigned x; - // for (size_t idx = 0; idx < num_classes; ++idx) { - // label_stream >> x; - // if (x != 0) { - // labels[v] = idx; - // break; - // } - // } - // v++; - //} - //in.close(); - - //// print the number of vertex classes - //std::cout << "Done, unique label counts: " << num_classes - // << ", time: " << t_read.Millisecs() << " ms\n"; - //return num_classes; + galois::gPrint("[", myID, "] Reading labels from disk...\n"); + + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m; + // read file header + in >> m >> num_classes >> std::ws; + assert(m == dGraph->globalSize()); + // size of labels should be # local nodes + labels.resize(dGraph->size(), 0); + + uint32_t foundVertices = 0; + unsigned v = 0; + // each line contains a set of 0s and 1s + while (std::getline(in, line)) { + // only bother if local node + if (dGraph->isLocal(v)) { + std::istringstream label_stream(line); + unsigned x; + // for each class + for (size_t idx = 0; idx < num_classes; ++idx) { + // check if that class is labeled + label_stream >> x; + if (x != 0) { + // set local id + labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } + 
} + // always increment v + v++; + } + + in.close(); + + // print the number of vertex classes + galois::gPrint("[", myID, "] Done with labels, unique label counts: ", + num_classes, "; set ", foundVertices, " nodes\n"); + + return num_classes; } size_t DistContext::read_features(std::string dataset_str) { From c891c338f41aba5a9275037db9d1b5337f4997a4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 17:47:09 -0500 Subject: [PATCH 150/660] feature reading for distributed case --- .../include/deepgalois/DistContext.h | 3 +- libdeepgalois/src/DistContext.cpp | 41 ++++++++++++++++++- libdeepgalois/src/net.cpp | 1 - 3 files changed, 41 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c7317dd7d2..5449b337fb 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -26,13 +26,14 @@ class DistContext { void norm_factor_counting(); // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure + float_t* norm_factor; // normalization constant based on graph structure Graph* graph_cpu; // the input graph, |V| = N label_t get_label(size_t i) { // TODO global id only or lid only or both? return labels[i]; } + size_t read_graph_cpu(std::string dataset_str); float_t* get_in_ptr(); }; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4847079376..3859263f1d 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -57,8 +57,45 @@ size_t DistContext::read_labels(std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - // TODO - return 0; + Graph* dGraph = DistContext::graph_cpu; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reading features from disk...\n"); + + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + std::string line; + + in.open(filename, std::ios::in); + size_t m; // m = number of global vertices + + // header read + in >> m >> feat_len >> std::ws; + // use local size, not global size + h_feats.resize(dGraph->size() * feat_len, 0); + + // loop through all features + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + // vertex to set feature for + edge_stream >> u; + // only set if local + if (dGraph->isLocal(u)) { + // feature index + edge_stream >> v; + // actual feature + edge_stream >> w; + + h_feats[dGraph->getLID(u) * feat_len + v] = w; + } + } + in.close(); + + galois::gPrint("[", myID, "] Done with features, feature length: ", + feat_len, "\n"); + + return feat_len; } float_t* DistContext::get_in_ptr() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 704951f59e..a101ddb4ff 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -20,7 +20,6 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, num_samples = dGraph->size(); context->saveGraph(dGraph); // TODO self loop? 
- // TODO num samples #endif // read graph, get num nodes From eb4c80d5db5fecb522b935e439c39bd89f465631 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 18:01:33 -0500 Subject: [PATCH 151/660] the rest of missing distcontext functions TODO norm factor needs dist execution --- .../include/deepgalois/DistContext.h | 18 +++++++++++------ libdeepgalois/src/DistContext.cpp | 20 ++++++++++++++++--- 2 files changed, 29 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 5449b337fb..15b91babda 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,31 +10,37 @@ namespace deepgalois { class DistContext { - size_t n; // number of samples: N + size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D std::vector labels; // labels for classification: N x 1 vec_t h_feats; // input features: N x D public: + // TODO why are these public + float_t* norm_factor; // normalization constant based on graph structure + Graph* graph_cpu; // the input graph, |V| = N + DistContext(); ~DistContext(); + //! save graph pointer to context object void saveGraph(Graph* dGraph); + //! read labels of local nodes only size_t read_labels(std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + //! find norm factor by looking at degree + // TODO this is a distributed operation void norm_factor_counting(); - // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure - Graph* graph_cpu; // the input graph, |V| = N - + //! return label for some node label_t get_label(size_t i) { // TODO global id only or lid only or both? return labels[i]; } - size_t read_graph_cpu(std::string dataset_str); + //! 
returns pointer to the features of each local node float_t* get_in_ptr(); }; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3859263f1d..90214b19d0 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,7 +6,10 @@ DistContext::~DistContext() {} void DistContext::saveGraph(Graph* dGraph) { graph_cpu = dGraph; + + localVertices = graph_cpu->size(); } + size_t DistContext::read_labels(std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -99,12 +102,23 @@ size_t DistContext::read_features(std::string dataset_str) { } float_t* DistContext::get_in_ptr() { - // TODO - return nullptr; + return &h_feats[0]; } void DistContext::norm_factor_counting() { - // TODO + // TODO: this is a distributed operation + + // create for now, TODO need to actually fill it in + norm_factor = new float_t[localVertices]; + //galois::do_all(galois::iterate((size_t)0, localVertices), + // [&](auto v) { + // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); + // float_t temp = std::sqrt(float_t(degree)); + // if (temp == 0.0) norm_factor[v] = 0.0; + // else norm_factor[v] = 1.0 / temp; + // }, galois::loopname("NormCounting")); + + return; } } // deepgalois From 56b945d443617559a7118232c624652d56e0292b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Mar 2020 18:49:02 -0500 Subject: [PATCH 152/660] temp dist context fix for norm_factor --- libdeepgalois/src/DistContext.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 90214b19d0..768b1dbab9 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -110,6 +110,11 @@ void DistContext::norm_factor_counting() { // create for now, TODO need to actually fill it in norm_factor = new float_t[localVertices]; + galois::do_all(galois::iterate((size_t)0, localVertices), + [&](auto v) { + norm_factor[v] = 0.01; + }, galois::loopname("NormCounting")); + //galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); From aa11e2d083988b9974f998c57ae681a0845706b7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 10 Mar 2020 18:54:24 -0500 Subject: [PATCH 153/660] gpu errors --- libdeepgalois/include/deepgalois/math_functions.hh | 1 + libdeepgalois/src/context.cu | 7 +++++++ libdeepgalois/src/math_functions.cu | 4 ++++ libdeepgalois/src/net.cu | 1 + 4 files changed, 13 insertions(+) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 593ef03c5c..27866be13c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -144,6 +144,7 @@ bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); +void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, unsigned*& masks, float_t*& in, float_t*& out, float_t*& matrix, float_t*& grad); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index dfb0e3cc5e..4d77433eda 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -136,5 +136,12 @@ void 
Context::copy_data_to_device() { //print_device_vector(10, d_feats, "d_feats"); } +//void Context::copy_data_to_device() { + //float_malloc_device(n, d_labels); + //float_copy_device(n, &labels[0], d_labels); + //float_malloc_device(n*feat_len, d_feats); + //float_copy_device(n*feat_len, &h_feats[0], d_feats); +//} + float_t* Context::get_in_ptr() { return d_feats; } } // namespace context diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e723ba289f..7cb5253e13 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -61,6 +61,10 @@ void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } +void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 947967d07c..62dec7cad4 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,4 +1,5 @@ #include "deepgalois/net.h" +#include "deepgalois/cutils.h" #include "gg.h" #include "ggcuda.h" From ef6372ed8ae6f642998f1eab719c7aeb7f1d182e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 11 Mar 2020 10:41:12 -0500 Subject: [PATCH 154/660] d_softmax_cross_entropy_kernel --- libdeepgalois/src/math_functions.cu | 55 ++++++++++++++++++++++------- 1 file changed, 43 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 7cb5253e13..6ebc222412 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -322,8 +322,29 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } +__device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { + for (int i = 0; i < n; i++) { + if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); + else d[i] = 0.0; + } +} + +__global__ void d_cross_entropy_kernel(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + int base = begin * len; + CUDA_KERNEL_LOOP(i, (end-begin)*len) { + int id = begin + i/len; + if (masks[id] == 1) { // masked + if (i%len == (int)labels[id]) grad[i] = -1.0 / (data[i+base] + 1e-10); + else grad[i] = 0.0; + //d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + } + } +} + // TODO: use warp -__device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* dy) { +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; for (int j = 0; j < n; j++) { @@ -333,14 +354,16 @@ __device__ void d_softmax(int n, const float_t* p, const float_t* dp, float_t* d } } -__device__ void d_cross_entropy(int n, const label_t idx, const float_t* p, float_t* d) { - for (int i = 0; i < n; i++) { - //assert(p[i] >= 0.0); - //assert(p[i] >= 0.0 && p[i] <= 1.0); - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; +__global__ void d_softmax_kernel(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + d_softmax_device(len, data + len*id, in_grad + len*i, 
out_grad + len*id); + } } -} +} __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, @@ -349,8 +372,8 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, int id = begin + i; if (masks[id] == 1) { // masked float_t out_grad[41]; // TODO - d_cross_entropy(len, labels[id], out + len*id, out_grad); - d_softmax(len, out + len*id, out_grad, diff + len*id); + d_cross_entropy_device(len, labels[id], out + len*id, out_grad); + d_softmax_device(len, out + len*id, out_grad, diff + len*id); } } } @@ -358,8 +381,16 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_softmax_cross_entropy_kernel<<>>( - len, begin, end, masks, labels, out, diff); +// d_softmax_cross_entropy_kernel<<>>( +// len, begin, end, masks, labels, out, diff); +// CudaTest("solving d_softmax_cross_entropy kernel failed"); + float_t *grad; + float_malloc_device((end-begin)*len, grad); + d_cross_entropy_kernel<<>>( + len, begin, end, masks, labels, out, grad); + CudaTest("solving d_cross_entropy kernel failed"); + d_softmax_kernel<<>>( + len, begin, end, masks, out, grad, diff); CudaTest("solving d_softmax_cross_entropy kernel failed"); } From a447d05253a722d6ee5ca0f1c3712337693e8be5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 11 Mar 2020 14:34:52 -0500 Subject: [PATCH 155/660] gpu softmax updated --- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/math_functions.cu | 137 +++++++++++++++++++++-- 2 files changed, 130 insertions(+), 8 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 1a32d5a47d..611ba57828 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -25,6 +25,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: #define TB_SIZE 256 #define BLOCK_SIZE 256 #define WARP_SIZE 32 +#define MAX_NUM_CLASSES 64 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE #endif diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 6ebc222412..e15a503eca 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -343,6 +343,36 @@ __global__ void d_cross_entropy_kernel(int len, int begin, int end, } } +__global__ void d_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + 
grad[wid*len+pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else grad[wid*len+pid] = 0.0; + } + } + } + } +} // TODO: use warp __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { @@ -365,6 +395,46 @@ __global__ void d_softmax_kernel(int len, int begin, int end, } } +__global__ void d_softmax_warp(int len, int begin, int end, + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + p[warp_lane][pid] = data[base+pid]; + d[warp_lane][pid] = in_grad[wid*len+pid]; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + out_grad[base+pid] = sum; + } + } + __syncthreads(); + } + } +} + __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { @@ -378,19 +448,70 @@ __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, } } +__global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + if (pid == (int)labels[id]) + d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else d[warp_lane][pid] = 0.0; + } + } + __syncthreads(); + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t sum = 0.0; + float_t self = p[warp_lane][pid]; + for (int j = 0; j < len; j++) { + float_t df = (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; + sum += df * d[warp_lane][j]; + } + grad[base+pid] = sum; + } + } + __syncthreads(); + } + } +} + void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { // d_softmax_cross_entropy_kernel<<>>( // len, begin, end, masks, labels, out, diff); // CudaTest("solving d_softmax_cross_entropy kernel failed"); - float_t *grad; - float_malloc_device((end-begin)*len, grad); - d_cross_entropy_kernel<<>>( - len, begin, end, masks, labels, out, grad); - CudaTest("solving d_cross_entropy kernel failed"); - d_softmax_kernel<<>>( - len, begin, end, masks, out, grad, diff); - CudaTest("solving d_softmax_cross_entropy kernel failed"); + //float_t *grad; + //float_malloc_device((end-begin)*len, grad); + //d_cross_entropy_kernel<<>>( + //d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, labels, out, grad); + //CudaTest("solving d_cross_entropy kernel failed"); + //d_softmax_kernel<<>>( + //d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // len, begin, end, masks, out, grad, diff); + //CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + len, begin, end, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } From b36c722acc90441ca87e6aeec3879f48e8e2d1d5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 13 Mar 2020 08:53:15 -0500 Subject: [PATCH 156/660] match DGL gpu performance --- .../deepgalois/layers/graph_conv_layer.h | 2 + .../include/deepgalois/math_functions.hh | 4 +- libdeepgalois/src/layers/graph_conv_layer.cu | 64 +++++++++++++++---- libdeepgalois/src/math_functions.cu | 61 ++++++++++++------ 4 files changed, 94 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 66749a8572..c77467eeca 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -49,6 +49,7 @@ class graph_conv_layer : public layer { #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); + void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); private: bool act_; // whether to use activation function at the end @@ -63,6 +64,7 @@ class graph_conv_layer : public layer { size_t z; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; + float_t* in_temp1; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y float_t* norm_factor; // normalization constant based on graph structure diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 27866be13c..ffc0343438 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -140,12 +140,10 @@ void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void 
float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned*& masks, float_t*& in, float_t*& out, - float_t*& matrix, float_t*& grad); #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index b2a9209bd4..15796c95d3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -3,7 +3,26 @@ namespace deepgalois { void graph_conv_layer::init() { - gconv_malloc_device(x, y, z, dropout_, dropout_mask, in_temp, out_temp, d_W, layer::d_weight_grad); + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); + //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); + float_malloc_device(x*y, in_temp); + init_const_gpu(x*y, 0.0, in_temp); + if (y <= z) { + float_malloc_device(x*y, in_temp1); + init_const_gpu(x*y, 0.0, in_temp1); + } + //CUDA_CHECK(cudaMalloc((void**)&out_temp, x * z * sizeof(float_t))); + float_malloc_device(x*z, out_temp); + init_const_gpu(x*z, 0.0, out_temp); + //CUDA_CHECK(cudaMalloc((void**)&d_W, y * z * sizeof(float_t))); + float_malloc_device(y*z, d_W); + auto init_range = sqrt(6.0 / (y + z)); + // Glorot & Bengio (AISTATS 2010) + rng_uniform_gpu(y * z, -init_range, init_range, d_W); + //CUDA_CHECK(cudaMalloc((void**)&layer::d_weight_grad, y * z * sizeof(float_t))); + float_malloc_device(y*z, layer::d_weight_grad); + //CUDA_CHECK(cudaMemset(layer::d_weight_grad, 0, y * z * sizeof(float_t))); + init_const_gpu(y*z, 0.0, layer::d_weight_grad); } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { @@ -14,18 +33,31 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo #endif } +void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +#ifdef USE_CUSPARSE + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); +#else + deepgalois::update_all(len, g, in, out, norm_, norm_factor); +#endif +} + void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { } // GPU forward: compute output features +// NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - //assert(y <= 128); // currently only support feature length <= 128 + assert(z <= MAX_NUM_CLASSES); // currently only support feature length <= 128 init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) { + if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - } else sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + } else { + graph_conv_layer::aggregate(y, context->graph_gpu, in_temp, in_temp1); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); + } if (act_) relu_gpu(x * z, out_data, out_data); } @@ -34,16 +66,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* 
out_grad, float_t* in_grad) { if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); -#ifdef USE_CUSPARSE - update_all_csrmm(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); -#else - update_all(z, context->graph_gpu, out_grad, out_temp, norm_, norm_factor); -#endif - if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); - if (dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); + if (y > z) { + graph_conv_layer::d_aggregate(z, context->graph_gpu, out_grad, out_temp); + if (level_ != 0) + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + } else { + if (level_ != 0) { + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); + graph_conv_layer::d_aggregate(y, context->graph_gpu, in_temp, in_grad); + } + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + if (level_ != 0 && dropout_) + d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } } // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index e15a503eca..8002d728a5 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -32,7 +32,7 @@ void gpu_rng_uniform(const int n, unsigned* r) { CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); } -void gpu_rng_uniform(const int n, const float_t a, const float_t b, float_t* r) { +void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) @@ -71,22 +71,6 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } -void gconv_malloc_device(size_t x, size_t y, size_t z, bool dropout, - unsigned*& masks, float_t*& in, float_t*& out, - float_t*& matrix, float_t*& grad) { - if (dropout) CUDA_CHECK(cudaMalloc((void**)&masks, x * y * sizeof(unsigned))); - CUDA_CHECK(cudaMalloc((void**)&in, x * y * sizeof(float_t))); - init_const_gpu(x*y, 0.0, in); - CUDA_CHECK(cudaMalloc((void**)&out, x * z * sizeof(float_t))); - init_const_gpu(x*z, 0.0, out); - CUDA_CHECK(cudaMalloc((void**)&matrix, y * z * sizeof(float_t))); - auto init_range = sqrt(6.0 / (y + z)); - // Glorot & Bengio (AISTATS 2010) - gpu_rng_uniform(y * z, -init_range, init_range, matrix); - CUDA_CHECK(cudaMalloc((void**)&grad, y * z * sizeof(float_t))); - CUDA_CHECK(cudaMemset(grad, 0, y * z * sizeof(float_t))); -} - __global__ void setup_curand_kernel(const int n, curandState* state) { CUDA_KERNEL_LOOP(i, n) { // curand_init(1234, i, 0, &state[i]); // Each thread gets same seed 1234 @@ -185,16 +169,21 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C) { - // std::cout << "[debug]: matmul1D1D_gpu\n"; const CBLAS_TRANSPOSE TransA = CblasNoTrans; const CBLAS_TRANSPOSE TransB = CblasNoTrans; sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } +// C = A x B, where A is a sparse matrix in CSR format, B is 
the dense matrix for vertex +// feature tensor. However, since cusparse only supports column-major, while feature +// tensor is stored in row-major, the actual computation is: C = trans(A x trans(B)). +// Currently, we use cublasSgeam to implement transposition and allocate intermediate +// workspace memory (transpose_C) for this. void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, @@ -203,9 +192,41 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float one = 1.0; const float zero = 0.0; CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, transpose_C, M, C, N)); + N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } - +/* +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, + const float* B, const float beta, float *transpose_C, float* C) { + std::cout << "[debug]: csrmm_gpu\n"; + cusparseSpMatDescr_t A_descr; + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, A_nonzeros, + CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + cusparseDnMatDescr_t B_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, CUSPARSE_ORDER_COL)); + cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); + size_t bufferSize; + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, + CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); + cudaDeviceSynchronize(); + void* buffer = NULL; + if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); + CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, + CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); + cudaDeviceSynchronize(); + //transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, + N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +} +//*/ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { From 6f2955d076e06d7479d5e694ecf2ddc62d0ce9cf Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 13 Mar 2020 10:17:50 -0500 Subject: [PATCH 157/660] fix bug --- libdeepgalois/src/layers/graph_conv_layer.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 15796c95d3..12d9902179 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -51,6 +51,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ 
init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + else copy_gpu(x*y, in_data, in_temp); if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); From 17184398f80797ca8d25646beb2dd6a8eba7a09a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 15 Mar 2020 16:05:56 -0500 Subject: [PATCH 158/660] add gin --- libdeepgalois/include/deepgalois/types.h | 2 ++ lonestargnn/gin/CMakeLists.txt | 9 ++++++ lonestargnn/gin/gin.cpp | 35 ++++++++++++++++++++++++ 3 files changed, 46 insertions(+) create mode 100644 lonestargnn/gin/CMakeLists.txt create mode 100644 lonestargnn/gin/gin.cpp diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 611ba57828..c1658c045f 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -21,6 +21,8 @@ typedef float acc_t; // Accuracy type typedef short label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test +typedef uint32_t VertexID; + #define CHUNK_SIZE 256 #define TB_SIZE 256 #define BLOCK_SIZE 256 diff --git a/lonestargnn/gin/CMakeLists.txt b/lonestargnn/gin/CMakeLists.txt new file mode 100644 index 0000000000..f32f47179e --- /dev/null +++ b/lonestargnn/gin/CMakeLists.txt @@ -0,0 +1,9 @@ +app(gin gin.cpp) +target_link_libraries(gin dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gin distgraphloader) +endif() +if(ENABLE_HETERO_GALOIS) + target_link_libraries(gin dg_gpu) + target_link_libraries(gin -lcudart -lcublas -lcurand -lcudadevrt) +endif() diff --git a/lonestargnn/gin/gin.cpp b/lonestargnn/gin/gin.cpp new file mode 100644 index 0000000000..aecfcf9b35 --- /dev/null +++ b/lonestargnn/gin/gin.cpp @@ -0,0 +1,35 @@ +// Graph Neural Networks +// Xuhao Chen +#include "lonestargnn.h" +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif + +const char* name = "Graph Isomorphism Network (GIN)"; +const char* desc = "Graph isomorphism neural networks on an undirected graph"; +const char* url = 0; +static cll::optlearn_eps("le", cll::desc("whether to learn the parameter epsilon (default value false)"), cll::init(0)); +static cll::optagg_type("at", cll::desc("Aggregator Type"), cll::init("sum")); + +template <> +class graph_conv_layer { +public: + FV apply_edge(VertexID src, VertexID dst, FV2D in_data) { + return in_data[dst]; + } + FV apply_vertex(VertexID src, FV2D in_data) { + FV a = deepgalois::matmul(deepgalois::accum, deepgalois::W); + FV b = deepgalois::scale(in_data[src], 1.0 + self.eps); + return deepgalois::vadd(a, b); + } +}; + +int main(int argc, char** argv) { + galois::SharedMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + deepgalois::Net network; // the neural network to train + + graph_conv_layer layer0; + return 0; +} + From 04bac8e4d0634e8ef3a8a85b66ff95ffb25f5322 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 15:36:15 -0500 Subject: [PATCH 159/660] initialization fo sync substrate --- .../include/deepgalois/DistContext.h | 7 ++++++- libdeepgalois/src/DistContext.cpp | 14 +++++++++++++ libdeepgalois/src/net.cpp | 21 +++++++++---------- 3 files changed, 30 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h 
b/libdeepgalois/include/deepgalois/DistContext.h index 15b91babda..4baaaae8a9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -3,6 +3,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ +#include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" #include "deepgalois/utils.h" #include "deepgalois/gtypes.h" @@ -15,6 +16,7 @@ class DistContext { size_t feat_len; // input feature length: D std::vector labels; // labels for classification: N x 1 vec_t h_feats; // input features: N x D + galois::graphs::GluonSubstrate* syncSubstrate; public: // TODO why are these public @@ -34,9 +36,12 @@ class DistContext { // TODO this is a distributed operation void norm_factor_counting(); + void initializeSyncSubstrate(); + galois::graphs::GluonSubstrate* getSyncSubstrate(); + //! return label for some node + //! NOTE: this is LID, not GID label_t get_label(size_t i) { - // TODO global id only or lid only or both? return labels[i]; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 768b1dbab9..c206c1d654 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -126,4 +126,18 @@ void DistContext::norm_factor_counting() { return; } +void DistContext::initializeSyncSubstrate() { + DistContext::syncSubstrate = + new galois::graphs::GluonSubstrate( + *DistContext::graph_cpu, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, + false + ); +} + +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { + return DistContext::syncSubstrate; +}; + } // deepgalois diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a101ddb4ff..c14a8397c6 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -19,7 +19,8 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, context = new deepgalois::DistContext(); num_samples = dGraph->size(); context->saveGraph(dGraph); - // TODO self loop? + // TODO self loop setup? 
+ context->initializeSyncSubstrate(); #endif // read graph, get num nodes @@ -90,8 +91,7 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - std::cout << "Epoch " << std::setw(2) << i << std::fixed - << std::setprecision(3) << ":"; + galois::gPrint("Epoch ", std::setw(2), i, std::fixed, std::setprecision(3), ":"); t_epoch.Start(); // training steps @@ -121,8 +121,8 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - std::cout << " train_loss = " << std::setw(5) << train_loss - << " train_acc = " << std::setw(5) << train_acc; + galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", + std::setw(5), train_acc); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -132,13 +132,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - std::cout << " val_loss = " << std::setw(5) << val_loss - << " val_acc = " << std::setw(5) << val_acc; - std::cout << " time = " << epoch_time + val_time - << " ms (train_time = " << epoch_time - << " val_time = " << val_time << ")\n"; + galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", + std::setw(5), val_acc); + galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", + epoch_time, " val_time = ", val_time, ")\n"); } else { - std::cout << " train_time = " << epoch_time << " ms\n"; + galois::gPrint(" train_time = ", epoch_time, " ms\n"); } } } From 3d591c3e6b9a359ee236916767dc03a84bf1ff7a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 17:24:13 -0500 Subject: [PATCH 160/660] fix compile issue of conv layer on cpu --- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index c77467eeca..2267b1a55c 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -49,7 +49,9 @@ class graph_conv_layer : public layer { #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); +#ifndef CPU_ONLY void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); +#endif private: bool act_; // whether to use activation function at the end From c5f5843b5cb96e39d321d39b9ea2bc09fd5cb1f2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 17:34:35 -0500 Subject: [PATCH 161/660] Sync structs for graph conv layer defined --- .../layers/GraphConvSyncStructures.h | 63 +++++++++++++++++++ .../deepgalois/layers/graph_conv_layer.h | 3 + libdeepgalois/include/deepgalois/types.h | 16 ++++- 3 files changed, 80 insertions(+), 2 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h new file mode 100644 index 0000000000..3b95d55f82 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -0,0 +1,63 @@ +#ifndef __GRAPH_CONV_SYNC_STRUCT__ +#define __GRAPH_CONV_SYNC_STRUCT__ + +struct GraphConvSync { + using ValTy = std::vector; + + //! 
return a vector of floats to sync + static ValTy extract(uint32_t node_id, char& filler) { + // TODO figure out how to avoid copy from C array to vector; best + // way is if original data is in a vector probably, but that has the + // issue of not being able to directly call BLAS + ValTy vecToReturn; + // allocate space + vecToReturn.resize(deepgalois::_syncVectorSize); + // copy the node's data to vector to serialize/send + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + vecToReturn[i] = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; + } + // move constructor should kick in here to avoid return copy + return vecToReturn; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char& filler, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t node_id, char& filler) { + } + + //! element wise set + static void setVal(uint32_t node_id, char& filler, ValTy y) { + assert(y.size() == deepgalois::_syncVectorSize); + // loop and do addition + for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, + DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +#endif diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 2267b1a55c..0bf7a7e698 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,6 +1,9 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" +#ifdef GALOIS_USE_DIST +#include "deepgalois/layers/GraphConvSyncStructures.h" +#endif /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index c1658c045f..3c3c7ce747 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -1,5 +1,5 @@ -#ifndef TYPES_H -#define TYPES_H +#ifndef _GNN_TYPES_H_ +#define _GNN_TYPES_H_ #include #include @@ -30,4 +30,16 @@ typedef uint32_t VertexID; #define MAX_NUM_CLASSES 64 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE + + +#ifdef GALOIS_USE_DIST +namespace deepgalois { + //! Set this to let sync struct know where to get data from + static float_t* _dataToSync = nullptr; + //! Set this to let sync struct know the size of the vector to use during + //! 
sync + static long unsigned _syncVectorSize = 0; +} +#endif + #endif From 6c500c55704285f8d778fa19f15a94e89752bcac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:12:26 -0500 Subject: [PATCH 162/660] norm factor is temporarily 1 for dist execution TODO needs to be based on degree? --- libdeepgalois/src/DistContext.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index c206c1d654..9069fad351 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -112,7 +112,7 @@ void DistContext::norm_factor_counting() { norm_factor = new float_t[localVertices]; galois::do_all(galois::iterate((size_t)0, localVertices), [&](auto v) { - norm_factor[v] = 0.01; + norm_factor[v] = 1; }, galois::loopname("NormCounting")); //galois::do_all(galois::iterate((size_t)0, localVertices), From 2c94782bd7b4393c4a22c36f9adefcc32f93dff8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:21:27 -0500 Subject: [PATCH 163/660] less messy prints for dist execution purposes (new line) TODO merge evertyhign into single print --- libdeepgalois/src/net.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c14a8397c6..f8d21dee99 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -122,7 +122,7 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", - std::setw(5), train_acc); + std::setw(5), train_acc, "\n"); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -133,7 +133,7 @@ void Net::train(optimizer* opt, bool need_validate) { val_loss, val_acc); Tval.stop(); galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", - std::setw(5), val_acc); + std::setw(5), val_acc, "\n"); galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", epoch_time, " val_time = ", val_time, ")\n"); } else { From b29c1a54da2a58dcb52c4a01b88247744bba7034 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:25:42 -0500 Subject: [PATCH 164/660] sync calls added to graph_conv_layer TODO weight gradient combination --- libdeepgalois/src/layers/graph_conv_layer.cpp | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 86ab1abd2f..f3dbd62e94 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,7 +71,11 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); - // TODO sync required here + // TODO sync of out_data required here + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync("AggSync"); // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); @@ -88,7 +92,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z - // TODO sync required here + // sync agg 
+ deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync("AggSyncBack"); // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights From dbb0204b9b94c08a1069b0c0523f3f0a45504028 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:36:48 -0500 Subject: [PATCH 165/660] for now gradient sync is a trivial summation TODO change it to something different --- .../include/deepgalois/layers/GradientSyncStructs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index df88352bcf..d0074d11ed 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -13,10 +13,10 @@ struct GradientSync { static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough - if (std::abs(weight - y) > 0.00001) { - galois::gInfo("weight ", node_id, " not consistent with one received"); - } - + //if (std::abs(weight - y) > 0.00001) { + // galois::gInfo("weight ", node_id, " not consistent with one received"); + //} + weight += y; return true; } From 75f2dad1c51dab464f188be2f37701672b2f43a2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Mar 2020 18:44:41 -0500 Subject: [PATCH 166/660] USE_DST wrapping around sync calls --- libdeepgalois/src/layers/graph_conv_layer.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f3dbd62e94..171b32305c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -71,12 +71,13 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // aggregate based on graph topology graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); +#ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync("AggSync"); - +#endif // run relu activation on output if specified if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); } @@ -92,11 +93,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z +#ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync("AggSyncBack"); +#endif // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights From 61af2fcb94aeb71ce31ea3a9dacac29a9fe274df Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 11:44:12 -0500 Subject: [PATCH 167/660] net: print modifications that take into account dist execution --- libdeepgalois/src/net.cpp | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index f8d21dee99..7677417c99 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -82,6 
+82,12 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } void Net::train(optimizer* opt, bool need_validate) { +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#else + unsigned myID = 0; +#endif + galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); @@ -91,7 +97,8 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint("Epoch ", std::setw(2), i, std::fixed, std::setprecision(3), ":"); + galois::gPrint("[", myID, "] Epoch ", std::setw(2), i, std::fixed, + std::setprecision(3), "\n"); t_epoch.Start(); // training steps @@ -121,7 +128,7 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - galois::gPrint("train_loss = ", std::setw(5), train_loss, " train_acc = ", + galois::gPrint("[", myID, "] train_loss = ", std::setw(5), train_loss, " train_acc = ", std::setw(5), train_acc, "\n"); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); @@ -132,12 +139,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - galois::gPrint(" val_loss = ", std::setw(5), val_loss, " val_acc = ", + galois::gPrint("[", myID, "] val_loss = ", std::setw(5), val_loss, " val_acc = ", std::setw(5), val_acc, "\n"); - galois::gPrint(" time = ", epoch_time + val_time, " ms (train_time = ", + galois::gPrint("[", myID, "] time = ", epoch_time + val_time, " ms (train_time = ", epoch_time, " val_time = ", val_time, ")\n"); } else { - galois::gPrint(" train_time = ", epoch_time, " ms\n"); + galois::gPrint("[", myID, "] train_time = ", epoch_time, " ms\n"); } } } From f7b48605f2537a0cb5b8d1ae9065929b140b3bb7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 12:53:58 -0500 Subject: [PATCH 168/660] shared memory context return graph pointer --- libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/src/context.cpp | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b765515e50..b5822a5555 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -46,6 +46,8 @@ class Context { Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); + //! 
returns pointer to the graph + Graph* getGraphPointer(); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9206b1cc1a..404b8fef7f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -89,6 +89,10 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } +Graph* Context::getGraphPointer() { + return Context::graph_cpu; +} + float_t* Context::get_in_ptr() { return &h_feats[0]; } void Context::norm_factor_counting() { From 2bbbcde18139727ef32b892c49e473b815c39517 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 12:54:24 -0500 Subject: [PATCH 169/660] dist context, return graph pointer --- libdeepgalois/include/deepgalois/DistContext.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 4baaaae8a9..704247d54b 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,6 +39,10 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); + Graph* getGraphPointer() { + return graph_cpu; + } + //! return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { From 03412cd81f3ec20fe8346ed3b7624d705bd9c18c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 27 Mar 2020 13:03:01 -0500 Subject: [PATCH 170/660] semblance of unified accuracy (masked_accuracy made distributed) --- libdeepgalois/include/deepgalois/gtypes.h | 3 ++ libdeepgalois/include/deepgalois/net.h | 5 ++- libdeepgalois/src/net.cpp | 53 +++++++++++++++++++++-- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 5dc08fc99e..dfc2e1d8c6 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -12,6 +12,9 @@ typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; +#ifdef GALOIS_USE_DIST +using AccuracyAccum = galois::DGAccumulator; +#endif #ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index f905d2a595..74cf3f6058 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -122,7 +122,7 @@ class Net { Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); t_eval.Stop(); return t_eval.Millisecs(); } @@ -142,7 +142,8 @@ class Net { size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* dGraph); }; } // namespace deepgalois diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7677417c99..8991f779c5 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -67,6 +67,9 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } //std::cout << "Done\n"; + // NOTE: train_begin/train_end are global IDs, train_mask is a local id + // 
train count and val count are LOCAL counts + num_layers = NUM_CONV_LAYERS + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); @@ -111,7 +114,7 @@ void Net::train(optimizer* opt, bool need_validate) { train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0]); // predict + &train_mask[0], context->getGraphPointer()); // predict Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -160,18 +163,60 @@ void Net::construct_layers() { } #ifdef CPU_ONLY -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { +/** + * + * @param begin GLOBAL begin + * @param end GLOBAL end + * @param count GLOBAL training count + */ +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* dGraph) { +#ifndef GALOIS_USE_DIST AccumF accuracy_all; +#else + AccuracyAccum accuracy_all; + galois::DGAccumulator sampleCount; + sampleCount.reset(); +#endif + accuracy_all.reset(); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { +#ifndef GALOIS_USE_DIST if (masks[i] == 1) { + // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + // check prediction if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; } +#else + // only look at owned nodes (i.e. masters); the prediction for these + // should only by handled on the owner + if (dGraph->isOwned(i)) { + sampleCount += 1; + + uint32_t localID = dGraph->getLID(i); + if (masks[localID] == 1) { + // get prediction + int preds = argmax(num_classes, + &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[localID * num_classes])); + // check prediction + if ((label_t)preds == context->get_label(localID)) + accuracy_all += 1.0; + } + } +#endif }, - galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + galois::loopname("getMaskedLoss")); + +#ifdef GALOIS_USE_DIST + count = sampleCount.reduce(); + galois::gDebug("sample count is ", count); +#endif + + // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; } #endif From 4315ebc546f82633ccb65c27436b80078580fbdb Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 16 Apr 2020 19:39:46 -0500 Subject: [PATCH 171/660] add sampler --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/sampler.h | 7 +++++++ libdeepgalois/src/sampler.cpp | 4 ++++ 3 files changed, 12 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/sampler.h create mode 100644 libdeepgalois/src/sampler.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index b625c317e3..e8ff6e420a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -76,6 +76,7 @@ set(sources src/math_functions.cpp src/optimizer.cpp src/context.cpp + src/sampler.cpp src/node.cpp src/net.cpp ) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h new file mode 100644 index 0000000000..079a84d415 --- /dev/null +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -0,0 +1,7 @@ +#pragma once +#include "deepgalois/gtypes.h" + +void subgraph_sampler(Graph &g, Graph &sg); +galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); +Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } +Graph::edge_iterator 
sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); }
diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp
new file mode 100644
index 0000000000..5077d5b756
--- /dev/null
+++ b/libdeepgalois/src/sampler.cpp
@@ -0,0 +1,4 @@
+#include "deepgalois/sampler.h"
+
+void subgraph_sampler(Graph &g, Graph &sg) {
+}
From b2c17a8498e69c4647812a988f1c64ba9d64246e Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Thu, 16 Apr 2020 21:55:47 -0500
Subject: [PATCH 172/660] update sampler

---
 libdeepgalois/src/sampler.cpp | 111 +++++++++++++++++++++++++++++++++-
 1 file changed, 110 insertions(+), 1 deletion(-)

diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp
index 5077d5b756..98dac8c75c 100644
--- a/libdeepgalois/src/sampler.cpp
+++ b/libdeepgalois/src/sampler.cpp
@@ -1,4 +1,113 @@
 #include "deepgalois/sampler.h"
+#include <ctime>
+#include <cstdlib>
 
-void subgraph_sampler(Graph &g, Graph &sg) {
+// select k vertices from the range [begin, end) by reservoir sampling
+static std::vector<GNode> selectVertex(GNode begin, GNode end, size_t k) {
+  auto i = begin;
+
+  // reservoir[] is the output array. Initialize
+  // it with the first k vertices
+  std::vector<GNode> reservoir(k);
+  for (; i < k; i++) reservoir[i] = i;
+
+  // Use a different seed value so that we don't get
+  // the same result each time we run this program
+  srand(time(NULL));
+
+  // Iterate from the (k+1)th element to the nth element
+  for (; i < end; i++) {
+    // Pick a random index from 0 to i.
+    auto j = rand() % (i + 1);
+
+    // If the randomly picked index is smaller than k,
+    // then replace the element present at that index
+    // with the new element from the stream
+    if (j < k) reservoir[j] = i;
+  }
+  return reservoir;
+}
+
+// Utility function to find the ceiling of r in arr[l..h]
+int findCeil(const std::vector<unsigned>& arr, unsigned r, unsigned l, unsigned h) {
+  unsigned mid;
+  while (l < h) {
+    mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2
+    (r > arr[mid]) ? (l = mid + 1) : (h = mid);
+  }
+  return (arr[l] >= r) ? l : -1;
+}
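+
+// Worked example (hypothetical numbers, for illustration only): for the frequencies
+// dist = {2, 1, 3}, the prefix sums computed below are offsets = {2, 3, 6}. A random
+// r drawn from [1, 6] then maps to index 0 when r <= 2, to index 1 when r == 3, and
+// to index 2 when r >= 4, i.e. each index is chosen with probability dist[i] / 6;
+// findCeil locates that index by binary-searching for the smallest offset that is >= r.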
+
+// select one element from n elements given a frequency (probability) distribution
+// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/
+size_t selectOneVertex(size_t n, const std::vector<unsigned>& dist) {
+  std::vector<unsigned> offsets(n);
+  offsets[0] = dist[0];
+  // compute the prefix sum of the distribution
+  for (size_t i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i];
+  // offsets[n-1] is the sum of all frequencies
+  unsigned sum = offsets[n-1];
+  unsigned r = (rand() % sum) + 1;
+  // find which range r falls into,
+  // and return the index of the range
+  return findCeil(offsets, r, 0, n - 1);
+}
+
+inline unsigned getDegree(Graph &g, GNode v) {
+  return std::distance(g.edge_begin(v), g.edge_end(v));
+}
+
+void generate_subgraph(std::set<GNode> &vertex_set, Graph &g, Graph &sub) {
+  auto nv = vertex_set.size();
+  size_t ne = 0;
+  std::vector<unsigned> offsets(nv+1);
+  offsets[0] = 0;
+  size_t i = 0;
+  std::vector<GNode> vertices(nv);
+  for (auto v : vertex_set) {
+    vertices[i] = v;
+    offsets[i+1] = offsets[i] + getDegree(g, v);
+    i++;
+  }
+  ne = offsets[nv]; // total degree of the selected vertices (upper bound on subgraph edges)
+  // TODO: remove edges whose endpoints do not belong to the selected vertex subset
+  sub.allocateFrom(nv, ne);
+  sub.constructNodes();
+  for (i = 0; i < nv; i++) {
+    sub.fixEndEdge(i, offsets[i+1]);
+    for (unsigned offset = 0; offset < offsets[i+1]-offsets[i]; offset ++) {
+      sub.constructEdge(offsets[i]+offset, g.getEdgeDst(g.edge_begin(vertices[i])+offset), 0);
+    }
+  }
+}
+
+// generate a subgraph sg with size n from the input graph g
+// n: number of vertices in the subgraph
+// m: number of vertices in the frontier
+void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) {
+  auto num_vertices = g.size(); // number of vertices in the original input graph
+  auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier
+  std::set<GNode> vertex_set;
+  for (size_t i = 0; i < m; i++)
+    vertex_set.insert(frontier[i]);
+  std::vector<unsigned> degrees(m);
+  //std::vector<float> probabilities(m);
+  //unsigned sum_degree = 0;
+  for (size_t i = 0; i < m; i++) {
+    degrees[i] = getDegree(g, frontier[i]);
+    //sum_degree += degrees[i];
+  }
+  for (size_t i = 0; i < n - m; i++) {
+    //for (size_t i = 0; i < m; i++)
+    //  probabilities[i] = (float)degrees[i] / (float)sum_degree;
+    auto pos = selectOneVertex(m, degrees);
+    GNode u = frontier[pos];
+    auto degree = degrees[pos];
+    auto neighbor_id = rand() % degree;
+    frontier[pos] = g.getEdgeDst(g.edge_begin(u) + neighbor_id);
+    degrees[pos] = getDegree(g, frontier[pos]);
+    //sum_degree -= degree;
+    //sum_degree += degrees[pos];
+    vertex_set.insert(u);
+  }
+  generate_subgraph(vertex_set, g, sg);
 }
From e292fbb556511747fb7c8526a9273d2489ad2add Mon Sep 17 00:00:00 2001
From: chenxuhao
Date: Fri, 17 Apr 2020 20:51:16 -0500
Subject: [PATCH 173/660] add sigmoid

---
 libdeepgalois/CMakeLists.txt                  |  1 +
 .../deepgalois/layers/sigmoid_loss_layer.h    | 18 +++++
 .../include/deepgalois/math_functions.hh      | 11 +++-
 .../src/layers/sigmoid_loss_layer.cpp         | 65 +++++++++++++++++++
 libdeepgalois/src/math_functions.cpp          | 25 ++++---
 5 files changed, 109 insertions(+), 11 deletions(-)
 create mode 100644 libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h
 create mode 100644 libdeepgalois/src/layers/sigmoid_loss_layer.cpp

diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt
index e8ff6e420a..f92d8950a9 100644
--- a/libdeepgalois/CMakeLists.txt
+++ b/libdeepgalois/CMakeLists.txt
@@ -71,6 +71,7 @@ else()
   set(sources
     src/layers/graph_conv_layer.cpp
src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp src/layers/aggregator.cpp src/layers/layer.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h new file mode 100644 index 0000000000..31bab85daa --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -0,0 +1,18 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +class sigmoid_loss_layer : public layer { +public: + sigmoid_loss_layer(unsigned level, std::vector in_dims, + std::vector out_dims); + ~sigmoid_loss_layer() {} + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); + virtual acc_t get_masked_loss(); +}; +} diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index ffc0343438..7aa388ab13 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -39,15 +39,22 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + +// Loss function for single-class label (one-hot) data: softmax void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, - const float_t* dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); + float_t cross_entropy(const vec_t& y, const vec_t& p); float_t cross_entropy(size_t n, const float_t* y, const float_t* p); void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); + +// Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); + //! 
copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); // single-precision dense matrix multiply diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp new file mode 100644 index 0000000000..220d3da102 --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -0,0 +1,65 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" + +namespace deepgalois { + +#ifdef CPU_ONLY +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + loss = new float_t[in_dims[0]]; // error for each sample +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t len = input_dims[1]; + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + // output is normalized input for this layer + math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot TODO: modify for multi-class label + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t len = layer::input_dims[1]; + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); +} + +acc_t sigmoid_loss_layer::get_masked_loss() { + assert(count_ > 0); + AccumF total_loss; + AccumU valid_sample_count; + total_loss.reset(); + valid_sample_count.reset(); + galois::do_all(galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); + assert(valid_sample_count.reduce() == count_); + return total_loss.reduce() / (acc_t)count_; +} +#endif + +} // namespace diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6b383e4b78..58e3543652 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -501,17 +501,24 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } - - -float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +// use sigmoid instead of softmax for multi-class datasets, e.g. 
ppi, yelp and amazon +// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } // Sigmoid -void sigmoid(vec_t& fv) { - size_t count = fv.size(); - for (size_t i = 0; i < count; ++i) { - fv[i] = sigmoid_func(fv[i]); - } +void sigmoid(const vec_t& input, vec_t &output) { + for (size_t i = 0; i < input.size(); ++i) + output[i] = sigmoid_func(input[i]); } +void sigmoid(size_t n, const float_t* input, float_t* output) { + for (int i=0; i< n; i++) { + output[i] = 1. / (1. + expf(-input[i])); + } +} - +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + for (int i=0; i< n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} From fdd84d3c5a22a7875a37f540aa74d5ec2b559659 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 18 Apr 2020 17:51:35 -0500 Subject: [PATCH 174/660] fix bug --- libdeepgalois/include/deepgalois/context.h | 2 + .../include/deepgalois/math_functions.hh | 2 + libdeepgalois/include/deepgalois/net.h | 10 ++++- libdeepgalois/src/context.cpp | 1 + libdeepgalois/src/math_functions.cpp | 44 ++++++++++--------- libdeepgalois/src/net.cpp | 28 +++++++----- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/lonestargnn.h | 4 +- 8 files changed, 57 insertions(+), 36 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b5822a5555..754b7a8491 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -32,10 +32,12 @@ class Context { size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); + //void set_label_class(bool is_single = true) { is_single_class = is_single; } size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D + //bool is_single_class; // single-class (one-hot) or multi-class label std::vector labels; // labels for classification: N x 1 label_t* d_labels; // labels on device vec_t h_feats; // input features: N x D diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 7aa388ab13..46f571ac35 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -52,7 +52,9 @@ void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid +void sigmoid(const vec_t& input, vec_t& output); void sigmoid(size_t n, const float_t* input, float_t* output); +void d_sigmoid(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); //! 
copy vector from in -> out; first len elements diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 74cf3f6058..e29e1863ff 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -10,6 +10,7 @@ #include "deepgalois/gtypes.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" @@ -31,7 +32,8 @@ class Net { public: Net() {} #ifndef GALOIS_USE_DIST - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop); + void init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, bool is_single = true); #else void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, Graph* dGraph); @@ -79,7 +81,10 @@ class Net { in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); connect(layers[layer_id - 1], layers[layer_id]); } @@ -133,6 +138,7 @@ class Net { #else deepgalois::DistContext* context; #endif + bool is_single_class; // single-class (one-hot) or multi-class label size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 404b8fef7f..52db06ca62 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -18,6 +18,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); + graph_cpu = new Graph(); if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 58e3543652..cdde9cc964 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -261,6 +261,29 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { d[i] = -y[i] / (p[i] + float_t(1e-10)); } } + +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and amazon +// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } +inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } + +// Sigmoid +void sigmoid(const vec_t& in, vec_t &out) { + for (size_t i = 0; i < in.size(); ++i) + out[i] = sigmoid_func(in[i]); +} + +void sigmoid(size_t n, const float_t* in, float_t* out) { + for (size_t i = 0; i < n; i++) { + out[i] = 1. / (1. 
+ expf(-in[i])); + } +} + +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + for (size_t i = 0; i < n; i++) { + dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); + } +} + void copy1D1D(const vec_t& in, vec_t& out) { std::copy(in.begin(), in.end(), &out[0]); } @@ -501,24 +524,3 @@ float reduce_mean(const vec_t& x) { return sum / (float)n; } -// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and amazon -// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } -inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } - -// Sigmoid -void sigmoid(const vec_t& input, vec_t &output) { - for (size_t i = 0; i < input.size(); ++i) - output[i] = sigmoid_func(input[i]); -} - -void sigmoid(size_t n, const float_t* input, float_t* output) { - for (int i=0; i< n; i++) { - output[i] = 1. / (1. + expf(-input[i])); - } -} - -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { - for (int i=0; i< n; i++) { - dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); - } -} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 8991f779c5..19a3508ebf 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -7,12 +7,14 @@ namespace deepgalois { #ifndef GALOIS_USE_DIST -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop) { +void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, + bool selfloop, bool is_single) { #else void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, Graph* dGraph) { #endif #ifndef GALOIS_USE_DIST + is_single_class = is_single; context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); #else @@ -87,8 +89,13 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; #else - unsigned myID = 0; + //std::string header = "[" + std::to_string(0) + "] "; + //std::string seperator = "\n"; + std::string header = ""; + std::string seperator = " "; #endif galois::gPrint("\nStart training...\n"); @@ -100,8 +107,7 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint("[", myID, "] Epoch ", std::setw(2), i, std::fixed, - std::setprecision(3), "\n"); + galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); t_epoch.Start(); // training steps @@ -131,8 +137,8 @@ void Net::train(optimizer* opt, bool need_validate) { // validation / testing set_netphases(net_phase::test); - galois::gPrint("[", myID, "] train_loss = ", std::setw(5), train_loss, " train_acc = ", - std::setw(5), train_acc, "\n"); + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, train_loss, + " train_acc ", train_acc, seperator); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); if (need_validate) { @@ -142,12 +148,12 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], val_loss, val_acc); Tval.stop(); - galois::gPrint("[", myID, "] val_loss = ", std::setw(5), val_loss, " val_acc = ", - std::setw(5), val_acc, "\n"); - galois::gPrint("[", myID, "] time = ", epoch_time + val_time, " ms (train_time = ", - 
epoch_time, " val_time = ", val_time, ")\n"); + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, + " val_acc ", val_acc, seperator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, + " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); } else { - galois::gPrint("[", myID, "] train_time = ", epoch_time, " ms\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); } } } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index d688258cd3..e23097befe 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -25,7 +25,7 @@ int main(int argc, char** argv) { #ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1, add_selfloop); + network.init(dataset, epochs, hidden1, add_selfloop, is_single_class); #else network.init(dataset, epochs, hidden1, add_selfloop, dGraph); #endif diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index e932738636..7e2c3ec589 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -52,7 +52,9 @@ static cll::opt max_degree( static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt is_single_class("sc", + cll::desc("single-class or multi-class label (default single)"), cll::init(1)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 20b7a0985aeb521ccdd6c1f47f43574df1d0d00c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 18 Apr 2020 23:23:15 -0500 Subject: [PATCH 175/660] use binary input feature --- libdeepgalois/include/deepgalois/context.h | 27 ++++---- libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 4 +- libdeepgalois/src/context.cpp | 67 +++++++++++++------ .../src/layers/sigmoid_loss_layer.cpp | 12 ++-- libdeepgalois/src/net.cpp | 1 + 6 files changed, 70 insertions(+), 43 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 754b7a8491..a2407bd478 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -23,25 +23,20 @@ class Context { size_t read_graph(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str); - label_t get_label(size_t i) { return labels[i]; } - label_t* get_labels_ptr(size_t i) { return &(labels[0]); } + size_t read_features(std::string dataset_str, std::string filetype = "bin"); + label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label + label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + label_t* get_labels_ptr(size_t i) { return labels; } float_t* get_in_ptr(); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); - //void set_label_class(bool is_single = true) { is_single_class = is_single; } + void set_label_class(bool is_single = true) { is_single_class = is_single; } - size_t n; // number of 
samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - //bool is_single_class; // single-class (one-hot) or multi-class label - std::vector labels; // labels for classification: N x 1 - label_t* d_labels; // labels on device - vec_t h_feats; // input features: N x D float_t* d_feats; // input features on device + label_t* d_labels; // labels on device float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY @@ -55,12 +50,16 @@ class Context { inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } + inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif protected: + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + label_t *labels; // labels for classification: N x 1 + float_t* h_feats; // input features: N x D #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 3c3c7ce747..e7600b4605 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -18,7 +18,7 @@ typedef std::vector typedef std::vector FV; // feature vector typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type -typedef short label_t; // label is for classification (supervised learning) +typedef uint8_t label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test typedef uint32_t VertexID; diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index ad33285879..8279dca8e8 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -110,7 +110,7 @@ inline bool bernoulli(float_t p) { inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { std::cout << "Dataset currently not supported\n"; exit(1); } @@ -145,7 +145,7 @@ inline size_t read_masks(std::string dataset_str, std::string mask_type, inline size_t read_masks(std::string dataset_str, std::string mask_type, size_t& begin, size_t& end, std::vector& masks, Graph* dGraph) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed") { + if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { std::cout << "Dataset currently not supported\n"; exit(1); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 52db06ca62..8ffaacb8b6 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -94,7 +94,7 @@ Graph* Context::getGraphPointer() { return Context::graph_cpu; } -float_t* Context::get_in_ptr() { 
return &h_feats[0]; } +float_t* Context::get_in_ptr() { return h_feats; } void Context::norm_factor_counting() { norm_factor = new float_t[n]; @@ -123,16 +123,23 @@ size_t Context::read_labels(std::string dataset_str) { size_t m; // m: number of samples in >> m >> num_classes >> std::ws; assert(m == n); - labels.resize(m, 0); // label for each vertex: N x 1 + if (is_single_class) + labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + else + labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); unsigned x; for (size_t idx = 0; idx < num_classes; ++idx) { label_stream >> x; - if (x != 0) { - labels[v] = idx; - break; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v*num_classes+idx] = x; } } v++; @@ -142,36 +149,56 @@ size_t Context::read_labels(std::string dataset_str) { // print the number of vertex classes std::cout << "Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; return num_classes; } //! Read features, return the length of a feature vector //! Features are stored in the Context class -size_t Context::read_features(std::string dataset_str) { +size_t Context::read_features(std::string dataset_str, std::string filetype) { + //filetype = "txt"; std::cout << "Reading features ... "; Timer t_read; t_read.Start(); + size_t m; // m = number of vertices std::string filename = path + dataset_str + ".ft"; std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m = number of vertices - in >> m >> feat_len >> std::ws; - // assert(m == ); - h_feats.resize(m * feat_len, 0); - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u * feat_len + v] = w; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + h_feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary|std::ios::in); + in.read((char*)h_feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + h_feats[u * feat_len + v] = w; + } } in.close(); t_read.Stop(); std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 6; i ++) + //for (auto j = 0; j < 6; j ++) + //std::cout << "feats[" << i << "][" << j << "] = " << h_feats[i*feat_len+j] << "\n"; return feat_len; } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 220d3da102..30c01d846c 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -19,10 +19,10 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou // output is normalized input for this 
layer math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot TODO: modify for multi-class label + acc_t *ground_truth = new acc_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + loss[i] = math::cross_entropy(len, ground_truth, &out_data[len*i]); } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -33,10 +33,10 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; + acc_t *ground_truth = new acc_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + math::d_cross_entropy(len, ground_truth, &out_data[len * i], &norm_grad[0]); // derviative sigmoid to gradient used in the next layer math::d_sigmoid(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 19a3508ebf..45e79f4cf4 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -16,6 +16,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, #ifndef GALOIS_USE_DIST is_single_class = is_single; context = new deepgalois::Context(); + context->set_label_class(is_single); num_samples = context->read_graph(dataset_str, selfloop); #else context = new deepgalois::DistContext(); From 8f9d86dc55b0ba72d1ce764c47dede9f1189d40e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 14:59:54 -0500 Subject: [PATCH 176/660] add multi-class accuracy --- libdeepgalois/CMakeLists.txt | 2 + libdeepgalois/include/deepgalois/context.h | 2 +- libdeepgalois/include/deepgalois/net.h | 58 ++------ libdeepgalois/include/deepgalois/utils.h | 82 ++---------- libdeepgalois/src/context.cpp | 7 +- .../src/layers/sigmoid_loss_layer.cpp | 14 +- libdeepgalois/src/net.cpp | 61 ++++++++- libdeepgalois/src/utils.cpp | 124 ++++++++++++++++++ 8 files changed, 216 insertions(+), 134 deletions(-) create mode 100644 libdeepgalois/src/utils.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index f92d8950a9..58538bb8b0 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -64,6 +64,7 @@ set(sources src/math_functions.cpp src/optimizer.cpp src/DistContext.cpp + src/utils.cpp src/node.cpp src/net.cpp ) @@ -78,6 +79,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/utils.cpp src/node.cpp src/net.cpp ) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index a2407bd478..b73de071cc 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -26,7 +26,7 @@ class Context { size_t read_features(std::string dataset_str, std::string filetype = "bin"); label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // 
multi-class label - label_t* get_labels_ptr(size_t i) { return labels; } + label_t* get_labels_ptr() { return labels; } float_t* get_in_ptr(); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index e29e1863ff..dcd538e7f6 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -41,8 +41,17 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void train(optimizer* opt, bool need_validate); // training void construct_layers(); + void append_out_layer(size_t layer_id); + void train(optimizer* opt, bool need_validate); // training + double evaluate(size_t begin, size_t end, size_t count, + mask_t* masks, acc_t& loss, acc_t& acc); // inference + + //! Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true, + float_t dropout_rate = 0.5); + //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) @@ -59,35 +68,6 @@ class Net { layers[i]->print_layer_info(); } - //! Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true, - float_t dropout_rate = 0.5) { - assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); - if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); - } - - //! Add an output layer to the network - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - connect(layers[layer_id - 1], layers[layer_id]); - } - //! forward propagation: [begin, end) is the range of samples used. //! calls "forward" on the layers of the network and returns the loss of the //! final layer @@ -120,18 +100,6 @@ class Net { } } - // evaluate, i.e. 
inference or predict - double evaluate(size_t begin, size_t end, size_t count, mask_t* masks, - acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case - Timer t_eval; - t_eval.Start(); - loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); - t_eval.Stop(); - return t_eval.Millisecs(); - } - protected: #ifndef GALOIS_USE_DIST deepgalois::Context* context; @@ -143,13 +111,15 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 unsigned num_epochs; // number of epochs + std::vector feature_dims; // feature dimnesions for each layer std::vector train_mask, val_mask; // masks for traning and validation size_t train_begin, train_end, train_count, val_begin, val_end, val_count; std::vector layers; // all the layers in the neural network + // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* dGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 8279dca8e8..b7a84bb10a 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -9,6 +9,8 @@ #include #ifdef GALOIS_USE_DIST #include "deepgalois/gtypes.h" +#else +#include "deepgalois/types.h" #endif namespace deepgalois { @@ -103,80 +105,14 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) <= p; } +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, + size_t num_classes, label_t *ground_truth, float_t *pred); -#ifndef GALOIS_USE_DIST -//! Get masks from datafile where first line tells range of -//! set to create mask from -inline size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} +#ifdef GALOIS_USE_DIST +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks, Graph* dGraph); #else -//! Get masks from datafile where first line tells range of -//! 
set to create mask from; needs graph object due to local IDs -inline size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks, Graph* dGraph) { - if (dataset_str != "citeseer" && dataset_str != "cora" && dataset_str != "pubmed" && dataset_str != "flickr") { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - // only bother if it's local - if (dGraph->isLocal(i)) { - masks[dGraph->getLID(i)] = 1; - sample_count++; - } - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; - in.close(); - return sample_count; -} +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks); #endif - } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8ffaacb8b6..daf83a6b24 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -123,10 +123,13 @@ size_t Context::read_labels(std::string dataset_str) { size_t m; // m: number of samples in >> m >> num_classes >> std::ws; assert(m == n); - if (is_single_class) + if (is_single_class) { + std::cout << "Using single-class (one-hot) labels\n"; labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 - else + } else { + std::cout << "Using multi-class labels\n"; labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + } unsigned v = 0; while (std::getline(in, line)) { std::istringstream label_stream(line); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 30c01d846c..5a9508c1aa 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -49,14 +49,12 @@ acc_t sigmoid_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 45e79f4cf4..56cd0c2dfc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -120,8 +120,13 @@ void Net::train(optimizer* opt, bool need_validate) { Tfw.start(); train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward - train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + if (is_single_class) { + train_acc = masked_accuracy(train_begin, train_end, train_count, + &train_mask[0], 
context->getGraphPointer()); // predict + } else { + train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, + &train_mask[0], context->getGraphPointer()); // predict + } Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -159,6 +164,18 @@ void Net::train(optimizer* opt, bool need_validate) { } } +// evaluate, i.e. inference or predict +double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, + acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case + Timer t_eval; + t_eval.Start(); + loss = fprop(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + t_eval.Stop(); + return t_eval.Millisecs(); +} + void Net::construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer @@ -169,6 +186,34 @@ void Net::construct_layers() { set_contexts(); } +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + connect(layers[layer_id - 1], layers[layer_id]); +} + +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout, float_t dropout_rate) { + assert(dropout_rate < 1.0); + assert(layer_id < NUM_CONV_LAYERS); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); +} + #ifdef CPU_ONLY /** * @@ -176,8 +221,7 @@ void Net::construct_layers() { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* dGraph) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -215,8 +259,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } } #endif - }, - galois::loopname("getMaskedLoss")); + }, galois::loopname("getMaskedLoss")); #ifdef GALOIS_USE_DIST count = sampleCount.reduce(); @@ -226,6 +269,12 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; } + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { + auto preds = layers[NUM_CONV_LAYERS - 1]->next()->get_data(); + auto ground_truth = context->get_labels_ptr(); + return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); +} #endif } // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp new file mode 100644 index 0000000000..d43bec403d --- /dev/null +++ b/libdeepgalois/src/utils.cpp @@ -0,0 +1,124 @@ +#include "galois/Galois.h" +#include "deepgalois/utils.h" + +namespace deepgalois { + +#define NUM_DATASETS 8 +const 
std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, + size_t num_classes, label_t *ground_truth, float_t *pred) { + std::vector true_positive(num_classes, 0); + std::vector false_positive(num_classes, 0); + galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + if (masks[i] == 1) { + for (size_t j = 0; j < num_classes; j++) { + auto idx = i * num_classes + j; + if (ground_truth[idx] == 1 && pred[idx] > 0.5) { + true_positive[j] ++; + } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { + false_positive[j] ++; + } + } + } + }, galois::loopname("MaskedF1Score")); + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + for (size_t i = 0; i < num_classes; i++) { + auto fp = false_positive[i]; // false positive + auto tp = true_positive[i]; // true positive + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + } + acc_t precisionMicro = pNumerator / pDenominator; + return precisionMicro; +} + +#ifndef GALOIS_USE_DIST +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, std::vector& masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)masks.size()*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} +#else +size_t read_masks(std::string dataset_str, std::string mask_type, + size_t& begin, size_t& end, + std::vector& masks, Graph* dGraph) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "\n"; + in.close(); + return sample_count; +} +#endif + +} From 966dc071c63b43d6e8f6e6ce1076774a93f7f759 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 18:51:35 -0500 Subject: [PATCH 177/660] 
fix f1 --- .../deepgalois/layers/softmax_loss_layer.h | 2 +- libdeepgalois/src/context.cpp | 6 +- .../src/layers/sigmoid_loss_layer.cpp | 21 +++--- .../src/layers/softmax_loss_layer.cpp | 72 +++++++++---------- .../src/layers/softmax_loss_layer.cu | 4 ++ libdeepgalois/src/net.cpp | 6 +- libdeepgalois/src/utils.cpp | 26 ++++++- 7 files changed, 88 insertions(+), 49 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 798ad7a79a..7194d06f2e 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -6,7 +6,7 @@ class softmax_loss_layer : public layer { public: softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~softmax_loss_layer() {} + ~softmax_loss_layer(); std::string layer_type() const override { return std::string("softmax_loss"); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index daf83a6b24..8d6616182f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,7 +8,11 @@ namespace deepgalois { #ifdef CPU_ONLY Context::Context() {} -Context::~Context() {} +Context::~Context() { + if (labels) delete labels; + if (h_feats) delete h_feats; + if (norm_factor) delete norm_factor; +} size_t Context::read_graph(std::string dataset_str, bool selfloop) { n = read_graph_cpu(dataset_str, "gr", selfloop); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 5a9508c1aa..57264976db 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -16,13 +16,15 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (masks_[i] == 1) { // masked + size_t idx = len * i; // output is normalized input for this layer - math::sigmoid(len, &in_data[len*i], &out_data[len*i]); // normalize using sigmoid + math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid // one hot encoded vector for the labels - acc_t *ground_truth = new acc_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); + float_t *ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[len*i]); + loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); + delete ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -32,13 +34,16 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - acc_t *ground_truth = new acc_t[len]; + size_t idx = len * i; + float_t *norm_grad = new float_t[len]; + float_t *ground_truth = new float_t[len]; for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, ground_truth, &out_data[len * i], &norm_grad[0]); + math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer - math::d_sigmoid(len, 
&in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); + math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); + delete norm_grad; + delete ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 9b64a0d353..7c5b11d233 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -12,27 +12,29 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, loss = new float_t[in_dims[0]]; // error for each sample } +softmax_loss_layer::~softmax_loss_layer() { + delete loss; +} + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); - } - }, galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-fw")); + galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + // output is normalized input for this layer + math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax + // one hot encoded vector for the labels + std::vector groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[context->get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-fw")); - // no sync required in distributed execution since no graph topology used - // in this forward pass; only a post-process pretty much + // no sync required in distributed execution since no graph topology used + // in this forward pass; only a post-process pretty much } void softmax_loss_layer::back_propagation(const float_t* in_data, @@ -40,20 +42,18 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); - } - }, galois::chunk_size(), galois::steal(), - galois::loopname("softmax-loss-bw")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[context->get_label(i)] = 1.0; + // use ground truth to determine 
derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); // no weight sync required: this is all local graph information } @@ -64,14 +64,12 @@ acc_t softmax_loss_layer::get_masked_loss() { AccumU valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("getMaskedLoss")); + galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { + if (masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index e9216b1ae2..6ed45bc98e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -41,6 +41,10 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, float_malloc_device(in_dims[0], loss); } +softmax_loss_layer::~softmax_loss_layer() { + float_free_device(loss); +} + void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 56cd0c2dfc..340b77650a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -171,7 +171,11 @@ double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + if (is_single_class) { + acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + } else { + acc = masked_multi_class_accuracy(begin, end, count, masks, context->getGraphPointer()); + } t_eval.Stop(); return t_eval.Millisecs(); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index d43bec403d..72d6560aca 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -6,10 +6,24 @@ namespace deepgalois { #define NUM_DATASETS 8 const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +// Compute the F1 score, also known as balanced F-score or F-measure +// The F1 score can be interpreted as a weighted average of the precision and recall, +// where an F1 score reaches its best value at 1 and worst score at 0. +// The relative contribution of precision and recall to the F1 score are equal. +// The formula for the F1 score is: +// F1 = 2 * (precision * recall) / (precision + recall) +// where precision = TP / (TP + FP), recall = TP / (TP + FN) +// TP: true positive; FP: false positive; FN: false negtive. +// In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. 
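// As a worked example with made-up counts (purely illustrative, not taken from any dataset used here):
// suppose two classes where class 0 has TP=8, FP=2, FN=4 and class 1 has TP=3, FP=1, FN=2.
// Micro-averaging pools the per-class counts before forming the ratios:
//   precision_micro = (8+3) / (8+3+2+1) = 11/14 ~= 0.786
//   recall_micro    = (8+3) / (8+3+4+2) = 11/17 ~= 0.647
//   F1_micro        = 2 * 0.786 * 0.647 / (0.786 + 0.647) ~= 0.71
// This is the micro-averaged F1 computed below (with beta = 1).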
+// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) +// and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { + float beta = 1; std::vector true_positive(num_classes, 0); std::vector false_positive(num_classes, 0); + std::vector false_negtive(num_classes, 0); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { for (size_t j = 0; j < num_classes; j++) { @@ -18,20 +32,30 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, true_positive[j] ++; } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { false_positive[j] ++; + } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { + false_negtive[j] ++; } } } }, galois::loopname("MaskedF1Score")); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { + auto fn = false_negtive[i]; // false negtive auto fp = false_positive[i]; // false positive auto tp = true_positive[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); } + auto recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - return precisionMicro; + auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + return fscoreMicro; } #ifndef GALOIS_USE_DIST From 20a7e5224b63431bc3e11936916bc95ffca763da Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Apr 2020 19:52:45 -0500 Subject: [PATCH 178/660] add scripts --- libdeepgalois/scripts/run-multi.sh | 33 ++++++++++++++++++++++++++++ libdeepgalois/scripts/run-single.sh | 33 ++++++++++++++++++++++++++++ libdeepgalois/scripts/test-multi.sh | 1 + libdeepgalois/scripts/test-single.sh | 1 + libdeepgalois/src/net.cpp | 6 +++++ lonestargnn/include/lonestargnn.h | 6 ++--- 6 files changed, 77 insertions(+), 3 deletions(-) create mode 100755 libdeepgalois/scripts/run-multi.sh create mode 100755 libdeepgalois/scripts/run-single.sh create mode 100755 libdeepgalois/scripts/test-multi.sh create mode 100755 libdeepgalois/scripts/test-single.sh diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh new file mode 100755 index 0000000000..660fac74b3 --- /dev/null +++ b/libdeepgalois/scripts/run-multi.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +#GRAPHS="ppi yelp amazon" +GRAPHS="ppi" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 64 128" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh new file mode 100755 index 0000000000..9c0d9fcb63 --- /dev/null +++ b/libdeepgalois/scripts/run-single.sh @@ -0,0 +1,33 @@ +#!/bin/bash + +GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp +LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn +GNNS="gcn" +GRAPHS="cora citeseer pudmed flickr reddit" +#GRAPHS="cora" +EPOCHS="200" +NTHREADS="56" +DROPOUT="0.1 0.2 0.3 0.5" +LEARNINGRATES="0.01" +HIDDENDIM="16 32 64 128 256 512" +OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois + +for GNN in $GNNS; do + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done +done diff --git a/libdeepgalois/scripts/test-multi.sh b/libdeepgalois/scripts/test-multi.sh new file mode 100755 index 0000000000..a67bd047a8 --- /dev/null +++ b/libdeepgalois/scripts/test-multi.sh @@ -0,0 +1 @@ +./gcn ppi -k=20 -t=14 -sc=0 -h=128 diff --git a/libdeepgalois/scripts/test-single.sh b/libdeepgalois/scripts/test-single.sh new file mode 100755 index 0000000000..78093d71ed --- /dev/null +++ b/libdeepgalois/scripts/test-single.sh @@ -0,0 +1 @@ +./gcn cora -k=200 -t=14 diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 340b77650a..6d390ea867 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -104,6 +104,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); + double total_train_time = 0.0; Timer t_epoch; // run epochs @@ -147,6 +148,7 @@ void Net::train(optimizer* opt, bool need_validate) { " train_acc ", train_acc, seperator); t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; if (need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; @@ -162,6 +164,10 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); } } + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; + galois::gPrint("\nAverage training time: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. 
inference or predict diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 7e2c3ec589..72acb8d1ff 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -19,9 +19,9 @@ namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' -static cll::opt - filetype(cll::Positional, cll::desc(""), - cll::init("gr")); // file format of the input graph +//static cll::opt +// filetype(cll::Positional, cll::desc(""), +// cll::init("gr")); // file format of the input graph static cll::opt model("m", cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' From b9401f378244a0db3d623dc6882759b2b5ad1994 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 20 Apr 2020 19:54:49 -0500 Subject: [PATCH 179/660] add gpu sigmoid --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/context.h | 3 +- .../deepgalois/layers/sigmoid_loss_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 7 ++ libdeepgalois/include/deepgalois/net.h | 5 ++ libdeepgalois/include/deepgalois/sampler.h | 35 +++++++- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/src/context.cu | 5 +- .../src/layers/sigmoid_loss_layer.cpp | 4 + .../src/layers/sigmoid_loss_layer.cu | 38 +++++++++ .../src/layers/softmax_loss_layer.cu | 30 +------ libdeepgalois/src/math_functions.cu | 83 +++++++++++++++++-- libdeepgalois/src/net.cpp | 21 +++-- libdeepgalois/src/net.cu | 74 ++++++++++++++++- libdeepgalois/src/sampler.cpp | 51 +++++++----- libdeepgalois/src/utils.cpp | 2 +- 16 files changed, 289 insertions(+), 74 deletions(-) create mode 100644 libdeepgalois/src/layers/sigmoid_loss_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 58538bb8b0..c2c64d4f0c 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -40,6 +40,7 @@ else() set(CUDA_SOURCES src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu + src/layers/sigmoid_loss_layer.cu src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index b73de071cc..d6bb004b7a 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -44,13 +44,14 @@ class Context { void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph - Graph* getGraphPointer(); + Graph* getCpuGraphPointer(); #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } + CSRGraph* getGpuGraphPointer() { return &graph_gpu; } #endif protected: diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 31bab85daa..334bf4363e 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -6,7 +6,7 @@ class sigmoid_loss_layer : public layer { public: sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); - ~sigmoid_loss_layer() {} + ~sigmoid_loss_layer(); std::string layer_type() const override { return std::string("sigmoid_loss"); } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 46f571ac35..2c3a8014ee 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -147,6 +147,12 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_da void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out_data); +void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out_data, float_t* diff); void scal_gpu(const int N, const float alpha, float* X); void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); @@ -155,4 +161,5 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index dcd538e7f6..98573d60b5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -118,8 +118,13 @@ class Net { std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) +#ifdef CPU_ONLY acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); +#else + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); +#endif }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 079a84d415..8842f0e442 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ 
b/libdeepgalois/include/deepgalois/sampler.h @@ -1,7 +1,34 @@ #pragma once #include "deepgalois/gtypes.h" -void subgraph_sampler(Graph &g, Graph &sg); -galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); -Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } -Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } +namespace deepgalois { +class Sampler { +public: + Sampler() : m(1000) {} + ~Sampler() {} + + // sample a subgraph sg of size n from graph g + void subgraph_sampler(Graph &g, Graph &sg, size_t n); + + // !API function for user-defined selection strategy + virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); + + galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); + + Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } + + Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + +protected: + size_t m; + // Utility function to randomly select k items from [begin, end) + VertexList selectVertex(GNode begin, GNode end, size_t k); + // Utility function to find ceiling of r in arr[l..h] + inline int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h); + // Utility function to select one element from n elements given a frequency (probability) distribution + size_t selectOneVertex(size_t n, std::vector dist); + // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g + void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); +}; + +} diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e7600b4605..35f9970b4c 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -22,6 +22,8 @@ typedef uint8_t label_t; // label is for classification (supervised learning) typedef uint8_t mask_t; // mask is used to indicate different uses of labels: // train, val, test typedef uint32_t VertexID; +typedef uint64_t EdgeID; +typedef std::vector VertexList; #define CHUNK_SIZE 256 #define TB_SIZE 256 diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 4d77433eda..34db607c60 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -126,9 +126,8 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { } void Context::copy_data_to_device() { - assert(labels.size() == n); CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, &labels[0], n * sizeof(label_t), + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), @@ -138,7 +137,7 @@ void Context::copy_data_to_device() { //void Context::copy_data_to_device() { //float_malloc_device(n, d_labels); - //float_copy_device(n, &labels[0], d_labels); + //float_copy_device(n, labels, d_labels); //float_malloc_device(n*feat_len, d_feats); //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 57264976db..4a76861860 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -12,6 +12,10 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, loss = new float_t[in_dims[0]]; // error for 
each sample } +sigmoid_loss_layer::~sigmoid_loss_layer() { + delete loss; +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu new file mode 100644 index 0000000000..185a03f1fe --- /dev/null +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -0,0 +1,38 @@ +#include "deepgalois/layers/sigmoid_loss_layer.h" +#include "gg.h" +#include "ggcuda.h" + +namespace deepgalois { + +sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, + std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims) { + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + float_malloc_device(in_dims[0], loss); +} + +sigmoid_loss_layer::~sigmoid_loss_layer() { + float_free_device(loss); +} + +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + init_const_gpu(input_dims[0], 0.0, loss); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, + d_masks_, context->d_labels, loss, out_data); +} + +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, + context->d_labels, out_data, in_grad); +} + +acc_t sigmoid_loss_layer::get_masked_loss() { + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); +} + +} // namespace diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 6ed45bc98e..b232284017 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -2,34 +2,6 @@ #include "gg.h" #include "ggcuda.h" -__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, - HGAccumulator total) { - total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_loss; - CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) - // total += loss[begin+i]; - total.reduce(loss[begin + i]); - } - total.thread_exit>(local_loss); -} - -//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, - float_t* loss) { - assert(count > 0); - HGAccumulator loss_accum; - Shared total_loss = Shared(1); - *(total_loss.cpu_wr_ptr()) = 0; - loss_accum.rv = total_loss.gpu_wr_ptr(); - masked_avg_loss_kernel<<>>( - begin, end, masks, loss, loss_accum); - CudaTest("solving masked_avg_loss kernel failed"); - cudaDeviceSynchronize(); - return *(total_loss.cpu_rd_ptr()) / count; -} - namespace deepgalois { softmax_loss_layer::softmax_loss_layer(unsigned level, @@ -60,7 +32,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_masked_loss() { - return masked_avg_loss(begin_, end_, count_, d_masks_, loss); + return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } } // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8002d728a5..5e607f6bed 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -311,6 +311,11 @@ __device__ void softmax_device(int n, const float_t* input, float_t* output) { } } +__device__ void sigmoid_device(int n, const float_t* in, float_t* out) { 
+ for (int i = 0; i < n; i++) + out[i] = 1. / (1. + expf(-in[i])); +} + __device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); else loss -= logf(p[idx]); @@ -343,6 +348,31 @@ void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, CudaTest("solving softmax_cross_entropy kernel failed"); } +// n: number of vectors +// len: length of vectors +// for each vector, do softmax to normalize the vector, and then compute a loss +__global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, + const float_t* in_data, + const mask_t* masks, + const label_t* labels, + float_t* loss, float_t* out_data) { + CUDA_KERNEL_LOOP(i, end-begin) { + int id = begin + i; + if (masks[id] == 1) { // masked + sigmoid_device(len, in_data + len*id, out_data + len*id); + cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + } + } +} + +void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, + const mask_t* masks, const label_t* labels, + float_t* loss, float_t* out) { + sigmoid_cross_entropy_kernel<<>>( + len, begin, end, in, masks, labels, loss, out); + CudaTest("solving sigmoid_cross_entropy kernel failed"); +} + __device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { for (int i = 0; i < n; i++) { if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); @@ -394,7 +424,7 @@ __global__ void d_cross_entropy_warp(int len, int begin, int end, } } } -// TODO: use warp + __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; @@ -406,8 +436,8 @@ __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, flo } __global__ void d_softmax_kernel(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { CUDA_KERNEL_LOOP(i, end-begin) { int id = begin + i; if (masks[id] == 1) { // masked @@ -417,8 +447,8 @@ __global__ void d_softmax_kernel(int len, int begin, int end, } __global__ void d_softmax_warp(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { + const mask_t* masks, const float_t* data, + const float_t* in_grad, float_t* out_grad) { __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index @@ -457,8 +487,8 @@ __global__ void d_softmax_warp(int len, int begin, int end, } __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { CUDA_KERNEL_LOOP(i, end-begin) { int id = begin + i; if (masks[id] == 1) { // masked @@ -536,3 +566,42 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } +__global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + +} + +void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, + const mask_t* masks, const label_t* labels, + const float_t* out, float_t* diff) { + 
d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + len, begin, end, masks, labels, out, diff); + CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); +} + +__global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, + float_t* loss, HGAccumulator total) { + total.thread_entry(); + __shared__ cub::BlockReduce::TempStorage local_loss; + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) + total.reduce(loss[begin + i]); + } + total.thread_exit>(local_loss); +} + +//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss) { + assert(count > 0); + HGAccumulator loss_accum; + Shared total_loss = Shared(1); + *(total_loss.cpu_wr_ptr()) = 0; + loss_accum.rv = total_loss.gpu_wr_ptr(); + masked_avg_loss_kernel<<>>( + begin, end, masks, loss, loss_accum); + CudaTest("solving masked_avg_loss kernel failed"); + cudaDeviceSynchronize(); + return *(total_loss.cpu_rd_ptr()) / count; +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6d390ea867..6f1d18351d 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -119,14 +119,18 @@ void Net::train(optimizer* opt, bool need_validate) { // forward: after this phase, layer edges will contain intermediate features // for use during backprop Tfw.start(); - train_loss = - Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward +#ifdef CPU_ONLY + Graph *g = context->getGraphPointer(); +#else + CSRGraph *g = context->getGpuGraphPointer(); +#endif if (is_single_class) { train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + &train_mask[0], g); // predict } else { train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, - &train_mask[0], context->getGraphPointer()); // predict + &train_mask[0], g); // predict } Tfw.stop(); @@ -177,10 +181,15 @@ double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, Timer t_eval; t_eval.Start(); loss = fprop(begin, end, count, masks); +#ifdef CPU_ONLY + Graph* g = context->getCpuGraphPointer(); +#else + CSRGraph* g = context->getGpuGraphPointer(); +#endif if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, context->getGraphPointer()); + acc = masked_accuracy(begin, end, count, masks, g); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, context->getGraphPointer()); + acc = masked_multi_class_accuracy(begin, end, count, masks, g); } t_eval.Stop(); return t_eval.Millisecs(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 62dec7cad4..70f70b9a88 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -34,7 +34,6 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, total.thread_exit>(local_accuracy); } -//acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels); acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels) { assert(count > 0); @@ -49,12 +48,83 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, return *(total_accuracy.cpu_rd_ptr()) / count; } +__global__ void masked_f1_score_kernel(int num_classes, int begin, + int end, mask_t* masks, + 
float_t* preds, label_t* labels, + float_t* true_positive, + float_t* false_positive, + float_t* false_negtive) { + CUDA_KERNEL_LOOP(i, end - begin) { + if (masks[begin + i] == 1) { + for (size_t j = 0; j < num_classes; j++) { + auto idx = i * num_classes + j; + if (labels[idx] == 1 && preds[idx] > 0.5) { + true_positive[j] ++; + } else if (labels[idx] == 0 && preds[idx] > 0.5) { + false_positive[j] ++; + } else if (labels[idx] == 1 && preds[idx] <= 0.5) { + false_negtive[j] ++; + } + } + } + } +} + +acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, + mask_t* masks, float_t* preds, label_t* labels) { + float beta = 1.0; + assert(count > 0); + float *h_tp = new float[num_classes]; + float *h_fp = new float[num_classes]; + float *h_fn = new float[num_classes]; + float *d_tp, *d_fp, *d_fn; + float_malloc_device(num_classes, d_tp); + float_malloc_device(num_classes, d_fp); + float_malloc_device(num_classes, d_fn); + masked_f1_score_kernel<<>>( + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); + cudaMemcpy(&h_tp, d_tp, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_fp, d_fp, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(&h_fn, d_fn, sizeof(bool), cudaMemcpyDeviceToHost); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; + for (size_t i = 0; i < num_classes; i++) { + auto fn = h_fn[i]; // false negtive + auto fp = h_fp[i]; // false positive + auto tp = h_tp[i]; // true positive + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); + } + auto recallMicro = rNumerator / rDenominator; + acc_t precisionMicro = pNumerator / pDenominator; + auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + float_free_device(d_tp); + float_free_device(d_fp); + float_free_device(d_fn); + return fscoreMicro; +} + namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { + mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->d_labels); } + +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, CSRGraph* g) { + return masked_f1_score_gpu(num_classes, begin, end, count, + layers[NUM_CONV_LAYERS]->get_device_masks(), + layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + context->d_labels); } + +} // end namespace diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 98dac8c75c..fdfb9802cf 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -2,13 +2,19 @@ #include #include -// selecet k vertices from begin to end -static std::vector selectVertex(GNode begin, GNode end, size_t k) { +inline unsigned getDegree(Graph &g, GNode v) { + return std::distance(g.edge_begin(v), g.edge_end(v)); +} + +namespace deepgalois { + +// Utility function to randomly select k items from [begin, end) +VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { auto i = begin; // reservoir[] is the output array. 
Initialize // it with first k vertices - std::vector reservoir(k); + VertexList reservoir(k); for (; i < k; i++) reservoir[i] = i; // Use a different seed value so that we don't get @@ -29,7 +35,7 @@ static std::vector selectVertex(GNode begin, GNode end, size_t k) { } // Utility function to find ceiling of r in arr[l..h] -int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { +inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { unsigned mid; while (l < h) { mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 @@ -38,9 +44,9 @@ int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { return (arr[l] >= r) ? l : -1; } -// select one element from n elements given a frequency (probability) distribution +// Utility function to select one element from n elements given a frequency (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -size_t selectOneVertex(size_t n, std::vector dist) { +size_t Sampler::selectOneVertex(size_t n, std::vector dist) { std::vector offsets(n); offsets[0] = dist[0]; // compute the prefix sum of the distribution @@ -53,17 +59,14 @@ size_t selectOneVertex(size_t n, std::vector dist) { return findCeil(offsets, r, 0, n - 1); } -inline unsigned getDegree(Graph &g, GNode v) { - return std::distance(g.edge_begin(v), g.edge_end(v)); -} - -void generate_subgraph(std::set &vertex_set, Graph &g, Graph &sub) { +// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g +void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { auto nv = vertex_set.size(); size_t ne = 0; std::vector offsets(nv+1); offsets[0] = 0; size_t i = 0; - std::vector vertices(nv); + VertexList vertices(nv); for (auto v : vertex_set) { vertices[i] = v; offsets[i+1] = offsets[i] + getDegree(g, v); @@ -80,15 +83,15 @@ void generate_subgraph(std::set &vertex_set, Graph &g, Graph &sub) { } } -// generate a subgraph sg with size n from the input graph g -// n: number of vertices in the subgraph -// m: number of vertices in the frontier -void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) { +// !API function for user-defined selection strategy +// Select n vertices from graph g and put them in vertex_set. +// n: number of vertices in the subgraph; +// m: number of vertices in the frontier. 
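// A minimal usage sketch (hypothetical driver code; the variable names full_graph and
// subgraph are placeholders, and 9000 is an arbitrary sample size):
//   deepgalois::Sampler sampler;                           // frontier size m defaults to 1000
//   Graph subgraph;
//   sampler.subgraph_sampler(full_graph, subgraph, 9000);  // sample a 9000-vertex subgraph
// subgraph_sampler() first calls select_vertices() to pick the vertex set, then
// generate_subgraph() to build the sampled graph from those vertices.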
+void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m) { + assert(n == vertex_set.size()); auto num_vertices = g.size(); // number of vertices in the original input graph auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier - std::set vertex_set; - for (size_t i = 0; i < m; i++) - vertex_set.insert(frontier[i]); + for (size_t i = 0; i < m; i++) vertex_set[i] = frontier[i]; std::vector degrees(m); //std::vector probabilities(m); //unsigned sum_degree = 0; @@ -107,7 +110,15 @@ void subgraph_sampler(Graph &g, Graph &sg, size_t n, size_t m) { degrees[pos] = getDegree(g, frontier[pos]); //sum_degree -= degree; //sum_degree += degrees[pos]; - vertex_set.insert(u); + vertex_set.push_back(u); } +} + +void Sampler::subgraph_sampler(Graph &g, Graph&sg, size_t n) { + VertexList vertex_set(n); + select_vertices(g, vertex_set, n, m); generate_subgraph(vertex_set, g, sg); } + +} // end namespace + diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 72d6560aca..9030af2249 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -20,7 +20,7 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub // and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { - float beta = 1; + float beta = 1.0; std::vector true_positive(num_classes, 0); std::vector false_positive(num_classes, 0); std::vector false_negtive(num_classes, 0); From de60b193054ac35bf0bd61917d2648d450252e6c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 20 Apr 2020 21:35:30 -0500 Subject: [PATCH 180/660] fix --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/scripts/run-single.sh | 2 +- libdeepgalois/src/context.cpp | 2 +- libdeepgalois/src/net.cpp | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 704247d54b..37e2eea372 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,7 +39,7 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getGraphPointer() { + Graph* getCpuGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh index 9c0d9fcb63..37a393d788 100755 --- a/libdeepgalois/scripts/run-single.sh +++ b/libdeepgalois/scripts/run-single.sh @@ -3,7 +3,7 @@ GALOIS_HOME=/net/ohm/export/cdgc/cxh/GaloisCpp LONESTARGNN=$GALOIS_HOME/build-gnn-cpu/lonestargnn GNNS="gcn" -GRAPHS="cora citeseer pudmed flickr reddit" +GRAPHS="cora citeseer pubmed flickr reddit" #GRAPHS="cora" EPOCHS="200" NTHREADS="56" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d6616182f..7a94df8c17 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -94,7 +94,7 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -Graph* Context::getGraphPointer() { +Graph* Context::getCpuGraphPointer() { return Context::graph_cpu; } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 6f1d18351d..08af8872f0 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -121,7 +121,7 @@ void Net::train(optimizer* opt, bool need_validate) { Tfw.start(); 
train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward #ifdef CPU_ONLY - Graph *g = context->getGraphPointer(); + Graph *g = context->getCpuGraphPointer(); #else CSRGraph *g = context->getGpuGraphPointer(); #endif From d8943bafa95b694d246e20377aafedcbb3c5abda Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 09:25:47 -0500 Subject: [PATCH 181/660] fix minor --- libdeepgalois/include/deepgalois/context.h | 1 + libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/src/context.cpp | 7 ++++++- libdeepgalois/src/context.cu | 14 ++++++++++++-- libdeepgalois/src/math_functions.cu | 12 +++++++++++- 5 files changed, 31 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d6bb004b7a..206b395bb8 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -59,6 +59,7 @@ class Context { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph label_t *labels; // labels for classification: N x 1 float_t* h_feats; // input features: N x D #ifndef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 35f9970b4c..92e0d31772 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -29,7 +29,7 @@ typedef std::vector VertexList; #define TB_SIZE 256 #define BLOCK_SIZE 256 #define WARP_SIZE 32 -#define MAX_NUM_CLASSES 64 +#define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 7a94df8c17..98b3f7ed15 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,7 +7,11 @@ namespace deepgalois { #ifdef CPU_ONLY -Context::Context() {} +Context::Context() : n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), + labels(NULL), h_feats(NULL), norm_factor(NULL), + d_labels(NULL), d_feats(NULL) {} + Context::~Context() { if (labels) delete labels; if (h_feats) delete h_feats; @@ -37,6 +41,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo Graph graph_temp; galois::graphs::readGraph(graph_temp, filename); add_selfloop(graph_temp, *graph_cpu); + is_selfloop_added = selfloop; } else galois::graphs::readGraph(*graph_cpu, filename); // TODO dist version of self loop } else { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 34db607c60..4ed442c70d 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -61,7 +61,10 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() { +Context::Context() : n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), + labels(NULL), h_feats(NULL), norm_factor(NULL), + d_labels(NULL), d_feats(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -89,6 +92,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::norm_factor_counting() { std::cout << "Pre-computing normalization factor (n=" 
<< n << ") ... "; + if (!is_selfloop_added) { + std::cout << "Set -sl=1 to add selfloop\n"; + exit(0); + } #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); @@ -120,7 +127,10 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { std::string filename = path + dataset_str + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); - if (selfloop) g.add_selfloop(); + if (selfloop) { + g.add_selfloop(); + is_selfloop_added = selfloop; + } g.copy_to_gpu(graph_gpu); return graph_gpu.nnodes; } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 5e607f6bed..fa5f02de21 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -321,6 +321,16 @@ __device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, else loss -= logf(p[idx]); } +// y: ground truth +// p: predictions +__device__ void cross_entropy_multi_device(int n, const label_t *y, const float_t* p, float_t& loss) { + for (int i = 0; i < n; i++) { + if (y[i] == 0) continue; + if (p[i] == float_t(0)) loss -= logf(float_t(1e-10)); // avoid NaN exception + else loss -= logf(p[i]); + } +} + // n: number of vectors // len: length of vectors // for each vector, do softmax to normalize the vector, and then compute a loss @@ -360,7 +370,7 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, int id = begin + i; if (masks[id] == 1) { // masked sigmoid_device(len, in_data + len*id, out_data + len*id); - cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + cross_entropy_multi_device(len, labels, out_data + len*id, loss[id]); } } } From f7dfad169878768f2b0eef400aef87d5b746a236 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 10:22:28 -0500 Subject: [PATCH 182/660] refine --- libdeepgalois/include/deepgalois/context.h | 14 ++++--- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../src/layers/sigmoid_loss_layer.cu | 4 +- .../src/layers/softmax_loss_layer.cu | 4 +- libdeepgalois/src/net.cu | 41 ++++++++++--------- libdeepgalois/src/utils.cpp | 18 ++++---- 6 files changed, 43 insertions(+), 40 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 206b395bb8..e40b6a6371 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -22,23 +22,22 @@ class Context { ~Context(); size_t read_graph(std::string dataset_str, bool selfloop); + size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); + size_t read_graph_gpu(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); + label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label label_t* get_labels_ptr() { return labels; } + label_t* get_labels_device_ptr() { return d_labels; } float_t* get_in_ptr(); + float_t* get_norm_factor() { return norm_factor; } - size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); - size_t read_graph_gpu(std::string dataset_str, bool selfloop); void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); void set_label_class(bool is_single = true) { is_single_class = is_single; } - float_t* d_feats; // input features on device - 
label_t* d_labels; // labels on device - float_t* norm_factor; // normalization constant based on graph structure - #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N void genGraph(LGraph& lg, Graph& g); @@ -62,6 +61,9 @@ class Context { bool is_selfloop_added; // whether selfloop is added to the input graph label_t *labels; // labels for classification: N x 1 float_t* h_feats; // input features: N x D + float_t* norm_factor; // normalization constant based on graph structure + label_t* d_labels; // labels on device + float_t* d_feats; // input features on device #ifndef CPU_ONLY static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 0bf7a7e698..c9b8729d62 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->norm_factor; } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 185a03f1fe..6d7268d4af 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -21,14 +21,14 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); + d_masks_, context->get_labels_device_ptr(), loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); + context->get_labels_device_ptr(), out_data, in_grad); } acc_t sigmoid_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index b232284017..c2f3a98303 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -21,14 +21,14 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->d_labels, loss, out_data); + d_masks_, context->get_labels_device_ptr(), loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->d_labels, out_data, in_grad); + context->get_labels_device_ptr(), out_data, in_grad); } acc_t softmax_loss_layer::get_masked_loss() { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 
70f70b9a88..e7f7d7b603 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -48,22 +48,23 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, return *(total_accuracy.cpu_rd_ptr()) / count; } +typedef float f1count_t; __global__ void masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, float_t* preds, label_t* labels, - float_t* true_positive, - float_t* false_positive, - float_t* false_negtive) { + f1count_t* true_positive, + f1count_t* false_positive, + f1count_t* false_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { for (size_t j = 0; j < num_classes; j++) { auto idx = i * num_classes + j; if (labels[idx] == 1 && preds[idx] > 0.5) { - true_positive[j] ++; + atomicAdd(&true_positive[j], 1.0); } else if (labels[idx] == 0 && preds[idx] > 0.5) { - false_positive[j] ++; + atomicAdd(&false_positive[j], 1.0); } else if (labels[idx] == 1 && preds[idx] <= 0.5) { - false_negtive[j] ++; + atomicAdd(&false_negtive[j], 1.0); } } } @@ -74,35 +75,35 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, mask_t* masks, float_t* preds, label_t* labels) { float beta = 1.0; assert(count > 0); - float *h_tp = new float[num_classes]; - float *h_fp = new float[num_classes]; - float *h_fn = new float[num_classes]; - float *d_tp, *d_fp, *d_fn; + f1count_t* h_tp = new f1count_t[num_classes]; + f1count_t* h_fp = new f1count_t[num_classes]; + f1count_t* h_fn = new f1count_t[num_classes]; + f1count_t* d_tp, *d_fp, *d_fn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); - cudaMemcpy(&h_tp, d_tp, sizeof(bool), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_fp, d_fp, sizeof(bool), cudaMemcpyDeviceToHost); - cudaMemcpy(&h_fn, d_fn, sizeof(bool), cudaMemcpyDeviceToHost); + cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { - auto fn = h_fn[i]; // false negtive - auto fp = h_fp[i]; // false positive - auto tp = h_tp[i]; // true positive + acc_t fn = (acc_t)h_fn[i]; // false negtive + acc_t fp = (acc_t)h_fp[i]; // false positive + acc_t tp = (acc_t)h_tp[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; rDenominator = rDenominator + (tp + fn); } - auto recallMicro = rNumerator / rDenominator; + acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); float_free_device(d_tp); float_free_device(d_fp); @@ -116,7 +117,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, return masked_accuracy_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); + context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, @@ -124,7 +125,7 @@ acc_t 
Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, return masked_f1_score_gpu(num_classes, begin, end, count, layers[NUM_CONV_LAYERS]->get_device_masks(), layers[NUM_CONV_LAYERS - 1]->next()->get_data(), - context->d_labels); + context->get_labels_device_ptr()); } } // end namespace diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 9030af2249..77657c3f3c 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -21,19 +21,19 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { float beta = 1.0; - std::vector true_positive(num_classes, 0); - std::vector false_positive(num_classes, 0); - std::vector false_negtive(num_classes, 0); + std::vector true_positive(num_classes, 0); + std::vector false_positive(num_classes, 0); + std::vector false_negtive(num_classes, 0); galois::do_all(galois::iterate(begin, end), [&](const auto& i) { if (masks[i] == 1) { for (size_t j = 0; j < num_classes; j++) { auto idx = i * num_classes + j; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { - true_positive[j] ++; + __sync_fetch_and_add(&true_positive[j], 1); } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { - false_positive[j] ++; + __sync_fetch_and_add(&false_positive[j], 1); } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { - false_negtive[j] ++; + __sync_fetch_and_add(&false_negtive[j], 1); } } } @@ -43,9 +43,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; for (size_t i = 0; i < num_classes; i++) { - auto fn = false_negtive[i]; // false negtive - auto fp = false_positive[i]; // false positive - auto tp = true_positive[i]; // true positive + acc_t fn = (acc_t)false_negtive[i]; // false negtive + acc_t fp = (acc_t)false_positive[i]; // false positive + acc_t tp = (acc_t)true_positive[i]; // true positive pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; From 9a78d96b23ef786175b93b37f740c88f933a5b45 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 13:42:13 -0500 Subject: [PATCH 183/660] fix gpu --- .../include/deepgalois/layers/layer.h | 5 +- libdeepgalois/include/deepgalois/net.h | 37 +++--- libdeepgalois/include/deepgalois/utils.h | 4 +- libdeepgalois/src/net.cpp | 107 +++++++++++++----- libdeepgalois/src/net.cu | 6 +- libdeepgalois/src/utils.cpp | 11 +- lonestargnn/gcn/gcn.cpp | 31 +---- 7 files changed, 123 insertions(+), 78 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 116ab43aa1..e3c47bf72c 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -85,9 +85,10 @@ class layer : public deepgalois::node { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; +#ifdef CPU_ONLY masks_ = masks; -#ifndef CPU_ONLY - copy_masks_device(input_dims[0], masks_, d_masks_); +#else + d_masks_ = masks; #endif } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 98573d60b5..af55864424 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -30,7 +30,11 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() {} + Net() : 
is_single_class(true), num_samples(0), num_classes(0), + num_layers(0), num_epochs(0), + train_begin(0), train_end(0), train_count(0), + val_begin(0), val_end(0), val_count(0), + train_masks(NULL), val_masks(NULL), context(NULL) {} #ifndef GALOIS_USE_DIST void init(std::string dataset_str, unsigned epochs, unsigned hidden1, bool selfloop, bool is_single = true); @@ -44,8 +48,8 @@ class Net { void construct_layers(); void append_out_layer(size_t layer_id); void train(optimizer* opt, bool need_validate); // training - double evaluate(size_t begin, size_t end, size_t count, - mask_t* masks, acc_t& loss, acc_t& acc); // inference + double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference + void read_test_masks(std::string dataset, Graph* dGraph); //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, @@ -101,21 +105,28 @@ class Net { } protected: + bool is_single_class; // single-class (one-hot) or multi-class label + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + unsigned num_epochs; // number of epochs + size_t train_begin, train_end, train_count; + size_t val_begin, val_end, val_count; + size_t test_begin, test_end, test_count; + + mask_t* train_masks; // masks for training + mask_t* d_train_masks; // masks for training on device + mask_t* val_masks; // masks for validation + mask_t* d_val_masks; // masks for validation on device + mask_t* test_masks; // masks for test + mask_t* d_test_masks; // masks for test on device + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; #endif - bool is_single_class; // single-class (one-hot) or multi-class label - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 - unsigned num_epochs; // number of epochs - - std::vector feature_dims; // feature dimnesions for each layer - std::vector train_mask, val_mask; // masks for traning and validation - size_t train_begin, train_end, train_count, val_begin, val_end, val_count; - std::vector layers; // all the layers in the neural network // comparing outputs with the ground truth (labels) #ifdef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index b7a84bb10a..71a0b7748c 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -110,9 +110,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, #ifdef GALOIS_USE_DIST size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks, Graph* dGraph); + size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); #else size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks); + size_t n, size_t& begin, size_t& end, mask_t* masks); #endif } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 08af8872f0..c0919b8c52 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -31,8 +31,10 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, num_epochs = epochs; //std::cout << "Reading label masks ... 
"; - train_mask.resize(num_samples, 0); - val_mask.resize(num_samples, 0); + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); // get testing and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -40,37 +42,32 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below #ifndef GALOIS_USE_DIST - for (size_t i = train_begin; i < train_end; i++) train_mask[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_mask[i] = 1; + for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; #else // find local ID from global ID, set if it exists for (size_t i = train_begin; i < train_end; i++) { if (dGraph->isLocal(i)) { - train_mask[dGraph->getLID(i)] = 1; + train_masks[dGraph->getLID(i)] = 1; } } for (size_t i = val_begin; i < val_end; i++) { if (dGraph->isLocal(i)) { - val_mask[dGraph->getLID(i)] = 1; + val_masks[dGraph->getLID(i)] = 1; } } #endif } else { #ifndef GALOIS_USE_DIST - train_count = - read_masks(dataset_str, "train", train_begin, train_end, train_mask); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask); + train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); + val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); #else - train_count = - read_masks(dataset_str, "train", train_begin, train_end, train_mask, - dGraph); - val_count = read_masks(dataset_str, "val", val_begin, val_end, val_mask, - dGraph); + train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); #endif } - //std::cout << "Done\n"; - // NOTE: train_begin/train_end are global IDs, train_mask is a local id + // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts num_layers = NUM_CONV_LAYERS + 1; @@ -82,7 +79,10 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, feature_dims[2] = num_classes; // output embedding: E feature_dims[3] = num_classes; // normalized output embedding: E layers.resize(num_layers); + #ifndef CPU_ONLY + copy_masks_device(num_samples, train_masks, d_train_masks); + copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif } @@ -119,19 +119,20 @@ void Net::train(optimizer* opt, bool need_validate) { // forward: after this phase, layer edges will contain intermediate features // for use during backprop Tfw.start(); - train_loss = Net::fprop(train_begin, train_end, train_count, &train_mask[0]); // forward + double fw_time = evaluate("train", train_loss, train_acc); + /* + train_loss = Net::fprop(train_begin, train_end, train_count, train_masks); // forward #ifdef CPU_ONLY Graph *g = context->getCpuGraphPointer(); #else CSRGraph *g = context->getGpuGraphPointer(); #endif if (is_single_class) { - train_acc = masked_accuracy(train_begin, train_end, train_count, - &train_mask[0], g); // predict + train_acc = masked_accuracy(train_begin, train_end, train_count, train_masks, g); // predict } else { - train_acc = 
masked_multi_class_accuracy(train_begin, train_end, train_count, - &train_mask[0], g); // predict + train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, train_masks, g); // predict } + */ Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -157,15 +158,15 @@ void Net::train(optimizer* opt, bool need_validate) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); - double val_time = evaluate(val_begin, val_end, val_count, &val_mask[0], - val_loss, val_acc); + double val_time = evaluate("val", val_loss, val_acc); Tval.stop(); galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, " val_acc ", val_acc, seperator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); } } double avg_train_time = total_train_time / (double)num_epochs; @@ -175,11 +176,38 @@ void Net::train(optimizer* opt, bool need_validate) { } // evaluate, i.e. inference or predict -double Net::evaluate(size_t begin, size_t end, size_t count, mask_t* masks, - acc_t& loss, acc_t& acc) { +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); + size_t begin = 0, end = 0, count = 0; + mask_t* masks = NULL; + if (type == "train") { + begin = train_begin; + end = train_end; + count = train_count; + masks = train_masks; + } else if (type == "val") { + begin = val_begin; + end = val_end; + count = val_count; + masks = val_masks; + } else { + begin = test_begin; + end = test_end; + count = test_count; + masks = test_masks; + } +#ifndef CPU_ONLY + if (type == "train") { + masks = d_train_masks; + } else if (type == "val") { + masks = d_val_masks; + } else { + masks = d_test_masks; + } +#endif + loss = fprop(begin, end, count, masks); #ifdef CPU_ONLY Graph* g = context->getCpuGraphPointer(); @@ -233,6 +261,33 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } +void Net::read_test_masks(std::string dataset, Graph* dGraph) { + test_masks = new mask_t[num_samples]; + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST + for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_masks[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST + test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); +#else + test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); +#endif + } +#ifndef CPU_ONLY + copy_masks_device(num_samples, test_masks, d_test_masks); +#endif +} + #ifdef CPU_ONLY /** * diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index e7f7d7b603..a26cf603b6 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -114,16 +114,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { - return 
masked_accuracy_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), + return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { - return masked_f1_score_gpu(num_classes, begin, end, count, - layers[NUM_CONV_LAYERS]->get_device_masks(), + return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[NUM_CONV_LAYERS - 1]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 77657c3f3c..46470e2997 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -62,7 +62,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, //! Get masks from datafile where first line tells range of //! set to create mask from size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, std::vector& masks) { + size_t n, size_t& begin, size_t& end, mask_t* masks) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -96,14 +96,14 @@ size_t read_masks(std::string dataset_str, std::string mask_type, } std::cout << mask_type + "_mask range: [" << begin << ", " << end << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)masks.size()*(float)100 << "\%)\n"; + << (float)sample_count/(float)n*(float)100 << "\%)\n"; in.close(); return sample_count; } #else size_t read_masks(std::string dataset_str, std::string mask_type, - size_t& begin, size_t& end, - std::vector& masks, Graph* dGraph) { + size_t n, size_t& begin, size_t& end, + mask_t* masks, Graph* dGraph) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -139,7 +139,8 @@ size_t read_masks(std::string dataset_str, std::string mask_type, i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "\n"; + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; in.close(); return sample_count; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index e23097befe..109b0522f1 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -46,38 +46,17 @@ int main(int argc, char** argv) { Ttrain.stop(); if (do_test) { - galois::gPrint("\n"); // test using test samples - size_t n = network.get_nnodes(); - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 0, test_end = n, test_count = n; - std::vector test_mask(n, 0); - if (dataset == "reddit") { - test_begin = 177262; - test_count = 55703; - test_end = test_begin + test_count; -#ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) - test_mask[i] = 1; -#else - for (size_t i = test_begin; i < test_end; i++) { - if (dGraph->isLocal(i)) { - test_mask[dGraph->getLID(i)] = 1; - } - } -#endif - } else { + galois::gPrint("\n"); #ifndef GALOIS_USE_DIST - test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, test_mask); + network.read_test_masks(dataset, NULL); #else - test_count = deepgalois::read_masks(dataset, "test", test_begin, test_end, - test_mask, dGraph); + network.read_test_masks(dataset, dGraph); #endif - } galois::StatTimer Ttest("Test"); Ttest.start(); - double test_time = 
network.evaluate(test_begin, test_end, test_count, - &test_mask[0], test_loss, test_acc); + acc_t test_loss = 0.0, test_acc = 0.0; + double test_time = network.evaluate("test", test_loss, test_acc); galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, " test_time = ", test_time, "\n"); Ttest.stop(); From bcb03c51dcb1550f50cdba46bc19d2e740cc056b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 21 Apr 2020 18:46:06 -0500 Subject: [PATCH 184/660] fix f1score gpu --- libdeepgalois/include/deepgalois/net.h | 17 +------ libdeepgalois/src/context.cu | 16 +++++-- .../src/layers/sigmoid_loss_layer.cpp | 2 +- libdeepgalois/src/math_functions.cu | 46 ++++++++++++++++++- libdeepgalois/src/net.cpp | 16 +++++++ libdeepgalois/src/net.cu | 22 ++++++--- 6 files changed, 89 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index af55864424..cb7578dafc 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -50,6 +50,7 @@ class Net { void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference void read_test_masks(std::string dataset, Graph* dGraph); + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, @@ -72,22 +73,6 @@ class Net { layers[i]->print_layer_info(); } - //! forward propagation: [begin, end) is the range of samples used. - //! calls "forward" on the layers of the network and returns the loss of the - //! final layer - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) { - layers[i]->forward(); - // TODO need to sync model between layers here - } - return layers[num_layers - 1]->get_masked_loss(); - } - // back propogation void bprop() { for (size_t i = num_layers; i != 0; i--) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 4ed442c70d..93300abffb 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -83,6 +83,9 @@ Context::~Context() { CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -136,12 +139,15 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { } void Context::copy_data_to_device() { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), - cudaMemcpyHostToDevice)); + if (is_single_class) { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + } else { + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * 
feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); //print_device_vector(10, d_feats, "d_feats"); } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 4a76861860..feb493a636 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -41,7 +41,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index fa5f02de21..6438fc5db3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -529,6 +529,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, if (pid < len) p[warp_lane][pid] = data[base+pid]; } __syncthreads(); + + // cross entropy derivative for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { @@ -538,6 +540,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, } } __syncthreads(); + + // softmax derivative for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { @@ -579,7 +583,47 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end-begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; + if (masks[id] == 1) { + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) p[warp_lane][pid] = data[base+pid]; + } + __syncthreads(); + // cross entropy derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + //if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = -(float_t)labels[base+pid] / (p[warp_lane][pid] + 1e-10); + //else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + } + } + __syncthreads(); + + // sigmoid derivative + for (int i = 0; i < len; i += WARP_SIZE) { + int pid = thread_lane + i; + if (pid < len) { + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base+pid] = dp * self * (float_t(1) - self); + } + } + __syncthreads(); + } + } } void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, @@ -587,7 
+631,7 @@ void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* out, float_t* diff) { d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( len, begin, end, masks, labels, out, diff); - CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); + CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); } __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c0919b8c52..45b91142d8 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -223,6 +223,22 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { return t_eval.Millisecs(); } +//! forward propagation: [begin, end) is the range of samples used. +//! calls "forward" on the layers of the network and returns the loss of the +//! final layer +acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } + return layers[num_layers - 1]->get_masked_loss(); +} + void Net::construct_layers() { std::cout << "\nConstructing layers...\n"; append_conv_layer(0, true); // first conv layer diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index a26cf603b6..e8d60b1e03 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -25,8 +25,7 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, - preds + (begin + i) * num_classes); + label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); if (pred == labels[begin + i]) total.reduce(1.0); } @@ -56,9 +55,10 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, f1count_t* false_positive, f1count_t* false_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { - if (masks[begin + i] == 1) { + int id = begin + i; + if (masks[id] == 1) { for (size_t j = 0; j < num_classes; j++) { - auto idx = i * num_classes + j; + int idx = id * num_classes + j; if (labels[idx] == 1 && preds[idx] > 0.5) { atomicAdd(&true_positive[j], 1.0); } else if (labels[idx] == 0 && preds[idx] > 0.5) { @@ -82,11 +82,15 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); + init_const_gpu(num_classes, 0.0, d_tp); + init_const_gpu(num_classes, 0.0, d_fp); + init_const_gpu(num_classes, 0.0, d_fn); masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); - cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); - cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); - cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost); + CudaTest("solving masked_f1_score_kernel kernel failed"); + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), 
cudaMemcpyDeviceToHost)); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; @@ -105,9 +109,13 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, acc_t precisionMicro = pNumerator / pDenominator; acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); + delete h_tp; + delete h_fp; + delete h_fn; return fscoreMicro; } From a2d83a606bf2a8f60a5018bd16daa0be0b1eea4c Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 22 Apr 2020 20:25:18 -0500 Subject: [PATCH 185/660] add weight_decay --- .../deepgalois/layers/graph_conv_layer.h | 1 + .../include/deepgalois/layers/layer.h | 3 +- .../deepgalois/layers/sigmoid_loss_layer.h | 2 +- .../deepgalois/layers/softmax_loss_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 7 ++- libdeepgalois/include/deepgalois/net.h | 29 ++++----- libdeepgalois/include/deepgalois/utils.h | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 11 +++- libdeepgalois/src/layers/graph_conv_layer.cu | 11 +++- .../src/layers/sigmoid_loss_layer.cpp | 2 +- .../src/layers/sigmoid_loss_layer.cu | 2 +- .../src/layers/softmax_loss_layer.cpp | 2 +- .../src/layers/softmax_loss_layer.cu | 2 +- libdeepgalois/src/math_functions.cpp | 25 ++++++-- libdeepgalois/src/math_functions.cu | 5 ++ libdeepgalois/src/net.cpp | 56 +++++++++-------- libdeepgalois/src/net.cu | 4 +- lonestargnn/gcn/gcn.cpp | 19 +++--- lonestargnn/include/lonestargnn.h | 60 +++++++------------ 19 files changed, 139 insertions(+), 106 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index c9b8729d62..63062133df 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -37,6 +37,7 @@ class graph_conv_layer : public layer { std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } + virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index e3c47bf72c..188feebe75 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -61,7 +61,8 @@ class layer : public deepgalois::node { //! save context virtual void set_context(ContextType* ctx) { context = ctx; } //! 
return layer loss - virtual acc_t get_masked_loss() { return acc_t(0); } + virtual acc_t get_prediction_loss() { return acc_t(0); } + virtual acc_t get_weight_decay_loss() { return acc_t(0); } // main functions for layer work virtual void forward_propagation(const float_t* in_data, diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 334bf4363e..0f46cde043 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -13,6 +13,6 @@ class sigmoid_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); - virtual acc_t get_masked_loss(); + virtual acc_t get_prediction_loss(); }; } diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 7194d06f2e..1a5b7e86ee 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -13,6 +13,6 @@ class softmax_loss_layer : public layer { virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); - virtual acc_t get_masked_loss(); + virtual acc_t get_prediction_loss(); }; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 2c3a8014ee..a70ae071f9 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,8 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 +float_t l2_norm(size_t n, const float_t* a); //! 
clear n elements of a vector void clear_cpu(size_t n, float_t* in); // dropout functions randomly remove weights @@ -153,8 +155,8 @@ void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_da void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); -void scal_gpu(const int N, const float alpha, float* X); -void add_scalar_gpu(const int N, const float_t alpha, float_t* Y); +void scal_gpu(const int n, const float alpha, float* X); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); @@ -162,4 +164,5 @@ void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t l2_norm_gpu(int n, float_t *tensor); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index cb7578dafc..6a03611371 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -18,10 +18,6 @@ #include "deepgalois/DistContext.h" #endif - - -#define NUM_CONV_LAYERS 2 - namespace deepgalois { // N: number of vertices, D: feature vector dimentions, @@ -31,17 +27,15 @@ namespace deepgalois { class Net { public: Net() : is_single_class(true), num_samples(0), num_classes(0), - num_layers(0), num_epochs(0), + num_conv_layers(0), num_layers(0), num_epochs(0), + learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), - train_masks(NULL), val_masks(NULL), context(NULL) {} - #ifndef GALOIS_USE_DIST - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, bool is_single = true); - #else - void init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, Graph* dGraph); - #endif + test_begin(0), test_end(0), test_count(0), + train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} + void init(std::string dataset_str, unsigned num_conv, unsigned epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool is_single, Graph* dGraph); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -54,8 +48,7 @@ class Net { //! Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true, - float_t dropout_rate = 0.5); + bool bias = false, bool dropout = true); //! 
Save the context object to all layers of the network void set_contexts() { @@ -93,8 +86,12 @@ class Net { bool is_single_class; // single-class (one-hot) or multi-class label size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E - size_t num_layers; // for now hard-coded: NUM_CONV_LAYERS + 1 + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) unsigned num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 71a0b7748c..097457290d 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -102,7 +102,7 @@ uniform_rand(T min, T max) { } inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) <= p; + return uniform_rand(float_t(0), float_t(1)) > p; } acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 171b32305c..b640acd75a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -123,6 +123,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); #endif } -#endif + +acc_t graph_conv_layer::get_weight_decay_loss() { + acc_t loss = 0.0; + for (size_t i = 0; i < y*z; i+=z) { + loss += math::l2_norm(z, &layer::W[i]); + } + return loss; +} + +#endif // end if CPU_ONLY } // namespace diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 12d9902179..322500d916 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -47,7 +47,11 @@ void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - assert(z <= MAX_NUM_CLASSES); // currently only support feature length <= 128 + if (z > MAX_NUM_CLASSES) { + std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; + // currently only support feature length <= 128 + exit(0); + } init_const_gpu(x*z, 0.0, out_temp); if (dropout_ && phase_ == deepgalois::net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); @@ -83,5 +87,10 @@ void graph_conv_layer::back_propagation(const float_t* in_data, d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } +acc_t graph_conv_layer::get_weight_decay_loss() { + acc_t loss = l2_norm_gpu(y*z, d_W); + return loss; +} + } // namespace diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index feb493a636..763bd6646d 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -52,7 +52,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* }, galois::chunk_size(), galois::steal(), 
galois::loopname("sigmoid-loss-bw")); } -acc_t sigmoid_loss_layer::get_masked_loss() { +acc_t sigmoid_loss_layer::get_prediction_loss() { assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 6d7268d4af..c52b9089f0 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -31,7 +31,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, context->get_labels_device_ptr(), out_data, in_grad); } -acc_t sigmoid_loss_layer::get_masked_loss() { +acc_t sigmoid_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 7c5b11d233..4a92e56ec3 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -58,7 +58,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // no weight sync required: this is all local graph information } -acc_t softmax_loss_layer::get_masked_loss() { +acc_t softmax_loss_layer::get_prediction_loss() { assert(count_ > 0); AccumF total_loss; AccumU valid_sample_count; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index c2f3a98303..e73ef27f33 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -31,7 +31,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, context->get_labels_device_ptr(), out_data, in_grad); } -acc_t softmax_loss_layer::get_masked_loss() { +acc_t softmax_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index cdde9cc964..f81444fa70 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -45,7 +45,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } -const size_t vec_len = 8; +const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add #if defined(__AVX__) || defined(__AVX2__) void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { @@ -76,6 +76,17 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } + +float_t l2_norm(size_t n, const float_t* in) { + const size_t alignedN = n - n % vec_len; + __m256 vsum = _mm256_set1_ps(0.0); + for (size_t i = 0; i < alignedN; i += vec_len) { + __m256 a = _mm256_loadu_ps(&in[i]); + vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, a)); + } + __m256 sum = _mm256_hadd_ps(vsum, vsum); + return ((float_t*)&sum)[0] + ((float_t*)&sum)[2];; +} #else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { @@ -85,6 +96,12 @@ void mul_scalar(const float_t alpha, vec_t& Y) { void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } + +float_t l2_norm(size_t n, const float_t* a) { + float_t sum = 0.0; + for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; + return sum/2.0; +} #endif // dot product @@ -117,7 +134,7 @@ void dropout(const float scale, const float 
dropout_rate, const vec_t& in, assert(masks.size() == out.size()); // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -125,7 +142,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout(const float scale, const float dropout_rate, const vec_t& in, std::vector& masks, float_t* out) { for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; for (size_t i = 0; i < in.size(); ++i) out[i] = in[i] * masks[i] * scale; } @@ -133,7 +150,7 @@ void dropout(const float scale, const float dropout_rate, const vec_t& in, void dropout_cpu(size_t n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out) { galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; out[i] = in[i] * masks[i] * scale; }, galois::loopname("dropout")); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 6438fc5db3..c15b749e8d 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -659,3 +659,8 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* return *(total_loss.cpu_rd_ptr()) / count; } +acc_t l2_norm_gpu(int n, float_t * tensor) { + acc_t sum = 0.0; + return sum / 2.0; +} + diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 45b91142d8..91e39affeb 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -6,15 +6,22 @@ namespace deepgalois { -#ifndef GALOIS_USE_DIST -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, bool is_single) { -#else -void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, - bool selfloop, Graph* dGraph) { -#endif -#ifndef GALOIS_USE_DIST +void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool is_single, Graph* dGraph) { + num_conv_layers = num_conv; + num_epochs = epochs; + learning_rate = lr; + dropout_rate = dropout; + weight_decay = wd; is_single_class = is_single; + galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, + ", num_epochs ", num_epochs, + ", hidden1 ", hidden1, + ", learning_rate ", learning_rate, + ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +#ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_label_class(is_single); num_samples = context->read_graph(dataset_str, selfloop); @@ -28,14 +35,14 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, // read graph, get num nodes num_classes = context->read_labels(dataset_str); - num_epochs = epochs; //std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; std::fill(train_masks, train_masks+num_samples, 0); std::fill(val_masks, val_masks+num_samples, 0); - // get testing and validation sets + + // get training and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -70,7 +77,7 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts - num_layers = NUM_CONV_LAYERS + 1; + num_layers = num_conv_layers + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = @@ -88,15 +95,12 @@ void Net::init(std::string dataset_str, unsigned epochs, unsigned hidden1, } void Net::train(optimizer* opt, bool need_validate) { -#ifdef GALOIS_USE_DIST - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; -#else - //std::string header = "[" + std::to_string(0) + "] "; - //std::string seperator = "\n"; std::string header = ""; std::string seperator = " "; +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; #endif galois::gPrint("\nStart training...\n"); @@ -236,7 +240,11 @@ acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { layers[i]->forward(); // TODO need to sync model between layers here } - return layers[num_layers - 1]->get_masked_loss(); + // prediction error + auto loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; } void Net::construct_layers() { @@ -265,9 +273,9 @@ void Net::append_out_layer(size_t layer_id) { //! 
Add a convolution layer to the network void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, - bool dropout, float_t dropout_rate) { + bool dropout) { assert(dropout_rate < 1.0); - assert(layer_id < NUM_CONV_LAYERS); + assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); in_dims[0] = out_dims[0] = num_samples; in_dims[1] = get_in_dim(layer_id); @@ -327,7 +335,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks if (masks[i] == 1) { // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[i * num_classes])); + &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); // check prediction if ((label_t)preds == context->get_label(i)) accuracy_all += 1.0; @@ -342,7 +350,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks if (masks[localID] == 1) { // get prediction int preds = argmax(num_classes, - &(layers[NUM_CONV_LAYERS - 1]->next()->get_data()[localID * num_classes])); + &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); // check prediction if ((label_t)preds == context->get_label(localID)) accuracy_all += 1.0; @@ -361,7 +369,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { - auto preds = layers[NUM_CONV_LAYERS - 1]->next()->get_data(); + auto preds = layers[num_conv_layers - 1]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index e8d60b1e03..c7acda5666 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -123,14 +123,14 @@ namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_device_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[NUM_CONV_LAYERS - 1]->next()->get_data(), + layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 109b0522f1..1a3698bc96 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,25 +18,22 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train + Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif -#ifndef GALOIS_USE_DIST // read network, features, ground truth, initialize metadata - network.init(dataset, epochs, hidden1, add_selfloop, is_single_class); -#else - network.init(dataset, epochs, hidden1, add_selfloop, dGraph); -#endif - network.construct_layers(); // default setting for now; can be customized by - // the user + network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, + dropout_rate, weight_decay, add_selfloop, is_single_class, dGraph); + // default setting for now; can be customized by the user + network.construct_layers(); 
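  // Descriptive note (added for clarity; mirrors Net::construct_layers and the
  // feature_dims setup in net.cpp): construct_layers() appends num_conv_layers
  // graph convolution layers followed by the output loss layer, e.g. with the
  // default 2-layer GCN:
  //   layer 0: input features N x D      -> hidden    N x hidden1
  //   layer 1: hidden         N x hidden1 -> scores    N x E
  //   layer 2: softmax (single-class) or sigmoid (multi-class) loss over N x E
  // print_layers_info() then reports each layer's input/output dimensions.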
network.print_layers_info(); + deepgalois::ResourceManager rm; // tracks peak memory usage - // tracks peak memory usage - deepgalois::ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details + // the optimizer used to update parameters, + // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); deepgalois::optimizer* opt = new deepgalois::adam(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 72acb8d1ff..1c96548a36 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -16,45 +16,31 @@ #endif namespace cll = llvm::cl; -static cll::opt - dataset(cll::Positional, cll::desc(""), - cll::Required); // 'cora', 'citeseer', 'pubmed' -//static cll::opt -// filetype(cll::Positional, cll::desc(""), -// cll::init("gr")); // file format of the input graph -static cll::opt - model("m", cll::desc("Model string"), - cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt - learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), - cll::init(0.01)); -static cll::opt - epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), - cll::init(1)); -static cll::opt - hidden1("h", - cll::desc("Number of units in hidden layer 1 (default value 16)"), - cll::init(16)); -static cll::opt dropout_rate( - "d", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), - cll::init(0.5)); -static cll::opt weight_decay( - "wd", - cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), - cll::init(5e-4)); -static cll::opt early_stopping( - "es", - cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), - cll::init(10)); -static cll::opt max_degree( - "md", cll::desc("Maximum Chebyshev polynomial degree (default value 3)"), - cll::init(3)); -static cll::opt do_validate("dv", cll::desc("enable validation"), - cll::init(1)); -static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); -static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt dataset(cll::Positional, + cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' +//static cll::opt model("m", +// cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' +static cll::opt epochs("k", + cll::desc("number of epoch, i.e. 
iterations (default value 1)"), cll::init(1)); +static cll::opt num_conv_layers("nc", + cll::desc("number of convolutional layers, (default value 2)"), cll::init(2)); +static cll::opt hidden1("h", + cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); +static cll::opt learning_rate("lr", + cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); +static cll::opt dropout_rate("d", + cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); +static cll::opt weight_decay("wd", + cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); +static cll::opt early_stopping("es", + cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); +static cll::opt max_degree("md", + cll::desc("Maximum size of the downsampled adjacency lists (default value 25)"), cll::init(25)); static cll::opt is_single_class("sc", cll::desc("single-class or multi-class label (default single)"), cll::init(1)); +static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); +static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); +static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 6b59d1c7a9a29b84188d8696467f37d6501ac3c8 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 23 Apr 2020 10:09:18 -0500 Subject: [PATCH 186/660] add leaky_relu_layer --- libdeepgalois/CMakeLists.txt | 17 ++++-- .../deepgalois/layers/leaky_relu_layer.h | 20 +++++++ .../include/deepgalois/layers/relu_layer.h | 11 +--- .../include/deepgalois/math_functions.hh | 4 ++ libdeepgalois/include/deepgalois/types.h | 2 +- libdeepgalois/src/layers/leaky_relu_layer.cpp | 32 ++++++++++++ libdeepgalois/src/layers/leaky_relu_layer.cu | 17 ++++++ libdeepgalois/src/layers/relu_layer.cpp | 52 +++++-------------- libdeepgalois/src/layers/relu_layer.cu | 19 +++++++ libdeepgalois/src/math_functions.cu | 32 ++++++++++-- 10 files changed, 149 insertions(+), 57 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h create mode 100644 libdeepgalois/src/layers/leaky_relu_layer.cpp create mode 100644 libdeepgalois/src/layers/leaky_relu_layer.cu create mode 100644 libdeepgalois/src/layers/relu_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index c2c64d4f0c..193988f414 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -41,6 +41,8 @@ else() src/layers/graph_conv_layer.cu src/layers/softmax_loss_layer.cu src/layers/sigmoid_loss_layer.cu + src/layers/leaky_relu_layer.cu + src/layers/relu_layer.cu src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu @@ -58,25 +60,30 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way set(sources - src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/relu_layer.cpp src/layers/aggregator.cpp - src/layers/layer.cpp src/math_functions.cpp - src/optimizer.cpp + src/layers/layer.cpp src/DistContext.cpp + src/optimizer.cpp src/utils.cpp src/node.cpp src/net.cpp ) else() set(sources - src/layers/graph_conv_layer.cpp src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + 
src/layers/leaky_relu_layer.cpp + src/layers/relu_layer.cpp src/layers/aggregator.cpp - src/layers/layer.cpp src/math_functions.cpp + src/layers/layer.cpp src/optimizer.cpp src/context.cpp src/sampler.cpp diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h new file mode 100644 index 0000000000..a8b6136eea --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// Leaky ReLU Layer +class leaky_relu_layer : public layer { +public: + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) : + leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + ~leaky_relu_layer() {} + std::string layer_type() const override { return std::string("leaky_relu"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +protected: + float_t epsilon_; + size_t n; +}; +} // namespace diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index a85d51608d..601c5d67ed 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -5,18 +5,11 @@ namespace deepgalois { // ReLU Layer class relu_layer : public layer { public: - relu_layer(unsigned level, std::vector in_dims, - std::vector out_dims) - : layer(level, in_dims, out_dims) { - trainable_ = false; - } + relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims) { trainable_ = false; } ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } - virtual void forward_propagation(const tensor_t& in_data, tensor_t& out_data); virtual void forward_propagation(const float_t* in_data, float_t* out_data); - virtual void back_propagation(const tensor_t& in_data, - const tensor_t& out_data, tensor_t& out_grad, - tensor_t& in_grad); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a70ae071f9..a66d721d34 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -127,6 +127,10 @@ void vadd_gpu(const int n, const float_t* a, const float_t* b, void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative +void leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in, float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, + const float_t* data, float_t* out_diff); // Leaky ReLU derivative void dropout_gpu(const int n, const float scale, const float dropout_rate, const float_t* in, unsigned* masks, float_t* out); // dropout void d_dropout_gpu(const int n, const float scale, const float dropout_rate, diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 92e0d31772..a2f6164439 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ 
-24,6 +24,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; +typedef std::vector dims_t; // dimentions type #define CHUNK_SIZE 256 #define TB_SIZE 256 @@ -33,7 +34,6 @@ typedef std::vector VertexList; #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE - #ifdef GALOIS_USE_DIST namespace deepgalois { //! Set this to let sync struct know where to get data from diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp new file mode 100644 index 0000000000..650a0aa1be --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -0,0 +1,32 @@ +#include "deepgalois/layers/leaky_relu_layer.h" + +namespace deepgalois { + +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, + dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); +} + +#ifdef CPU_ONLY +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out_data[i] = in_data[i] > (float_t)0 ? in_data[i] : epsilon_ * in_data[i]; + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-fw")); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + in_grad[i] = out_grad[i] * (out_data[i] > float_t(0) ? float_t(1) : epsilon_); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-bw")); +} +#endif + +} // namespace diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu new file mode 100644 index 0000000000..43e7f93d04 --- /dev/null +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -0,0 +1,17 @@ +#include "deepgalois/layers/leaky_relu_layer.h" + +namespace deepgalois { + +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + leaky_relu_gpu(n, epsilon_, in_data, out_data); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) +void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 7441294f83..f0d3a74a49 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -2,48 +2,24 @@ namespace deepgalois { +#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const tensor_t& in_data, - tensor_t& out_data) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), - [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - out_data[i][j] = std::max(in_data[i][j], (float_t)0); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("relu_layer-fw")); -} - -// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { - const size_t count = input_dims[0] * input_dims[1]; - relu_gpu(count, in_data, out_data); -} - -// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) -// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const tensor_t& in_data, - const tensor_t& out_data, tensor_t& out_grad, - tensor_t& in_grad) { - galois::do_all(galois::iterate((size_t)0, input_dims[0]), - [&](const auto& i) { - for (size_t j = 0; j < input_dims[1]; ++j) - in_grad[i][j] = out_data[i][j] > float_t(0) - ? out_grad[i][j] - : float_t(0); - }, - galois::chunk_size(), galois::steal(), - galois::loopname("relu_layer-bw")); +void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t n = input_dims[0] * input_dims[1]; + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out_data[i] = std::max(in_data[i], (float_t)0); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-fw")); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) -// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ , ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t* in_data, - const float_t* out_data, float_t* out_grad, - float_t* in_grad) { - const size_t count = input_dims[0] * input_dims[1]; - d_relu_gpu(count, out_grad, in_data, in_grad); +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t n = input_dims[0] * input_dims[1]; + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + in_grad[i] = out_data[i] > float_t(0) ? 
out_grad[i] : float_t(0); + }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-bw")); } +#endif } // namespace diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu new file mode 100644 index 0000000000..f3a45936b4 --- /dev/null +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -0,0 +1,19 @@ +#include "deepgalois/layers/relu_layer.h" + +namespace deepgalois { + +// ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) +void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + const size_t count = input_dims[0] * input_dims[1]; + relu_gpu(count, in_data, out_data); +} + +// ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) +// = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ +void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + const size_t count = input_dims[0] * input_dims[1]; + d_relu_gpu(count, out_grad, in_data, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c15b749e8d..61114f0daf 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -109,7 +109,7 @@ void d_dropout_gpu(const int n, const float scale, const float dropout_rate, // flattern data into 1D before feed into the ReLU operater __global__ void relu_kernel(const int n, const float_t* in, float_t* out) { - CUDA_KERNEL_LOOP(index, n) { out[index] = in[index] > 0 ? in[index] : 0; } + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : 0; } } void relu_gpu(const int n, const float_t* in, float_t* out) { @@ -119,9 +119,7 @@ void relu_gpu(const int n, const float_t* in, float_t* out) { __global__ void d_relu_kernel(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff) { - CUDA_KERNEL_LOOP(index, n) { - out_diff[index] = data[index] > 0 ? in_diff[index] : 0; - } + CUDA_KERNEL_LOOP(i, n) { out_diff[i] = data[i] > 0 ? in_diff[i] : 0; } } void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, @@ -131,6 +129,32 @@ void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, CudaTest("solving d_relu kernel failed"); } +// flattern data into 1D before feed into the ReLU operater +__global__ void leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; } +} + +void leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in, float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, out); + CudaTest("solving leaky_relu kernel failed"); +} + +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, float_t* out_diff) { + CUDA_KERNEL_LOOP(i, n) { + out_diff[i] = in_diff[i] * (data[i] > 0 ? 
1.0 : epsilon); + } +} + +void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, + const float_t* data, float_t* out_diff) { + d_leaky_relu_kernel<<>>( + n, epsilon, in_diff, data, out_diff); + CudaTest("solving d_leaky_relu kernel failed"); +} + __global__ void matmul_kernel(int x, int y, int z, const float_t* A, const float_t* B, float_t* C) { int row = blockIdx.x*blockDim.x+threadIdx.x; From 693faed70c3b3d93f9d571d159faf6bd5773d234 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 23 Apr 2020 12:47:30 -0500 Subject: [PATCH 187/660] update leaky_relu --- .../include/deepgalois/math_functions.hh | 2 ++ libdeepgalois/src/layers/leaky_relu_layer.cpp | 8 ++------ libdeepgalois/src/layers/relu_layer.cpp | 8 ++------ libdeepgalois/src/math_functions.cpp | 19 +++++++++++++++++-- 4 files changed, 23 insertions(+), 14 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a66d721d34..dd4d5e4219 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -41,6 +41,8 @@ void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, void relu_cpu(size_t n, const float_t* in, float_t* out); //! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax void softmax(const vec_t& input, vec_t& output); diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index 650a0aa1be..0d5a7f66fb 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -14,18 +14,14 @@ leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, #ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out_data[i] = in_data[i] > (float_t)0 ? in_data[i] : epsilon_ * in_data[i]; - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-fw")); + math::leaky_relu_cpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - in_grad[i] = out_grad[i] * (out_data[i] > float_t(0) ? 
float_t(1) : epsilon_); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("leaky_relu_layer-bw")); + math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index f0d3a74a49..2e89af1bd5 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -6,9 +6,7 @@ namespace deepgalois { // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t n = input_dims[0] * input_dims[1]; - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out_data[i] = std::max(in_data[i], (float_t)0); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-fw")); + math::relu_cpu(n, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) @@ -16,9 +14,7 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t n = input_dims[0] * input_dims[1]; - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - in_grad[i] = out_data[i] > float_t(0) ? out_grad[i] : float_t(0); - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("relu_layer-bw")); + math::d_relu_cpu(n, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index f81444fa70..c6b64504e8 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -178,7 +178,7 @@ void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); - }, galois::loopname("relu")); + }, galois::chunk_size<64>(), galois::loopname("relu")); } void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) { @@ -186,7 +186,22 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) // check if original data greater than 0; if so keep grad galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { out[i] = data[i] > float_t(0) ? in[i] : float_t(0); - }, galois::loopname("d_relu")); + }, galois::chunk_size<64>(), galois::loopname("d_relu")); +} + +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; + }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); +} + +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out) { + // TODO: vectorize + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? 
float_t(1) : epsilon); + }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } void softmax(const vec_t& input, vec_t& output) { From b5796f33a8b42bf84153081c2643acab37f297f2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 24 Apr 2020 14:32:43 -0500 Subject: [PATCH 188/660] support arbitrary num_conv_layers --- .../include/deepgalois/math_functions.hh | 5 +-- libdeepgalois/include/deepgalois/net.h | 18 +++-------- libdeepgalois/src/math_functions.cpp | 15 +++++++++ libdeepgalois/src/math_functions.cu | 30 +++++++++++------ libdeepgalois/src/net.cpp | 32 ++++++++++++++----- 5 files changed, 66 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index dd4d5e4219..edd7fc6eb6 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,6 +27,7 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y); //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector @@ -124,8 +125,8 @@ int argmax(const size_t n, const float_t* x); // the arguments of the maxima bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element void init_const_gpu(int n, float_t value, float_t *array); void copy_gpu(int len, const float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, - float_t* out); // vector add +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y); // axpy void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 6a03611371..e4016231d4 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -66,21 +66,11 @@ class Net { layers[i]->print_layer_info(); } - // back propogation - void bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } - } - + void bprop(); // back propogation + void normalize(); + void regularize(); // update trainable weights after back-propagation - void update_weights(optimizer* opt) { - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } - } + void update_weights(optimizer* opt); protected: bool is_single_class; // single-class (one-hot) or multi-class label diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index c6b64504e8..9cff465a73 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -77,6 +77,17 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } +// SAXPY stands for โ€œSingle-precision A*X Plus Y" +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { + const size_t alignedN = n - n % vec_len; + const __m256 alpha = _mm256_set1_ps(a); + for (size_t 
i = 0; i < alignedN; i += vec_len) { + __m256 product = _mm256_mul_ps(_mm256_loadu_ps(&x[i]), alpha); + _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&y[i]), product)); + } + for (size_t i = alignedN; i < n; ++i) y[i] = a * x[i] + y[i]; +} + float_t l2_norm(size_t n, const float_t* in) { const size_t alignedN = n - n % vec_len; __m256 vsum = _mm256_set1_ps(0.0); @@ -97,6 +108,10 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } +float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { + for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +} + float_t l2_norm(size_t n, const float_t* a) { float_t sum = 0.0; for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 61114f0daf..b906702d9c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -281,12 +281,12 @@ __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } } -void set_gpu(const int N, const float_t alpha, float_t* Y) { +void set_gpu(const int n, const float_t alpha, float_t* Y) { if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * N)); + CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * n)); return; } - set_kernel<<>>(N, alpha, Y); + set_kernel<<>>(n, alpha, Y); CudaTest("solving set kernel failed"); } @@ -295,8 +295,8 @@ __global__ void add_scalar_kernel(const int n, const float_t alpha, CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } } -void add_scalar_gpu(const int N, const float_t alpha, float_t* Y) { - add_scalar_kernel<<>>(N, alpha, Y); +void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { + add_scalar_kernel<<>>(n, alpha, Y); CudaTest("solving add_scalar kernel failed"); } @@ -305,13 +305,23 @@ __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } } -void copy_gpu(int len, const float_t* in, float_t* out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); +void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { + vadd_kernel<<>>(n, a, b, y); + CudaTest("solving vadd kernel failed"); } -void vadd_gpu(const int N, const float_t* a, const float_t* b, float_t* y) { - vadd_kernel<<>>(N, a, b, y); - CudaTest("solving vadd kernel failed"); +__global__ void axpy_kernel(const int n, const float_t a, const float_t* x, + float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] = a * x[i] + y[i]; } +} + +void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { + axpy_kernel<<>>(n, a, x, y); + CudaTest("solving axpy kernel failed"); +} + +void copy_gpu(int len, const float_t* in, float_t* out) { + CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } // TODO: use warp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 91e39affeb..58fe59312e 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -80,11 +80,11 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, num_layers = num_conv_layers + 1; // initialize feature metadata feature_dims.resize(num_layers + 1); - feature_dims[0] = - context->read_features(dataset_str); // input feature dimension: D - feature_dims[1] = hidden1; // hidden1 level embedding: 16 - feature_dims[2] = num_classes; // output embedding: E - 
feature_dims[3] = num_classes; // normalized output embedding: E + feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY @@ -247,11 +247,27 @@ acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { return loss; } +void Net::bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } +} + +void Net::update_weights(optimizer* opt) { + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + void Net::construct_layers() { + // append conv layers std::cout << "\nConstructing layers...\n"; - append_conv_layer(0, true); // first conv layer - append_conv_layer(1); // hidden1 layer - append_out_layer(2); // output layer + for (size_t i = 0; i < num_conv_layers-1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + append_out_layer(num_layers-1); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data context->norm_factor_counting(); set_contexts(); From d38dba1823f49d80c0ae14d80e9da358b6759147 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 24 Apr 2020 16:31:11 -0500 Subject: [PATCH 189/660] fix weight decay --- .../include/deepgalois/layers/layer.h | 8 +++- .../include/deepgalois/math_functions.hh | 2 +- libdeepgalois/include/deepgalois/net.h | 7 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 +-- libdeepgalois/src/layers/graph_conv_layer.cu | 3 +- libdeepgalois/src/math_functions.cpp | 28 +++++++++---- libdeepgalois/src/math_functions.cu | 41 ++++++++++++------- libdeepgalois/src/net.cpp | 20 +++++++++ 8 files changed, 78 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 188feebe75..17ab4e6694 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -79,6 +79,11 @@ class layer : public deepgalois::node { std::string get_name() { return name_; } mask_t* get_device_masks() { return d_masks_; } + float_t* get_weights_ptr() { return &W[0]; } + float_t* get_weights_device_ptr() { return d_W; } + float_t* get_grads_ptr() { return &weight_grad[0]; } + float_t* get_grads_device_ptr() { return d_weight_grad; } + //! debug print function void print_layer_info(); virtual void set_sample_mask(size_t sample_begin, size_t sample_end, @@ -126,16 +131,15 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { + // std::cout << name_ << ": weight updating ... "; // vec_t diff; // prev()->merge_grads(&diff); #ifdef CPU_ONLY - // std::cout << name_ << ": weight updating ... "; // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
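// Editorial note (not part of the original patch): opt->update applies the chosen
// optimizer's rule elementwise to W given weight_grad; a plain gradient-descent step,
// for example, would compute W[i] -= learning_rate * weight_grad[i], while adaptive
// rules (e.g. Adam-style updates) would also track per-weight running moments.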
bool parallel = (W.size() >= 512); opt->update(layer::weight_grad, layer::W, parallel); // W += grad #else - //std::cout << name_ << ": "; opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif // prev()->clear_grads(); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index edd7fc6eb6..e33345793b 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -27,7 +27,7 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y); +void axpy(size_t n, const float_t a, float_t *x, float_t *y); //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index e4016231d4..69355ee6b2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -67,10 +67,9 @@ class Net { } void bprop(); // back propogation - void normalize(); - void regularize(); - // update trainable weights after back-propagation - void update_weights(optimizer* opt); + void normalize(); // Scale gradient to counterbalance accumulation + void regularize(); // add weight decay + void update_weights(optimizer* opt); // update trainable weights after back-propagation protected: bool is_single_class; // single-class (one-hot) or multi-class label diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b640acd75a..3233cd0bc6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -125,11 +125,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - acc_t loss = 0.0; - for (size_t i = 0; i < y*z; i+=z) { - loss += math::l2_norm(z, &layer::W[i]); - } - return loss; + return math::l2_norm(y*z, &layer::W[0]); } #endif // end if CPU_ONLY diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 322500d916..28e6002279 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -88,8 +88,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - acc_t loss = l2_norm_gpu(y*z, d_W); - return loss; + return l2_norm_gpu(y*z, d_W); } } // namespace diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9cff465a73..0cc7812e9e 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -78,7 +78,8 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } // SAXPY stands for โ€œSingle-precision A*X Plus Y" -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { +/* +void axpy(size_t n, const float_t a, float_t *x, float_t *y) { const size_t alignedN = n - n % vec_len; const __m256 alpha = _mm256_set1_ps(a); for (size_t i = 0; i < alignedN; i += vec_len) { @@ -96,8 +97,9 @@ float_t l2_norm(size_t n, const float_t* in) { vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, 
a)); } __m256 sum = _mm256_hadd_ps(vsum, vsum); - return ((float_t*)&sum)[0] + ((float_t*)&sum)[2];; + return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; } +*/ #else // vector multiply scalar void mul_scalar(const float_t alpha, vec_t& Y) { @@ -108,16 +110,24 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } -float_t axpy(size_t n, const float_t a, float_t *x, float_t *y) { - for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +//void axpy(size_t n, const float_t a, float_t *x, float_t *y) { +// for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; +//} + +//float_t l2_norm(size_t n, const float_t* a) { +// float_t sum = 0.0; +// for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; +// return sum / 2.0; +//} +#endif + +void axpy(size_t n, const float_t a, float_t *x, float_t *y) { + cblas_saxpy(n, a, x, 1, y, 1); } -float_t l2_norm(size_t n, const float_t* a) { - float_t sum = 0.0; - for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; - return sum/2.0; +float_t l2_norm(size_t n, const float_t* x) { + return cblas_snrm2(n, x, 1); } -#endif // dot product float_t dot(const vec_t& x, const vec_t& y) { diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index b906702d9c..0bef3a47d3 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -278,21 +278,20 @@ void scale_gpu(const int n, const float alpha, const float* x, float* y) { } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] = alpha; } + CUDA_KERNEL_LOOP(i, n) { y[i] = alpha; } } -void set_gpu(const int n, const float_t alpha, float_t* Y) { +void set_gpu(const int n, const float_t alpha, float_t* y) { if (alpha == 0) { - CUDA_CHECK(cudaMemset(Y, 0, sizeof(float_t) * n)); + CUDA_CHECK(cudaMemset(y, 0, sizeof(float_t) * n)); return; } - set_kernel<<>>(n, alpha, Y); + set_kernel<<>>(n, alpha, y); CudaTest("solving set kernel failed"); } -__global__ void add_scalar_kernel(const int n, const float_t alpha, - float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] += alpha; } +__global__ void add_scalar_kernel(const int n, const float_t a, float_t* y) { + CUDA_KERNEL_LOOP(i, n) { y[i] += a; } } void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { @@ -302,7 +301,7 @@ void add_scalar_gpu(const int n, const float_t alpha, float_t* Y) { __global__ void vadd_kernel(const int n, const float_t* a, const float_t* b, float_t* y) { - CUDA_KERNEL_LOOP(index, n) { y[index] = a[index] + b[index]; } + CUDA_KERNEL_LOOP(i, n) { y[i] = a[i] + b[i]; } } void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* y) { @@ -316,10 +315,29 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, } void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { - axpy_kernel<<>>(n, a, x, y); + //axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK(cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { + CUDA_KERNEL_LOOP(i, n) { + float_t product = a[i] * a[i]; + atomicAdd(sum, product); + } +} + +acc_t l2_norm_gpu(int n, float_t* x) { + float_t sum = 0.0; + CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + //float_t *d_sum; + //CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + //CUDA_CHECK(cudaMemcpy(d_sum, &sum, 
sizeof(acc_t), cudaMemcpyHostToDevice)); + //l2_norm_kernel<<>>(n, x, d_sum); + //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), cudaMemcpyDeviceToHost)); + return (acc_t)sum / 2.0; +} + void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } @@ -693,8 +711,3 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* return *(total_loss.cpu_rd_ptr()) / count; } -acc_t l2_norm_gpu(int n, float_t * tensor) { - acc_t sum = 0.0; - return sum / 2.0; -} - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 58fe59312e..1428c7508d 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -253,7 +253,27 @@ void Net::bprop() { } } +// Scale gradient to counterbalance accumulation +void Net::normalize() { +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; +#ifdef CPU_ONLY + // TODO: parallel + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); +#else + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +#endif +} + void Net::update_weights(optimizer* opt) { + normalize(); + regularize(); for (size_t i = 0; i < num_layers; i++) { if (layers[i]->trainable()) { layers[i]->update_weight(opt); From 725f7b91b51c413cc6c04f2ec106ab50a4b9c9b7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 25 Apr 2020 16:16:55 -0500 Subject: [PATCH 190/660] add l2_norm_layer --- libdeepgalois/CMakeLists.txt | 4 ++ .../include/deepgalois/layers/l2_norm_layer.h | 20 ++++++ .../include/deepgalois/math_functions.hh | 4 +- libdeepgalois/include/deepgalois/net.h | 26 ++++---- libdeepgalois/src/layers/l2_norm_layer.cpp | 54 +++++++++++++++ libdeepgalois/src/layers/l2_norm_layer.cu | 18 +++++ libdeepgalois/src/math_functions.cu | 9 ++- libdeepgalois/src/net.cpp | 35 ++++++++-- libdeepgalois/src/net.cu | 2 +- libdeepgalois/src/utils.cpp | 66 ++++++++++--------- lonestargnn/gcn/gcn.cpp | 3 +- lonestargnn/include/lonestargnn.h | 4 +- 12 files changed, 191 insertions(+), 54 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/layers/l2_norm_layer.h create mode 100644 libdeepgalois/src/layers/l2_norm_layer.cpp create mode 100644 libdeepgalois/src/layers/l2_norm_layer.cu diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 193988f414..3f592f0d18 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -42,6 +42,7 @@ else() src/layers/softmax_loss_layer.cu src/layers/sigmoid_loss_layer.cu src/layers/leaky_relu_layer.cu + src/layers/l2_norm_layer.cu src/layers/relu_layer.cu src/layers/aggregator.cu src/math_functions.cu @@ -64,12 +65,14 @@ set(sources src/layers/sigmoid_loss_layer.cpp src/layers/graph_conv_layer.cpp src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp + src/sampler.cpp src/utils.cpp src/node.cpp src/net.cpp @@ -80,6 +83,7 @@ set(sources src/layers/sigmoid_loss_layer.cpp src/layers/graph_conv_layer.cpp src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h new file 
mode 100644 index 0000000000..b15c1ae671 --- /dev/null +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -0,0 +1,20 @@ +#pragma once +#include "layer.h" + +namespace deepgalois { +// L2 Normalization Layer +class l2_norm_layer : public layer { +public: + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims); + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : + l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + ~l2_norm_layer() {} + std::string layer_type() const override { return std::string("l2_norm"); } + virtual void forward_propagation(const float_t* in_data, float_t* out_data); + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad); +protected: + float_t epsilon_; + float_t scale_; +}; +} // namespace diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index e33345793b..5611caaa94 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -171,5 +171,7 @@ void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t l2_norm_gpu(int n, float_t *tensor); +acc_t l2_norm_gpu(int n, const float_t *in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t *in_diff, float_t *out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 69355ee6b2..a3fa9d0dee 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -26,7 +26,8 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: - Net() : is_single_class(true), num_samples(0), num_classes(0), + Net() : is_single_class(true), has_l2norm(false), has_dense(false), + num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), @@ -35,20 +36,26 @@ class Net { train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool is_single, Graph* dGraph); + bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } + void construct_layers(); void append_out_layer(size_t layer_id); + void append_l2norm_layer(size_t layer_id); + void append_dense_layer(size_t layer_id); + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true); //! Add a convolution layer to the network + void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference void read_test_masks(std::string dataset, Graph* dGraph); acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation - - //! 
Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true); + void bprop(); // back propogation + void normalize(); // Scale gradient to counterbalance accumulation + void regularize(); // add weight decay + void update_weights(optimizer* opt); // update trainable weights after back-propagation //! Save the context object to all layers of the network void set_contexts() { @@ -66,13 +73,10 @@ class Net { layers[i]->print_layer_info(); } - void bprop(); // back propogation - void normalize(); // Scale gradient to counterbalance accumulation - void regularize(); // add weight decay - void update_weights(optimizer* opt); // update trainable weights after back-propagation - protected: bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp new file mode 100644 index 0000000000..46379aed60 --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -0,0 +1,54 @@ +#include "deepgalois/layers/l2_norm_layer.h" + +namespace deepgalois { + +l2_norm_layer::l2_norm_layer(unsigned level, float_t eps, float_t scale, + dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); +} + +#ifdef CPU_ONLY +void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { + //for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, galois::loopname("l2_norm")); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { + //for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, galois::loopname("d_l2_norm")); +} +#endif + +} // namespace diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu new file mode 100644 index 0000000000..56128eb0d3 --- /dev/null +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -0,0 +1,18 @@ +#include "deepgalois/layers/l2_norm_layer.h" + +namespace deepgalois { + +void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + 
l2_norm_gpu(x, y, in_data, out_data); +} + +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); +} + +} // namespace diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 0bef3a47d3..c1746d9075 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -327,7 +327,7 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { } } -acc_t l2_norm_gpu(int n, float_t* x) { +acc_t l2_norm_gpu(int n, const float_t* x) { float_t sum = 0.0; CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); //float_t *d_sum; @@ -338,6 +338,13 @@ acc_t l2_norm_gpu(int n, float_t* x) { return (acc_t)sum / 2.0; } +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out) { +} + +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, + float_t *in_diff, float_t *out_diff) { +} + void copy_gpu(int len, const float_t* in, float_t* out) { CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1428c7508d..c5ef556032 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,13 +8,16 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool is_single, Graph* dGraph) { + bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph) { + assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; learning_rate = lr; dropout_rate = dropout; weight_decay = wd; - is_single_class = is_single; + is_single_class = single; + has_l2norm = l2norm; + has_dense = dense; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -23,7 +26,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - context->set_label_class(is_single); + context->set_label_class(is_single_class); num_samples = context->read_graph(dataset_str, selfloop); #else context = new deepgalois::DistContext(); @@ -78,13 +81,19 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, // train count and val count are LOCAL counts num_layers = num_conv_layers + 1; + if (has_l2norm) num_layers ++; + if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); #ifndef CPU_ONLY @@ -287,12 +296,24 @@ void Net::construct_layers() { 
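// Illustrative sketch (added comment, not from the original patch): assuming
// num_conv_layers == 2 and add_l2norm enabled (add_dense disabled), the code below builds
//   layer 0: graph_conv (act) -> layer 1: graph_conv -> layer 2: l2_norm -> layer 3: output loss (softmax or sigmoid)
// i.e. the first num_conv_layers-1 conv layers use activation and the last conv layer does not.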
for (size_t i = 0; i < num_conv_layers-1; i++) append_conv_layer(i, true); // conv layers, act=true append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + if (has_l2norm) + append_l2norm_layer(num_conv_layers); // l2_norm layer + if (has_dense) + append_dense_layer(num_layers-2); // dense layer append_out_layer(num_layers-1); // output layer layers[0]->set_in_data(context->get_in_ptr()); // feed input data context->norm_factor_counting(); set_contexts(); } +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { +} + //! Add an output layer to the network void Net::append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer @@ -405,7 +426,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { - auto preds = layers[num_conv_layers - 1]->next()->get_data(); + auto preds = layers[num_conv_layers]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index c7acda5666..d46b807711 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -130,7 +130,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers - 1]->next()->get_data(), + layers[num_conv_layers]->next()->get_data(), context->get_labels_device_ptr()); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 46470e2997..b2b65c9582 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -13,49 +13,53 @@ const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pub // The formula for the F1 score is: // F1 = 2 * (precision * recall) / (precision + recall) // where precision = TP / (TP + FP), recall = TP / (TP + FN) -// TP: true positive; FP: false positive; FN: false negtive. +// TP: true positive; FP: false positive; FN: false negative. // In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. 
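// Worked example (added for clarity; the counts are hypothetical): two classes with
// per-class (TP, FP, FN) = (8, 2, 0) and (1, 0, 9) give per-class F1 of 0.889 and 0.182,
// so f1_macro = (0.889 + 0.182) / 2 ~= 0.54; pooling the counts first gives
// precision_mic = 9/11, recall_mic = 9/18 and f1_micro = 2*P*R/(P+R) ~= 0.62.
// The function below returns the micro-averaged score and prints both.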
// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, // http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) // and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred) { - float beta = 1.0; - std::vector true_positive(num_classes, 0); - std::vector false_positive(num_classes, 0); - std::vector false_negtive(num_classes, 0); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { - if (masks[i] == 1) { - for (size_t j = 0; j < num_classes; j++) { - auto idx = i * num_classes + j; + double precision_cls(0.), recall_cls(0.), f1_accum(0.); + int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + for (size_t col = 0; col < num_classes; col++) { + int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + for (size_t row = begin; row < end; row ++) { + //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { + if (masks[row] == 1) { + auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { - __sync_fetch_and_add(&true_positive[j], 1); + //__sync_fetch_and_add(&tp_cls, 1); + tp_cls += 1; } else if (ground_truth[idx] == 0 && pred[idx] > 0.5) { - __sync_fetch_and_add(&false_positive[j], 1); + //__sync_fetch_and_add(&fp_cls, 1); + fp_cls += 1; } else if (ground_truth[idx] == 1 && pred[idx] <= 0.5) { - __sync_fetch_and_add(&false_negtive[j], 1); + //__sync_fetch_and_add(&fn_cls, 1); + fn_cls += 1; + } else if (ground_truth[idx] == 0 && pred[idx] <= 0.5) { + //__sync_fetch_and_add(&tn_cls, 1); + tn_cls += 1; } } - } - }, galois::loopname("MaskedF1Score")); - acc_t pNumerator = 0.0; - acc_t pDenominator = 0.0; - acc_t rNumerator = 0.0; - acc_t rDenominator = 0.0; - for (size_t i = 0; i < num_classes; i++) { - acc_t fn = (acc_t)false_negtive[i]; // false negtive - acc_t fp = (acc_t)false_positive[i]; // false positive - acc_t tp = (acc_t)true_positive[i]; // true positive - pNumerator = pNumerator + tp; - pDenominator = pDenominator + (tp + fp); - rNumerator = rNumerator + tp; - rDenominator = rDenominator + (tp + fn); + } + //}, galois::loopname("MaskedF1Score")); + tp_accum += tp_cls; + fn_accum += fn_cls; + fp_accum += fp_cls; + tn_accum += tn_cls; + precision_cls = tp_cls + fp_cls > 0 ? (double)tp_cls/(double)(tp_cls+fp_cls) : 0.; + recall_cls = tp_cls+fn_cls > 0 ? (double)tp_cls/(double)(tp_cls+fn_cls) : 0.; + f1_accum += recall_cls+precision_cls > 0. ? 2.*(recall_cls*precision_cls)/(recall_cls+precision_cls) : 0.; } - auto recallMicro = rNumerator / rDenominator; - acc_t precisionMicro = pNumerator / pDenominator; - auto fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / - ((beta * beta) * precisionMicro + recallMicro); - return fscoreMicro; + double f1_macro = f1_accum/(double)num_classes; + //double accuracy_mic = (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum+fp_accum > 0 ? (double)tp_accum/(double)(tp_accum+fp_accum) : 0.; + double recall_mic = tp_accum+fn_accum > 0 ? (double)tp_accum/(double)(tp_accum+fn_accum) : 0.; + double f1_micro = recall_mic+precision_mic > 0. ? 
2.*(recall_mic*precision_mic)/(recall_mic+precision_mic) : 0.; + std::cout << std::setprecision(3) << std::fixed << + " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + return f1_micro; } #ifndef GALOIS_USE_DIST diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 1a3698bc96..4a6a06639a 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -26,7 +26,8 @@ int main(int argc, char** argv) { // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, - dropout_rate, weight_decay, add_selfloop, is_single_class, dGraph); + dropout_rate, weight_decay, add_selfloop, + is_single_class, add_l2norm, add_dense, dGraph); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 1c96548a36..e2191fb7a1 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -28,7 +28,7 @@ static cll::opt hidden1("h", cll::desc("Number of units in hidden layer 1 (default value 16)"), cll::init(16)); static cll::opt learning_rate("lr", cll::desc("Initial learning rate (default value 0.01)"), cll::init(0.01)); -static cll::opt dropout_rate("d", +static cll::opt dropout_rate("dr", cll::desc("Dropout rate (1 - keep probability) (default value 0.5)"), cll::init(0.5)); static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); @@ -41,6 +41,8 @@ static cll::opt is_single_class("sc", static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); +static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); +static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); //! 
standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 707a773285e3ffbff0d2ed1264971300ff2725ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 25 Apr 2020 20:49:51 -0500 Subject: [PATCH 191/660] update sampler --- libdeepgalois/include/deepgalois/context.h | 6 ++- .../include/deepgalois/layers/layer.h | 4 +- .../deepgalois/layers/sigmoid_loss_layer.h | 1 + .../deepgalois/layers/softmax_loss_layer.h | 5 +-- libdeepgalois/include/deepgalois/net.h | 15 ++++++- libdeepgalois/include/deepgalois/sampler.h | 2 +- .../src/layers/sigmoid_loss_layer.cpp | 9 +++- .../src/layers/sigmoid_loss_layer.cu | 6 ++- .../src/layers/softmax_loss_layer.cpp | 9 +++- .../src/layers/softmax_loss_layer.cu | 6 ++- libdeepgalois/src/net.cpp | 43 ++++++++++++++++++- libdeepgalois/src/sampler.cpp | 4 +- lonestargnn/gcn/gcn.cpp | 3 +- lonestargnn/include/lonestargnn.h | 4 +- 14 files changed, 95 insertions(+), 22 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e40b6a6371..786fc48d5d 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -30,6 +30,7 @@ class Context { label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label label_t* get_labels_ptr() { return labels; } + label_t* get_labels_subg_ptr() { return labels_subg; } label_t* get_labels_device_ptr() { return d_labels; } float_t* get_in_ptr(); float_t* get_norm_factor() { return norm_factor; } @@ -40,10 +41,12 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N + Graph* subgraph_cpu; void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getCpuGraphPointer(); + Graph* getCpuSubgraphPointer() { return subgraph_cpu; }; #else CSRGraph graph_gpu; // the input graph, |V| = N inline static cublasHandle_t cublas_handle() { return cublas_handle_; } @@ -59,7 +62,8 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - label_t *labels; // labels for classification: N x 1 + label_t *labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D float_t* norm_factor; // normalization constant based on graph structure label_t* d_labels; // labels on device diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 17ab4e6694..a18802f198 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -52,7 +52,7 @@ class layer : public deepgalois::node { std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims) { + output_dims(out_dims), labels(NULL) { add_edge(); } virtual ~layer() = default; @@ -72,6 +72,7 @@ class layer : public deepgalois::node { // is this layer trainable? 
void set_trainable(bool trainable) { trainable_ = trainable; } + void set_labels_ptr(label_t *ptr) { labels = ptr; } bool trainable() const { return trainable_; } // name metadata @@ -166,6 +167,7 @@ class layer : public deepgalois::node { mask_t* d_masks_; float_t* loss; // error for each vertex: N x 1 ContextType* context; + label_t* labels; #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 0f46cde043..760b6f0ab1 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -10,6 +10,7 @@ class sigmoid_loss_layer : public layer { std::string layer_type() const override { return std::string("sigmoid_loss"); } + inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 1a5b7e86ee..060698e3d9 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -7,9 +7,8 @@ class softmax_loss_layer : public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer(); - std::string layer_type() const override { - return std::string("softmax_loss"); - } + std::string layer_type() const override { return std::string("softmax_loss"); } + inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index a3fa9d0dee..0cd94adc05 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -8,10 +8,12 @@ #include "galois/Timer.h" #include "deepgalois/types.h" #include "deepgalois/gtypes.h" +#include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#include "deepgalois/sampler.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -27,6 +29,7 @@ namespace deepgalois { class Net { public: Net() : is_single_class(true), has_l2norm(false), has_dense(false), + neighbor_sample_size(0), subgraph_sample_size(0), num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), @@ -36,7 +39,9 @@ class Net { train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph); + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sample_size = 0, unsigned subg_sample = 0, + Graph* dGraph = NULL); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return 
num_samples; } @@ -77,6 +82,8 @@ class Net { bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -95,16 +102,20 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* test_masks; // masks for test mask_t* d_test_masks; // masks for test on device + mask_t* subgraph_masks; // masks for subgraph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network + Sampler *sampler; #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; #endif - // comparing outputs with the ground truth (labels) + void lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels); + #ifdef CPU_ONLY + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); #else diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 8842f0e442..900ff1de2e 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -8,7 +8,7 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sampler(Graph &g, Graph &sg, size_t n); + void subgraph_sample(size_t n, Graph &g, Graph &sg, VertexList &vertex_set, mask_t *masks); // !API function for user-defined selection strategy virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 763bd6646d..b94cd83e14 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -16,6 +16,11 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { delete loss; } +inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { + return context->get_label(i, j); + //return labels(i*input_dims[1]+j); +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { @@ -25,7 +30,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid // one hot encoded vector for the labels float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); // loss calculation loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); delete ground_truth; @@ -41,7 +46,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)context->get_label(i, j); + for (size_t j = 0; j < len; j++) ground_truth[j] = 
(float_t)get_label(i, j); // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index c52b9089f0..e5adbcfc6f 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -19,16 +19,18 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + //label_t *labels = context->get_labels_device_ptr(); init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->get_labels_device_ptr(), loss, out_data); + d_masks_, labels, loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + //label_t *labels = context->get_labels_device_ptr(); d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->get_labels_device_ptr(), out_data, in_grad); + labels, out_data, in_grad); } acc_t sigmoid_loss_layer::get_prediction_loss() { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4a92e56ec3..0428f248b2 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -16,6 +16,11 @@ softmax_loss_layer::~softmax_loss_layer() { delete loss; } +inline label_t softmax_loss_layer::get_label(size_t i) { + //return labels[i]; + return context->get_label(i); +} + // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] void softmax_loss_layer::forward_propagation(const float_t* in_data, @@ -27,7 +32,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // one hot encoded vector for the labels std::vector groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[context->get_label(i)] = 1.0; // one-hot + groundTruth[get_label(i)] = 1.0; // one-hot // loss calculation loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); } @@ -46,7 +51,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, if (masks_[i] == 1) { // masked vec_t norm_grad(len); std::vector groundTruth(len, 0.0); - groundTruth[context->get_label(i)] = 1.0; + groundTruth[get_label(i)] = 1.0; // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); // derviative softmax to gradient used in the next layer diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index e73ef27f33..5e9a573abe 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -20,15 +20,17 @@ softmax_loss_layer::~softmax_loss_layer() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); + //label_t *labels = context->get_labels_device_ptr(); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, context->get_labels_device_ptr(), loss, out_data); + d_masks_, labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* 
out_data, float_t* out_grad, float_t* in_grad) { + //label_t *labels = context->get_labels_device_ptr(); d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - context->get_labels_device_ptr(), out_data, in_grad); + labels, out_data, in_grad); } acc_t softmax_loss_layer::get_prediction_loss() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c5ef556032..596aadac04 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -8,7 +8,8 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, Graph* dGraph) { + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, Graph* dGraph) { assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; @@ -18,6 +19,8 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, is_single_class = single; has_l2norm = l2norm; has_dense = dense; + neighbor_sample_size = neigh_sz; + subgraph_sample_size = subg_sz; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -28,6 +31,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, context = new deepgalois::Context(); context->set_label_class(is_single_class); num_samples = context->read_graph(dataset_str, selfloop); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else context = new deepgalois::DistContext(); num_samples = dGraph->size(); @@ -103,6 +107,9 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } +void Net::lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels) { +} + void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -118,6 +125,11 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); double total_train_time = 0.0; + int num_subg_remain = 0; + if (subgraph_sample_size) { + subgraph_masks = new mask_t[num_samples]; + std::copy(train_masks, train_masks+num_samples, subgraph_masks); + } Timer t_epoch; // run epochs @@ -125,6 +137,15 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); t_epoch.Start(); + if (subgraph_sample_size && num_subg_remain == 0) { +#ifdef CPU_ONLY + VertexList vertices; + sampler->subgraph_sample(subgraph_sample_size, *(context->getCpuGraphPointer()), + *(context->getCpuSubgraphPointer()), vertices, subgraph_masks); + lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); +#endif + num_subg_remain += 1; // num_threads + } // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -200,6 +221,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { end = train_end; count = train_count; masks = train_masks; + if (subgraph_sample_size) masks = subgraph_masks; } else if (type == "val") { begin = val_begin; end = val_end; @@ -308,10 +330,24 @@ void Net::construct_layers() { //! 
Add an l2_norm layer to the network void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); } //! Add an dense layer to the network void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); } //! Add an output layer to the network @@ -325,6 +361,11 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); +#ifdef CPU_ONLY + layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); +#else + layers[layer_id]->set_labels_ptr(context->get_labels_device_ptr()); +#endif connect(layers[layer_id - 1], layers[layer_id]); } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index fdfb9802cf..3b3ae84b85 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -114,8 +114,8 @@ void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t } } -void Sampler::subgraph_sampler(Graph &g, Graph&sg, size_t n) { - VertexList vertex_set(n); +void Sampler::subgraph_sample(size_t n, Graph &g, Graph&sg, VertexList &vertex_set, mask_t *masks) { + vertex_set.resize(n); select_vertices(g, vertex_set, n, m); generate_subgraph(vertex_set, g, sg); } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 4a6a06639a..ba9cbe3529 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -27,7 +27,8 @@ int main(int argc, char** argv) { // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, - is_single_class, add_l2norm, add_dense, dGraph); + is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, dGraph); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index e2191fb7a1..cdfda9eba0 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -34,8 +34,6 @@ static cll::opt weight_decay("wd", cll::desc("Weight for L2 loss on embedding matrix (default value 5e-4)"), cll::init(5e-4)); static cll::opt early_stopping("es", cll::desc("Tolerance for early stopping (# of epochs) (default value 10)"), cll::init(10)); -static cll::opt max_degree("md", - cll::desc("Maximum size of the downsampled adjacency lists (default value 25)"), cll::init(25)); static cll::opt is_single_class("sc", cll::desc("single-class or multi-class label (default single)"), cll::init(1)); static cll::opt do_validate("dv", cll::desc("enable validation"), cll::init(1)); @@ -43,6 +41,8 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", 
cll::desc("add an dense layer"), cll::init(0)); +static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); +static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); //! standard global options to the benchmarks extern llvm::cl::opt skipVerify; From 22efddfcf0cad139a7eb660dc95bb7b7735ba19e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 26 Apr 2020 16:41:35 -0500 Subject: [PATCH 192/660] refine --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 33 +++--- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 66 ++++++------ libdeepgalois/include/deepgalois/lgraph.h | 100 ------------------ libdeepgalois/src/context.cpp | 88 ++++++++++----- libdeepgalois/src/context.cu | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 27 +++-- libdeepgalois/src/layers/graph_conv_layer.cu | 8 +- .../src/layers/sigmoid_loss_layer.cpp | 4 +- .../src/layers/sigmoid_loss_layer.cu | 2 - .../src/layers/softmax_loss_layer.cpp | 4 +- .../src/layers/softmax_loss_layer.cu | 2 - libdeepgalois/src/net.cpp | 28 +---- libdeepgalois/src/net.cu | 39 +++++-- 15 files changed, 174 insertions(+), 232 deletions(-) delete mode 100644 libdeepgalois/include/deepgalois/lgraph.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 37e2eea372..704247d54b 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -39,7 +39,7 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getCpuGraphPointer() { + Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 786fc48d5d..ea2b5f2156 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -8,7 +8,6 @@ #include "deepgalois/types.h" #include "deepgalois/utils.h" #ifdef CPU_ONLY -#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -28,32 +27,35 @@ class Context { size_t read_features(std::string dataset_str, std::string filetype = "bin"); label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label - label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label - label_t* get_labels_ptr() { return labels; } - label_t* get_labels_subg_ptr() { return labels_subg; } - label_t* get_labels_device_ptr() { return d_labels; } - float_t* get_in_ptr(); - float_t* get_norm_factor() { return norm_factor; } + //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + float_t* get_norm_factor_ptr() { return norm_factor; } + void set_label_class(bool is_single = true) { is_single_class = is_single; } void copy_data_to_device(); // copy labels and input features void norm_factor_counting(); - void set_label_class(bool is_single = true) { is_single_class = is_single; } #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; - void genGraph(LGraph& lg, Graph& g); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph - Graph* getCpuGraphPointer(); - Graph* getCpuSubgraphPointer() { return subgraph_cpu; }; + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer() { return subgraph_cpu; }; + float_t* get_in_ptr() { return h_feats; } + label_t* get_labels_ptr() { return labels; } + label_t* get_labels_subg_ptr() { return labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N + CSRGraph subgraph_gpu; + CSRGraph* getGraphPointer() { return &graph_gpu; } + CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; + float_t* get_in_ptr() { return d_feats; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } - CSRGraph* getGpuGraphPointer() { return &graph_gpu; } #endif protected: @@ -67,12 +69,17 @@ class Context { float_t* h_feats; // input features: N x D float_t* norm_factor; // normalization constant based on graph structure label_t* d_labels; // labels on device + label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device -#ifndef CPU_ONLY + +#ifdef CPU_ONLY + void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); +#else static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE static curandGenerator_t curand_generator_; // used to generate random numbers on GPU #endif }; + } // end deepgalois namespace diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 63062133df..eb42fe1093 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,7 @@ class graph_conv_layer : public layer { void init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor(); } + void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! and save result to out_data diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a18802f198..0e94a53d49 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -57,36 +57,38 @@ class layer : public deepgalois::node { } virtual ~layer() = default; virtual std::string layer_type() const = 0; - virtual void set_netphase(deepgalois::net_phase phase) {} - //! save context - virtual void set_context(ContextType* ctx) { context = ctx; } - //! return layer loss + void print_layer_info(); //! 
debug print function + + // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } virtual acc_t get_weight_decay_loss() { return acc_t(0); } - - // main functions for layer work - virtual void forward_propagation(const float_t* in_data, - float_t* out_data) = 0; - virtual void back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) = 0; - - // is this layer trainable? - void set_trainable(bool trainable) { trainable_ = trainable; } - void set_labels_ptr(label_t *ptr) { labels = ptr; } bool trainable() const { return trainable_; } - - // name metadata - void set_name(std::string name) { name_ = name; } std::string get_name() { return name_; } - mask_t* get_device_masks() { return d_masks_; } float_t* get_weights_ptr() { return &W[0]; } float_t* get_weights_device_ptr() { return d_W; } float_t* get_grads_ptr() { return &weight_grad[0]; } float_t* get_grads_device_ptr() { return d_weight_grad; } - //! debug print function - void print_layer_info(); + // set methods + virtual void set_netphase(deepgalois::net_phase phase) {} + virtual void set_context(ContextType* ctx) { context = ctx; } + void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? + void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_name(std::string name) { name_ = name; } // name metadata +#ifdef CPU_ONLY + void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } +#else + void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } +#endif + + //! set the data of the previous layer connected to this one + void set_in_data(float_t* data) { + prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_->set_data(data); + // no need to allocate memory for gradients, since this is the input layer. + } + virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { begin_ = sample_begin; @@ -99,22 +101,18 @@ class layer : public deepgalois::node { #endif } - //! set the data of the previous layer connected to this one - void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); - prev_->set_data(data); - // no need to allocate memory for gradients, since this is the input layer. - } - void add_edge() { // add an outgoing edge next_ = std::make_shared(this, output_dims[0], output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } - void alloc_grad() { - // allocate memory for intermediate gradients - } + + // main functions for layer work + virtual void forward_propagation(const float_t* in_data, + float_t* out_data) = 0; + virtual void back_propagation(const float_t* in_data, const float_t* out_data, + float_t* out_grad, float_t* in_grad) = 0; //! calls forward propagation using previous layer as input and writes //! to next layer as output @@ -132,9 +130,6 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { - // std::cout << name_ << ": weight updating ... "; - // vec_t diff; - // prev()->merge_grads(&diff); #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
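// A minimal sketch of how the reorganized layer interface above is driven end to end;
// `opt`, `layers`, and the loop bounds are illustrative, connect() is the free helper
// used by Net, and the member functions are the ones declared in this class:
//
//   // wiring: allocate intermediate buffers, chain consecutive layers, feed input features
//   for (size_t i = 0; i < num_layers; i++) layers[i]->add_edge();
//   for (size_t i = 1; i < num_layers; i++) connect(layers[i - 1], layers[i]);
//   layers[0]->set_in_data(context->get_in_ptr());
//
//   // one training step: forward pass, backward pass in reverse order, then weight update
//   for (size_t i = 0; i < num_layers; i++) layers[i]->forward();
//   for (size_t i = num_layers; i-- > 0; )  layers[i]->backward();
//   for (size_t i = 0; i < num_layers; i++) layers[i]->update_weight(opt);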
@@ -168,6 +163,11 @@ class layer : public deepgalois::node { float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; +#ifdef CPU_ONLY + Graph *graph_cpu; +#else + CSRGraph *graph_gpu; +#endif #ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h deleted file mode 100644 index 029d12d44b..0000000000 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ /dev/null @@ -1,100 +0,0 @@ -#ifndef __LGRAPH_HPP__ -#define __LGRAPH_HPP__ - -// defines the Learning Graph (LGraph) data structure -#include -#include - -namespace deepgalois { - -typedef unsigned IndexT; -typedef float ValueT; - -/** - * Learning graph. - * - * Provides basic accesors and such; nothing special. Just a CSR. - * Ultimatly becomes an LC_CSR. - * - * @todo remove this intermediate step if using edgelists - */ -class LGraph { -public: - LGraph() : directed_(false) {} - void clean() { - delete[] rowptr_; - delete[] colidx_; - } - bool directed() const { return directed_; } - size_t num_vertices() const { return num_vertices_; } - size_t num_edges() const { return num_edges_; } - IndexT* out_rowptr() const { return rowptr_; } - IndexT* out_colidx() const { return colidx_; } - unsigned out_degree(IndexT n) const { return rowptr_[n + 1] - rowptr_[n]; } - IndexT get_offset(IndexT n) { return rowptr_[n]; } - IndexT get_dest(IndexT n) { return colidx_[n]; } - - void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - num_vertices_ = m; - num_edges_ = 0; - std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector > vertices(m); - for (size_t i = 0; i < n; i++) { - std::set neighbors; - if (add_self_loop) neighbors.insert(i); - vertices.push_back(neighbors); - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - IndexT u, v; - edge_stream >> u; - edge_stream >> v; - vertices[u].insert(v); - if (symmetrize) vertices[v].insert(u); - } - in.close(); - for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); - std::cout << "num_edges " << num_edges_ << "\n"; - MakeCSR(vertices); - } - -private: - bool directed_; - size_t num_vertices_; - size_t num_edges_; - IndexT* rowptr_; - IndexT* colidx_; - - void MakeCSR(std::vector > vertices) { - std::vector degrees; - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i++) - degrees[i] = vertices[i].size(); - std::vector offsets(degrees.size() + 1); - IndexT total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - degrees.clear(); - assert(num_edges_ == offsets[num_vertices_]); - colidx_ = new IndexT[num_edges_]; - rowptr_ = new IndexT[num_vertices_ + 1]; - for (size_t i = 0; i < num_vertices_ + 1; i++) - rowptr_[i] = offsets[i]; - for (size_t i = 0; i < num_vertices_; i++) { - for (auto dst : vertices[i]) - colidx_[offsets[i]++] = dst; - } - } -}; - -} // namespace -#endif diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 98b3f7ed15..0fc3fe0a95 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. 
*/ - +#include #include "deepgalois/context.h" namespace deepgalois { @@ -30,10 +30,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); - LGraph lgraph; - lgraph.read_edgelist(filename.c_str(), true); // symmetrize - genGraph(lgraph, *graph_cpu); - lgraph.clean(); + read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); @@ -54,19 +51,6 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo return graph_cpu->size(); } -void Context::genGraph(LGraph& lg, Graph& g) { - g.allocateFrom(lg.num_vertices(), lg.num_edges()); - g.constructNodes(); - for (size_t i = 0; i < lg.num_vertices(); i++) { - g.getData(i) = 1; - auto row_begin = lg.get_offset(i); - auto row_end = lg.get_offset(i + 1); - g.fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset++) - g.constructEdge(offset, lg.get_dest(offset), 0); - } -} - void Context::add_selfloop(Graph &og, Graph &g) { g.allocateFrom(og.size(), og.size()+og.sizeEdges()); g.constructNodes(); @@ -99,12 +83,6 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -Graph* Context::getCpuGraphPointer() { - return Context::graph_cpu; -} - -float_t* Context::get_in_ptr() { return h_feats; } - void Context::norm_factor_counting() { norm_factor = new float_t[n]; galois::do_all(galois::iterate((size_t)0, n), @@ -115,6 +93,68 @@ void Context::norm_factor_counting() { else norm_factor[v] = 1.0 / temp; }, galois::loopname("NormCounting")); } + +void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, n; + in >> m >> n >> std::ws; + size_t num_vertices_ = m; + size_t num_edges_ = 0; + std::cout << "num_vertices " << num_vertices_ << "\n"; + std::vector > vertices(m); + for (size_t i = 0; i < n; i++) { + std::set neighbors; + if (add_self_loop) neighbors.insert(i); + vertices.push_back(neighbors); + } + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + VertexID u, v; + edge_stream >> u; + edge_stream >> v; + vertices[u].insert(v); + if (symmetrize) vertices[v].insert(u); + } + in.close(); + for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + std::cout << "num_edges " << num_edges_ << "\n"; + + std::vector degrees; + degrees.resize(num_vertices_); + std::fill(degrees.begin(), degrees.end(), 0); + for (size_t i = 0; i < num_vertices_; i++) + degrees[i] = vertices[i].size(); + std::vector offsets(degrees.size() + 1); + uint32_t total = 0; + for (size_t n = 0; n < degrees.size(); n++) { + offsets[n] = total; + total += degrees[n]; + } + offsets[degrees.size()] = total; + degrees.clear(); + assert(num_edges_ == offsets[num_vertices_]); + EdgeID *colidx_ = new EdgeID[num_edges_]; + VertexID *rowptr_ = new VertexID[num_vertices_ + 1]; + for (size_t i = 0; i < num_vertices_ + 1; i++) + rowptr_[i] = offsets[i]; + for (size_t i = 0; i < num_vertices_; i++) { + for (auto dst : vertices[i]) + colidx_[offsets[i]++] = dst; + } + + graph_cpu->allocateFrom(num_vertices_, num_edges_); + graph_cpu->constructNodes(); + for (size_t i = 0; i < num_vertices_; i++) { + auto row_begin = rowptr_[i]; + auto row_end = rowptr_[i+1]; + graph_cpu->fixEndEdge(i, row_end); + for (auto offset = 
row_begin; offset < row_end; offset++) + graph_cpu->constructEdge(offset, colidx_[offset], 0); + } +} + #endif // labels contain the ground truth (e.g. vertex classes) for each example diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 93300abffb..86ad9003bf 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -158,5 +158,4 @@ void Context::copy_data_to_device() { //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} -float_t* Context::get_in_ptr() { return d_feats; } } // namespace context diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3233cd0bc6..3c63468159 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -29,7 +29,7 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t *b = new float_t[len]; mvmul(n, len, &Q[0], self, a); mvmul(n, len, &W[0], neighbors, b); - deepgalois::math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } void graph_conv_layer::init() { @@ -65,12 +65,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) if (dropout_ && phase_ == deepgalois::net_phase::train) { - deepgalois::math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - } else deepgalois::math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); + math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); // aggregate based on graph topology - graph_conv_layer::aggregate(z, *(context->graph_cpu), out_temp, out_data); + graph_conv_layer::aggregate(z, *graph_cpu, out_temp, out_data); #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; @@ -79,7 +79,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ GraphConvSync>("AggSync"); #endif // run relu activation on output if specified - if (act_) deepgalois::math::relu_cpu(x*z, out_data, out_data); + if (act_) math::relu_cpu(x*z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -87,12 +87,12 @@ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { // note; assumption here is that out_grad contains 1s or 0s via relu? 
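// In matrix form, the backward pass below computes (x = #vertices, y = input feature
// length, z = output feature length; A is the normalized adjacency, symmetric here):
//
//   out_temp    = A * out_grad          // update_all(): aggregate gradients, x-by-z
//   in_grad     = out_temp * W^T        // sgemm(NoTrans, Trans): x-by-z times z-by-y -> x-by-y
//   weight_grad = in_data^T * out_temp  // sgemm(Trans, NoTrans): y-by-x times x-by-z -> y-by-z
//
// Because the graph is symmetric, A^T == A, so the same aggregation routine serves both
// the forward and the backward pass.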
- if (act_) deepgalois::math::d_relu_cpu(x*z, out_grad, out_data, out_grad); + if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, *(context->graph_cpu), out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + deepgalois::update_all(z, *graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; @@ -106,18 +106,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - deepgalois::math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, - out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, + out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y if (dropout_) { - deepgalois::math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, - in_grad); + math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); } } // calculate weight gradients using input data // multiplied by gradients from last back prop step - deepgalois::math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, - out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, + out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 28e6002279..ac29b73a7b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -58,9 +58,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ else copy_gpu(x*y, in_data, in_temp); if (y > z) { sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); - graph_conv_layer::aggregate(z, context->graph_gpu, out_temp, out_data); + graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); } else { - graph_conv_layer::aggregate(y, context->graph_gpu, in_temp, in_temp1); + graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); } if (act_) relu_gpu(x * z, out_data, out_data); @@ -72,14 +72,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, float_t* out_grad, float_t* in_grad) { if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { - graph_conv_layer::d_aggregate(z, context->graph_gpu, out_grad, out_temp); + graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); if (level_ != 0) sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); } else { if (level_ != 0) { sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); - graph_conv_layer::d_aggregate(y, context->graph_gpu, in_temp, in_grad); + graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); } sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); } diff --git 
a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index b94cd83e14..a5ec7eef49 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -17,8 +17,8 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { } inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { - return context->get_label(i, j); - //return labels(i*input_dims[1]+j); + //return context->get_label(i, j); + return labels[i*input_dims[1]+j]; } void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index e5adbcfc6f..1fcc55e207 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -19,7 +19,6 @@ sigmoid_loss_layer::~sigmoid_loss_layer() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - //label_t *labels = context->get_labels_device_ptr(); init_const_gpu(input_dims[0], 0.0, loss); sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, labels, loss, out_data); @@ -28,7 +27,6 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - //label_t *labels = context->get_labels_device_ptr(); d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, out_data, in_grad); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 0428f248b2..2f944656de 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -17,8 +17,8 @@ softmax_loss_layer::~softmax_loss_layer() { } inline label_t softmax_loss_layer::get_label(size_t i) { - //return labels[i]; - return context->get_label(i); + return labels[i]; + //return context->get_label(i); } // TODO: need kernel fusion optimization diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 5e9a573abe..3eb5065edd 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -20,7 +20,6 @@ softmax_loss_layer::~softmax_loss_layer() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - //label_t *labels = context->get_labels_device_ptr(); softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, labels, loss, out_data); } @@ -28,7 +27,6 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - //label_t *labels = context->get_labels_device_ptr(); d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, out_data, in_grad); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 596aadac04..d44c9b4632 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -140,8 +140,8 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY VertexList vertices; - sampler->subgraph_sample(subgraph_sample_size, *(context->getCpuGraphPointer()), - *(context->getCpuSubgraphPointer()), vertices, 
subgraph_masks); + sampler->subgraph_sample(subgraph_sample_size, *(context->getGraphPointer()), + *(context->getSubgraphPointer()), vertices, subgraph_masks); lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); #endif num_subg_remain += 1; // num_threads @@ -154,19 +154,6 @@ void Net::train(optimizer* opt, bool need_validate) { // for use during backprop Tfw.start(); double fw_time = evaluate("train", train_loss, train_acc); - /* - train_loss = Net::fprop(train_begin, train_end, train_count, train_masks); // forward -#ifdef CPU_ONLY - Graph *g = context->getCpuGraphPointer(); -#else - CSRGraph *g = context->getGpuGraphPointer(); -#endif - if (is_single_class) { - train_acc = masked_accuracy(train_begin, train_end, train_count, train_masks, g); // predict - } else { - train_acc = masked_multi_class_accuracy(train_begin, train_end, train_count, train_masks, g); // predict - } - */ Tfw.stop(); // backward: use intermediate features + ground truth to update layers @@ -244,11 +231,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); -#ifdef CPU_ONLY - Graph* g = context->getCpuGraphPointer(); -#else - CSRGraph* g = context->getGpuGraphPointer(); -#endif + auto g = context->getGraphPointer(); if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, g); } else { @@ -361,11 +344,7 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); -#ifdef CPU_ONLY layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); -#else - layers[layer_id]->set_labels_ptr(context->get_labels_device_ptr()); -#endif connect(layers[layer_id - 1], layers[layer_id]); } @@ -380,6 +359,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); + layers[layer_id]->set_graph_ptr(context->getGraphPointer()); if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index d46b807711..27c3ea5de8 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -53,7 +53,8 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, float_t* preds, label_t* labels, f1count_t* true_positive, f1count_t* false_positive, - f1count_t* false_negtive) { + f1count_t* false_negtive, + f1count_t* true_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { @@ -65,6 +66,8 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, atomicAdd(&false_positive[j], 1.0); } else if (labels[idx] == 1 && preds[idx] <= 0.5) { atomicAdd(&false_negtive[j], 1.0); + } else { + atomicAdd(&true_negtive[j], 1.0); } } } @@ -78,45 +81,63 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, f1count_t* h_tp = new f1count_t[num_classes]; f1count_t* h_fp = new f1count_t[num_classes]; f1count_t* h_fn = new f1count_t[num_classes]; - f1count_t* d_tp, *d_fp, *d_fn; + f1count_t* h_tn = new f1count_t[num_classes]; + f1count_t* d_tp, *d_fp, *d_fn, *d_tn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); + float_malloc_device(num_classes, d_tn); init_const_gpu(num_classes, 0.0, d_tp); init_const_gpu(num_classes, 
0.0, d_fp); init_const_gpu(num_classes, 0.0, d_fn); + init_const_gpu(num_classes, 0.0, d_tn); masked_f1_score_kernel<<>>( - num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn); + num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); CudaTest("solving masked_f1_score_kernel kernel failed"); CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); acc_t pNumerator = 0.0; acc_t pDenominator = 0.0; acc_t rNumerator = 0.0; acc_t rDenominator = 0.0; + acc_t precisionMacro = 0.0; + acc_t recallMacro = 0.0; for (size_t i = 0; i < num_classes; i++) { acc_t fn = (acc_t)h_fn[i]; // false negtive acc_t fp = (acc_t)h_fp[i]; // false positive acc_t tp = (acc_t)h_tp[i]; // true positive + acc_t tn = (acc_t)h_tn[i]; // true positive + + precisionMacro = precisionMacro + (tp / (tp + fp)); + recallMacro = recallMacro + (tp / (tp + fn)); pNumerator = pNumerator + tp; pDenominator = pDenominator + (tp + fp); rNumerator = rNumerator + tp; rDenominator = rDenominator + (tp + fn); } + precisionMacro = precisionMacro / num_classes; + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + ((beta * beta) * precisionMacro + recallMacro); acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - acc_t fscoreMicro = (((beta * beta) + 1) * precisionMicro * recallMicro) / - ((beta * beta) * precisionMicro + recallMicro); - + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + ((beta * beta) * precisionMicro + recallMicro); + std::cout << std::setprecision(3) << std::fixed << + " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); + float_free_device(d_tn); delete h_tp; delete h_fp; delete h_fn; - return fscoreMicro; + delete h_tn; + return f1_micro; } namespace deepgalois { @@ -124,14 +145,14 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *g) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers - 1]->next()->get_data(), - context->get_labels_device_ptr()); + context->get_labels_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph* g) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers]->next()->get_data(), - context->get_labels_device_ptr()); + context->get_labels_ptr()); } } // end namespace From b897ef54159935f4b8a8885834c799c93748663e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 26 Apr 2020 23:16:10 -0500 Subject: [PATCH 193/660] update sampler --- libdeepgalois/include/deepgalois/context.h | 20 +-- .../include/deepgalois/layers/layer.h | 5 +- libdeepgalois/include/deepgalois/net.h | 4 +- libdeepgalois/include/deepgalois/sampler.h | 43 +++++-- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/src/context.cpp | 13 +- libdeepgalois/src/context.cu | 8 +- libdeepgalois/src/net.cpp | 70 +++++++++-- libdeepgalois/src/sampler.cpp | 118 +++++++++++------- lonestargnn/include/lonestargnn.h | 1 + 10 files changed, 200 insertions(+), 84 deletions(-) diff --git 
a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ea2b5f2156..52a306e90d 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -26,7 +26,7 @@ class Context { size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); - label_t get_label(size_t i) { return labels[i]; } // single-class (one-hot) label + label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factor_ptr() { return norm_factor; } @@ -41,15 +41,17 @@ class Context { //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; - float_t* get_in_ptr() { return h_feats; } - label_t* get_labels_ptr() { return labels; } - label_t* get_labels_subg_ptr() { return labels_subg; } + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; CSRGraph* getGraphPointer() { return &graph_gpu; } CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; - float_t* get_in_ptr() { return d_feats; } + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } @@ -64,13 +66,15 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - label_t *labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - label_t *labels_subg; // labels for subgraph + label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* norm_factor; // normalization constant based on graph structure + float_t* h_feats_subg; // input features for subgraph label_t* d_labels; // labels on device label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factor; // normalization constant based on graph structure #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 0e94a53d49..a359467ad8 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -52,9 +52,7 @@ class layer : public deepgalois::node { std::vector out_dims) : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims), labels(NULL) { - add_edge(); - } + output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; virtual std::string layer_type() const = 0; void print_layer_info(); //! 
debug print function @@ -81,6 +79,7 @@ class layer : public deepgalois::node { #else void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } #endif + void update_dim_size(size_t sg_size) { input_dims[0] = output_dims[0] = sg_size; } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 0cd94adc05..910cae89b5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -95,6 +95,7 @@ class Net { size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; + int val_interval; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -112,7 +113,8 @@ class Net { deepgalois::DistContext* context; #endif - void lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels); + void lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sub_labels); + void lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats); #ifdef CPU_ONLY // comparing outputs with the ground truth (labels) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 900ff1de2e..676426c0c3 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -4,14 +4,14 @@ namespace deepgalois { class Sampler { public: - Sampler() : m(1000) {} + Sampler() : m_(1000) {} ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &g, Graph &sg, VertexList &vertex_set, mask_t *masks); + void subgraph_sample(size_t n, Graph &sg, mask_t *masks); // !API function for user-defined selection strategy - virtual void select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m); + virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -19,16 +19,45 @@ class Sampler { Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { + begin_ = begin; + end_ = end; + count_ = count; + masks_ = masks; + graph = g; + generate_masked_graph(count, masks, *g, masked_graph); + size_t idx = 0; + vertices_.resize(count); + for (size_t i = begin; i < end; i++) { + if (masks_[i] == 1) vertices_[idx++] = i; + } + } + protected: - size_t m; + int m_; + size_t count_; + size_t begin_; + size_t end_; + VertexList vertices_; + mask_t *masks_; + Graph masked_graph; + Graph *graph; + // Utility function to randomly select k items from [begin, end) - VertexList selectVertex(GNode begin, GNode end, size_t k); + template + T* select_k_items(T k, T begin, T end); + // Utility function to find ceiling of r in arr[l..h] - inline int findCeil(std::vector arr, unsigned r, unsigned l, unsigned h); + template + inline T findCeil(std::vector arr, T r, T l, T h); + // Utility function to select one element from n elements given a frequency (probability) distribution - size_t selectOneVertex(size_t n, std::vector dist); + template + T select_one_item(T n, std::vector dist); + // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); + void 
generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index a2f6164439..9c4a8333d3 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -1,5 +1,6 @@ #ifndef _GNN_TYPES_H_ #define _GNN_TYPES_H_ +#include #include #include @@ -24,6 +25,7 @@ typedef uint8_t mask_t; // mask is used to indicate different uses of labels: typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; +typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type #define CHUNK_SIZE 256 diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 0fc3fe0a95..103aa94363 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,6 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ -#include #include "deepgalois/context.h" namespace deepgalois { @@ -9,11 +8,11 @@ namespace deepgalois { #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - labels(NULL), h_feats(NULL), norm_factor(NULL), + h_labels(NULL), h_feats(NULL), norm_factor(NULL), d_labels(NULL), d_feats(NULL) {} Context::~Context() { - if (labels) delete labels; + if (h_labels) delete h_labels; if (h_feats) delete h_feats; if (norm_factor) delete norm_factor; } @@ -174,10 +173,10 @@ size_t Context::read_labels(std::string dataset_str) { assert(m == n); if (is_single_class) { std::cout << "Using single-class (one-hot) labels\n"; - labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + h_labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { std::cout << "Using multi-class labels\n"; - labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + h_labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E } unsigned v = 0; while (std::getline(in, line)) { @@ -187,11 +186,11 @@ size_t Context::read_labels(std::string dataset_str) { label_stream >> x; if (is_single_class) { if (x != 0) { - labels[v] = idx; + h_labels[v] = idx; break; } } else { - labels[v*num_classes+idx] = x; + h_labels[v*num_classes+idx] = x; } } v++; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 86ad9003bf..7530bd3946 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -63,7 +63,7 @@ curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - labels(NULL), h_feats(NULL), norm_factor(NULL), + h_labels(NULL), h_feats(NULL), norm_factor(NULL), d_labels(NULL), d_feats(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); @@ -141,10 +141,10 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { void Context::copy_data_to_device() { if (is_single_class) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); } else { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * 
num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); @@ -153,7 +153,7 @@ void Context::copy_data_to_device() { //void Context::copy_data_to_device() { //float_malloc_device(n, d_labels); - //float_copy_device(n, labels, d_labels); + //float_copy_device(n, h_labels, d_labels); //float_malloc_device(n*feat_len, d_feats); //float_copy_device(n*feat_len, &h_feats[0], d_feats); //} diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d44c9b4632..ca875ebf12 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -21,6 +21,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, has_dense = dense; neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; + val_interval = 1; galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, @@ -107,7 +108,30 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } -void Net::lookup_labels(size_t n, mask_t *masks, const label_t *labels, label_t *sub_labels) { +// generate labels for the subgraph +void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sg_labels) { + size_t count = 0; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + if (is_single_class) { + sg_labels[count] = labels[i]; + } else { + std::copy(labels+i*num_classes, labels+(i+1)*num_classes, sg_labels+count*num_classes); + } + count ++; + } + } +} + +void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { + size_t count = 0; + size_t len = feature_dims[0]; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); + count ++; + } + } } void Net::train(optimizer* opt, bool need_validate) { @@ -125,24 +149,37 @@ void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer Tbw("Train-Backward"); galois::StatTimer Tval("Validation"); double total_train_time = 0.0; + int num_subg_remain = 0; +#ifdef CPU_ONLY if (subgraph_sample_size) { subgraph_masks = new mask_t[num_samples]; - std::copy(train_masks, train_masks+num_samples, subgraph_masks); + sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } - +#endif Timer t_epoch; // run epochs - for (unsigned i = 0; i < num_epochs; i++) { - galois::gPrint(header, "Epoch ", std::setw(3), i, seperator); + for (unsigned ep = 0; ep < num_epochs; ep++) { + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY - VertexList vertices; - sampler->subgraph_sample(subgraph_sample_size, *(context->getGraphPointer()), - *(context->getSubgraphPointer()), vertices, subgraph_masks); + // generate subgraph + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer()), subgraph_masks); + for (size_t i = 0; i < num_conv_layers-1; i++) { + layers[i]->set_graph_ptr(context->getSubgraphPointer()); + } + // update masks for subgraph + layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); + + // update labels for subgraph lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); + 
layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + + // update features for subgraph + lookup_feats(num_samples, subgraph_masks, context->get_feats_ptr(), context->get_feats_subg_ptr()); + layers[0]->set_in_data(context->get_feats_subg_ptr()); // feed input data #endif num_subg_remain += 1; // num_threads } @@ -175,7 +212,7 @@ void Net::train(optimizer* opt, bool need_validate) { t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate) { + if (need_validate && ep % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -306,7 +343,18 @@ void Net::construct_layers() { if (has_dense) append_dense_layer(num_layers-2); // dense layer append_out_layer(num_layers-1); // output layer - layers[0]->set_in_data(context->get_in_ptr()); // feed input data + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + if (subgraph_sample_size) + layers[i]->update_dim_size(subgraph_sample_size); + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure context->norm_factor_counting(); set_contexts(); } @@ -345,7 +393,6 @@ void Net::append_out_layer(size_t layer_id) { else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); - connect(layers[layer_id - 1], layers[layer_id]); } //! Add a convolution layer to the network @@ -360,7 +407,6 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); layers[layer_id]->set_graph_ptr(context->getGraphPointer()); - if (layer_id > 0) connect(layers[layer_id - 1], layers[layer_id]); } void Net::read_test_masks(std::string dataset, Graph* dGraph) { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 3b3ae84b85..dbdd984556 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -8,13 +8,14 @@ inline unsigned getDegree(Graph &g, GNode v) { namespace deepgalois { -// Utility function to randomly select k items from [begin, end) -VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { +// Utility function to randomly select k vertices from [begin, end) +template +T* Sampler::select_k_items(T k, T begin, T end) { auto i = begin; // reservoir[] is the output array. Initialize // it with first k vertices - VertexList reservoir(k); + T *reservoir = new T[k]; for (; i < k; i++) reservoir[i] = i; // Use a different seed value so that we don't get @@ -35,8 +36,9 @@ VertexList Sampler::selectVertex(GNode begin, GNode end, size_t k) { } // Utility function to find ceiling of r in arr[l..h] -inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, unsigned h) { - unsigned mid; +template +inline T Sampler::findCeil(std::vector arr, T r, T l, T h) { + T mid; while (l < h) { mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 (r > arr[mid]) ? 
(l = mid + 1) : (h = mid); @@ -46,16 +48,16 @@ inline int Sampler::findCeil(std::vector arr, unsigned r, unsigned l, // Utility function to select one element from n elements given a frequency (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -size_t Sampler::selectOneVertex(size_t n, std::vector dist) { - std::vector offsets(n); +template +T Sampler::select_one_item(T n, std::vector dist) { + std::vector offsets(n); offsets[0] = dist[0]; // compute the prefix sum of the distribution - for (size_t i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; + for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; // offsets[n-1] is sum of all frequencies - unsigned sum = offsets[n-1]; - unsigned r = (rand() % sum) + 1; - // find which range r falls into, - // and return the index of the range + T sum = offsets[n-1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range return findCeil(offsets, r, 0, n - 1); } @@ -83,41 +85,73 @@ void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { } } +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { + std::vector degrees(n, 0); + galois::do_all(galois::iterate(g), [&](const GNode src) { + if (masks[src] == 1) { + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) degrees[src] ++; + } + } + }, galois::loopname("update_degrees")); + std::vector offsets(n+1); + offsets[0] = 0; + for (size_t i = 0; i < n; i ++) { + offsets[i+1] = offsets[i] + degrees[i]; + } + size_t ne = offsets[n]; + sub.allocateFrom(n, ne); + sub.constructNodes(); + galois::do_all(galois::iterate(sub), [&](const GNode src) { + g.fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (const auto e : g.edges(src)) { + const auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) g.constructEdge(idx++, dst, 0); + } + } + }, galois::loopname("gen_subgraph")); +} + // !API function for user-defined selection strategy -// Select n vertices from graph g and put them in vertex_set. +// Select n vertices from vertices and put them in vertex_set. +// nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
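// A minimal standalone sketch of the weighted-selection idea behind select_one_item/findCeil
// above: build an inclusive prefix sum over a frequency distribution, draw r uniformly from
// [1, total], then ceiling-binary-search for the first prefix entry >= r. The helper name
// pick_weighted and the plain std::vector input are assumptions for illustration only, not
// part of the Sampler API.
#include <cstdlib>
#include <vector>
template <typename T>
T pick_weighted(const std::vector<T>& freq) {
  std::vector<T> prefix(freq.size());
  prefix[0] = freq[0];
  for (std::size_t i = 1; i < freq.size(); ++i)
    prefix[i] = prefix[i - 1] + freq[i];     // inclusive prefix sum; prefix.back() == total
  T r = (std::rand() % prefix.back()) + 1;   // r falls in [1, total]
  T l = 0, h = (T)freq.size() - 1;
  while (l < h) {                            // find the ceiling of r in prefix[l..h]
    T mid = l + ((h - l) >> 1);
    (r > prefix[mid]) ? (l = mid + 1) : (h = mid);
  }
  return l;                                  // index i is returned with probability freq[i]/total
}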
-void Sampler::select_vertices(Graph &g, VertexList &vertex_set, size_t n, size_t m) { - assert(n == vertex_set.size()); - auto num_vertices = g.size(); // number of vertices in the original input graph - auto frontier = selectVertex(0, num_vertices, m); // randomly select m vertices from g as frontier - for (size_t i = 0; i < m; i++) vertex_set[i] = frontier[i]; - std::vector degrees(m); - //std::vector probabilities(m); - //unsigned sum_degree = 0; - for (size_t i = 0; i < m; i++) { - degrees[i] = getDegree(g, frontier[i]); - //sum_degree += degrees[i]; - } - for (size_t i = 0; i < n - m; i++) { - //for (size_t i = 0; i < m; i++) - // probabilities[i] = (float)degrees[i] / (float)sum_degree; - auto pos = selectOneVertex(m, degrees); - GNode u = frontier[pos]; - auto degree = degrees[pos]; - auto neighbor_id = rand() % degree; - frontier[pos] = g.getEdgeDst(g.edge_begin(u) + neighbor_id); - degrees[pos] = getDegree(g, frontier[pos]); - //sum_degree -= degree; - //sum_degree += degrees[pos]; - vertex_set.push_back(u); - } +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set) { + assert(nv == vertices.size()); + assert(n == vertex_set.size()); + auto frontier_indices = select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + VertexList frontier(m); + for (int i = 0; i < m; i++) vertex_set[i] = frontier[i] = vertices[frontier_indices[i]]; + std::vector degrees(m); + galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { + degrees[i] = getDegree(g, frontier[i]); + }, galois::loopname("compute_degrees")); + for (size_t i = 0; i < n - m; i++) { + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; + auto degree = degrees[pos]; + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.push_back(u); + } +} + +void update_masks(size_t n, VertexList vertices, mask_t *masks) { + std::fill(masks, masks+n, 0); + for (auto v : vertices) masks[v] = 1; } -void Sampler::subgraph_sample(size_t n, Graph &g, Graph&sg, VertexList &vertex_set, mask_t *masks) { - vertex_set.resize(n); - select_vertices(g, vertex_set, n, m); - generate_subgraph(vertex_set, g, sg); +void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { + VertexList vertex_set(n); + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); + generate_subgraph(vertex_set, masked_graph, sg); + update_masks(graph->size(), vertex_set, masks); } } // end namespace diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index cdfda9eba0..324f5a31ba 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -41,6 +41,7 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); +static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); From 87c742db977096f8038c77632a11b057ba597dba Mon Sep 17 
00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 11:18:12 -0500 Subject: [PATCH 194/660] update utils --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/configs.h | 11 ++ libdeepgalois/include/deepgalois/context.h | 4 +- .../deepgalois/layers/graph_conv_layer.h | 24 +-- .../include/deepgalois/layers/layer.h | 3 +- .../include/deepgalois/math_functions.hh | 1 - libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/include/deepgalois/sampler.h | 20 +-- libdeepgalois/include/deepgalois/types.h | 2 + libdeepgalois/include/deepgalois/utils.h | 83 ++++++++-- libdeepgalois/src/DistContext.cpp | 46 ++++++ libdeepgalois/src/context.cpp | 51 +++++- libdeepgalois/src/context.cu | 10 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 22 ++- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- libdeepgalois/src/math_functions.cpp | 1 + libdeepgalois/src/net.cpp | 13 +- libdeepgalois/src/net.cu | 3 +- libdeepgalois/src/sampler.cpp | 152 +++++++----------- libdeepgalois/src/utils.cpp | 127 +++++---------- 20 files changed, 333 insertions(+), 252 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/configs.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 704247d54b..3054915ded 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,7 +5,6 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" -#include "deepgalois/utils.h" #include "deepgalois/gtypes.h" namespace deepgalois { @@ -28,10 +27,17 @@ class DistContext { //! save graph pointer to context object void saveGraph(Graph* dGraph); + //! read labels of local nodes only size_t read_labels(std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + + //! read masks of local nodes only + size_t read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + //! 
find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_counting(); diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h new file mode 100644 index 0000000000..3de67ecb74 --- /dev/null +++ b/libdeepgalois/include/deepgalois/configs.h @@ -0,0 +1,11 @@ +#pragma once + +namespace deepgalois { + +const std::string path = + "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset + +#define NUM_DATASETS 8 +const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + +} diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 52a306e90d..d995a41c8c 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 +6,6 @@ #include #include #include "deepgalois/types.h" -#include "deepgalois/utils.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -15,6 +14,7 @@ #endif namespace deepgalois { + class Context { public: Context(); @@ -25,6 +25,8 @@ class Context { size_t read_graph_gpu(std::string dataset_str, bool selfloop); size_t read_labels(std::string dataset_str); size_t read_features(std::string dataset_str, std::string filetype = "bin"); + size_t read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks); label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index eb42fe1093..8a6992e30c 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -35,7 +35,7 @@ class graph_conv_layer : public layer { ~graph_conv_layer() {} void init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(deepgalois::net_phase ctx) override { phase_ = ctx; } + void set_netphase(net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! 
Uses weights contained in this layer to update in_data (results from previous) @@ -64,7 +64,7 @@ class graph_conv_layer : public layer { bool dropout_; // whether to use dropout at first const float_t dropout_rate_; float_t scale_; - deepgalois::net_phase phase_; + net_phase phase_; size_t x; size_t y; size_t z; @@ -76,22 +76,8 @@ class graph_conv_layer : public layer { float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1) { - auto init_range = sqrt(6.0 / (dim_x + dim_y)); - std::default_random_engine rng(seed); - std::uniform_real_distribution dist(-init_range, init_range); - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i * dim_y + j] = dist(rng); - } - } - inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { - matrix.resize(dim_x * dim_y); - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) - matrix[i * dim_y + j] = 0; - } - } + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); + inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); }; + } // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a359467ad8..d0bfac6e16 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -10,7 +10,6 @@ */ #include "deepgalois/types.h" -#include "deepgalois/utils.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -69,7 +68,7 @@ class layer : public deepgalois::node { float_t* get_grads_device_ptr() { return d_weight_grad; } // set methods - virtual void set_netphase(deepgalois::net_phase phase) {} + virtual void set_netphase(net_phase phase) {} virtual void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 5611caaa94..9c0e58dc45 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -9,7 +9,6 @@ #include #include #include "deepgalois/types.h" -#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 910cae89b5..d0adf2d55f 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -68,7 +68,7 @@ class Net { layers[i]->set_context(context); } //! 
set netphases for all layers in this network - void set_netphases(deepgalois::net_phase phase) { + void set_netphases(net_phase phase) { for (size_t i = 0; i < num_layers; i++) layers[i]->set_netphase(phase); } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 676426c0c3..9f57ed53da 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -11,7 +11,7 @@ class Sampler { void subgraph_sample(size_t n, Graph &sg, mask_t *masks); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set); + virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -43,21 +43,13 @@ class Sampler { Graph masked_graph; Graph *graph; - // Utility function to randomly select k items from [begin, end) - template - T* select_k_items(T k, T begin, T end); - - // Utility function to find ceiling of r in arr[l..h] - template - inline T findCeil(std::vector arr, T r, T l, T h); - - // Utility function to select one element from n elements given a frequency (probability) distribution - template - T select_one_item(T n, std::vector dist); - // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g - void generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub); + void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); + + void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); + void update_masks(size_t n, VertexSet vertices, mask_t *masks); + inline VertexList reindexing_vertice(VertexSet vertex_set); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 9c4a8333d3..9c6c79c6e5 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -28,6 +28,8 @@ typedef std::vector VertexList; typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type +enum class net_phase { train, test }; + #define CHUNK_SIZE 256 #define TB_SIZE 256 #define BLOCK_SIZE 256 diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 097457290d..60974b9f8a 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -15,11 +15,6 @@ namespace deepgalois { -const std::string path = - "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset - -enum class net_phase { train, test }; - //! tracks max mem usage with rusage // TODO use Galois's getrusage functionality class ResourceManager { @@ -105,14 +100,76 @@ inline bool bernoulli(float_t p) { return uniform_rand(float_t(0), float_t(1)) > p; } +// sequential prefix sum +template +inline std::vector prefix_sum(const std::vector &in) { + std::vector prefix(in.size() + 1); + OutTy total = 0; + for (size_t i = 0; i < in.size(); i ++) { + prefix[i] = total; + total += (OutTy)in[i]; + } + prefix[in.size()] = total; + return prefix; +} + +template +OutTy* parallel_prefix_sum(const std::vector &in); + +// Utility function to randomly select k items from [begin, end) +template +inline T* select_k_items(T k, T begin, T end) { + auto i = begin; + + // reservoir[] is the output array. 
Initialize + // it with first k vertices + T *reservoir = new T[k]; + for (; i < k; i++) reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. + auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) reservoir[j] = i; + } + return reservoir; +} + +// Utility function to find ceiling of r in arr[l..h] +template +inline T find_ceil(T *arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? l : -1; +} + +// Utility function to select one element from n elements given a frequency (probability) distribution +// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ +template +T select_one_item(T n, T *dist) { + T *offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n-1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); +} + acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, size_t num_classes, label_t *ground_truth, float_t *pred); -#ifdef GALOIS_USE_DIST -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); -#else -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks); -#endif -} +} // end namespace diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 9069fad351..7c4fd00a46 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -1,4 +1,6 @@ #include "deepgalois/DistContext.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" namespace deepgalois { DistContext::DistContext() {} @@ -101,6 +103,50 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, + mask_t* masks, Graph* dGraph) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + // only bother if it's local + if (dGraph->isLocal(i)) { + masks[dGraph->getLID(i)] = 1; + sample_count++; + } + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return 
sample_count; +} + float_t* DistContext::get_in_ptr() { return &h_feats[0]; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 103aa94363..38ee7543c0 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -2,14 +2,19 @@ * Based on common.hpp file of the Caffe deep learning library. */ #include "deepgalois/context.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" namespace deepgalois { #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), norm_factor(NULL), - d_labels(NULL), d_feats(NULL) {} + h_labels(NULL), h_labels_subg(NULL), + h_feats(NULL), h_feats_subg(NULL), + d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), + norm_factor(NULL) {} Context::~Context() { if (h_labels) delete h_labels; @@ -253,6 +258,48 @@ size_t Context::read_features(std::string dataset_str, std::string filetype) { return feat_len; } +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Context::read_masks(std::string dataset_str, std::string mask_type, + size_t n, size_t& begin, size_t& end, mask_t* masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} + /* inline void init_features(size_t dim, vec_t &x) { std::default_random_engine rng; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 7530bd3946..bdef92b52a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -4,6 +4,7 @@ #include #include "deepgalois/context.h" #include "deepgalois/math_functions.hh" +#include "deepgalois/configs.h" // random seeding int64_t cluster_seedgen(void) { @@ -62,9 +63,12 @@ cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), norm_factor(NULL), - d_labels(NULL), d_feats(NULL) { + is_single_class(true), is_selfloop_added(false), + h_labels(NULL), h_labels_subg(NULL), + h_feats(NULL), h_feats_subg(NULL), + d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), + norm_factor(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3c63468159..7616bfa6c6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ 
b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/utils.h" namespace deepgalois { @@ -19,6 +20,25 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, scale_ = 1. / (1. - dropout_rate_); } +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { + auto init_range = sqrt(6.0 / (dim_x + dim_y)); + std::default_random_engine rng(seed); + std::uniform_real_distribution dist(-init_range, init_range); + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = dist(rng); + } +} + +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { + matrix.resize(dim_x * dim_y); + for (size_t i = 0; i < dim_x; ++i) { + for (size_t j = 0; j < dim_y; ++j) + matrix[i * dim_y + j] = 0; + } +} + #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { deepgalois::update_all(len, g, in, out, norm_, norm_factor); @@ -64,7 +84,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == deepgalois::net_phase::train) { + if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ac29b73a7b..3702a0d709 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -53,7 +53,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ exit(0); } init_const_gpu(x*z, 0.0, out_temp); - if (dropout_ && phase_ == deepgalois::net_phase::train) + if (dropout_ && phase_ == net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); else copy_gpu(x*y, in_data, in_temp); if (y > z) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 0cc7812e9e..555eb7bfca 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -2,6 +2,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include +#include "deepgalois/utils.h" extern "C" { #include diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index ca875ebf12..4d73752436 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,6 +3,7 @@ */ #include "deepgalois/net.h" +#include "deepgalois/utils.h" namespace deepgalois { @@ -74,11 +75,11 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } else { #ifndef GALOIS_USE_DIST - train_count = read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); - val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); #else - train_count = 
read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); #endif } @@ -426,9 +427,9 @@ void Net::read_test_masks(std::string dataset, Graph* dGraph) { #endif } else { #ifndef GALOIS_USE_DIST - test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); + test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); #else - test_count = deepgalois::read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 27c3ea5de8..900ba1a762 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -2,6 +2,7 @@ #include "deepgalois/cutils.h" #include "gg.h" #include "ggcuda.h" +#include // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { @@ -109,7 +110,7 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, acc_t fn = (acc_t)h_fn[i]; // false negtive acc_t fp = (acc_t)h_fp[i]; // false positive acc_t tp = (acc_t)h_tp[i]; // true positive - acc_t tn = (acc_t)h_tn[i]; // true positive + //acc_t tn = (acc_t)h_tn[i]; // true positive precisionMacro = precisionMacro + (tp / (tp + fp)); recallMacro = recallMacro + (tp / (tp + fn)); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index dbdd984556..a86fa110c2 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,3 +1,4 @@ +#include "deepgalois/utils.h" #include "deepgalois/sampler.h" #include #include @@ -8,85 +9,8 @@ inline unsigned getDegree(Graph &g, GNode v) { namespace deepgalois { -// Utility function to randomly select k vertices from [begin, end) -template -T* Sampler::select_k_items(T k, T begin, T end) { - auto i = begin; - - // reservoir[] is the output array. Initialize - // it with first k vertices - T *reservoir = new T[k]; - for (; i < k; i++) reservoir[i] = i; - - // Use a different seed value so that we don't get - // same result each time we run this program - srand(time(NULL)); - - // Iterate from the (k+1)th element to nth element - for (; i < end; i++) { - // Pick a random index from 0 to i. - auto j = rand() % (i + 1); - - // If the randomly picked index is smaller than k, - // then replace the element present at the index - // with new element from stream - if (j < k) reservoir[j] = i; - } - return reservoir; -} - -// Utility function to find ceiling of r in arr[l..h] -template -inline T Sampler::findCeil(std::vector arr, T r, T l, T h) { - T mid; - while (l < h) { - mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 - (r > arr[mid]) ? (l = mid + 1) : (h = mid); - } - return (arr[l] >= r) ? 
l : -1; -} - -// Utility function to select one element from n elements given a frequency (probability) distribution -// https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ -template -T Sampler::select_one_item(T n, std::vector dist) { - std::vector offsets(n); - offsets[0] = dist[0]; - // compute the prefix sum of the distribution - for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; - // offsets[n-1] is sum of all frequencies - T sum = offsets[n-1]; - T r = (rand() % sum) + 1; - // find which range r falls into, and return the index of the range - return findCeil(offsets, r, 0, n - 1); -} - -// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g -void Sampler::generate_subgraph(VertexList &vertex_set, Graph &g, Graph &sub) { - auto nv = vertex_set.size(); - size_t ne = 0; - std::vector offsets(nv+1); - offsets[0] = 0; - size_t i = 0; - VertexList vertices(nv); - for (auto v : vertex_set) { - vertices[i] = v; - offsets[i+1] = offsets[i] + getDegree(g, v); - i++; - } - // TODO: need to remove edges whose has endpoint not belong to the selected vertex subset - sub.allocateFrom(nv, ne); - sub.constructNodes(); - for (i = 0; i < nv; i++) { - g.fixEndEdge(i, offsets[i+1]); - for (unsigned offset = 0; offset < offsets[i+1]-offsets[i]; offset ++) { - g.constructEdge(offsets[i]+offset, g.getEdgeDst(g.edge_begin(vertices[i])+offset), 0); - } - } -} - -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { - std::vector degrees(n, 0); +void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { + assert(degrees.size() == n); galois::do_all(galois::iterate(g), [&](const GNode src) { if (masks[src] == 1) { for (const auto e : g.edges(src)) { @@ -95,11 +19,12 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su } } }, galois::loopname("update_degrees")); - std::vector offsets(n+1); - offsets[0] = 0; - for (size_t i = 0; i < n; i ++) { - offsets[i+1] = offsets[i] + degrees[i]; - } +} + +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { + std::vector degrees(n, 0); + get_masked_degrees(n, masks, g, degrees); + auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; sub.allocateFrom(n, ne); sub.constructNodes(); @@ -120,15 +45,16 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
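// A compact sketch of the frontier-based selection strategy documented above, written against
// a plain CSR (row_ptr/col_idx) rather than the Galois Graph type; sample_frontier and its
// uniform stand-in for the degree-weighted frontier pick are assumptions for illustration,
// not the Sampler implementation.
#include <cstdlib>
#include <set>
#include <vector>
std::set<int> sample_frontier(const std::vector<int>& row_ptr, const std::vector<int>& col_idx,
                              int n, int m) {   // n: target subgraph size, m: frontier size
  int nv = (int)row_ptr.size() - 1;
  std::vector<int> frontier(m), deg(m);
  std::set<int> picked;
  for (int i = 0; i < m; ++i) {                 // seed the frontier with m random vertices
    frontier[i] = std::rand() % nv;
    deg[i] = row_ptr[frontier[i] + 1] - row_ptr[frontier[i]];
    picked.insert(frontier[i]);
  }
  for (int i = 0; i < n - m; ++i) {
    int pos = std::rand() % m;                  // uniform stand-in for the degree-weighted pick
    int u = frontier[pos];
    if (deg[pos] == 0) continue;                // skip vertices without outgoing edges
    int nbr = col_idx[row_ptr[u] + std::rand() % deg[pos]]; // hop to a random neighbor of u
    picked.insert(u);
    frontier[pos] = nbr;                        // the neighbor replaces u in the frontier
    deg[pos] = row_ptr[nbr + 1] - row_ptr[nbr];
  }
  return picked;                                // vertex set of size at most n (duplicates collapse)
}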
-void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexList &vertex_set) { +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set) { assert(nv == vertices.size()); - assert(n == vertex_set.size()); - auto frontier_indices = select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); - for (int i = 0; i < m; i++) vertex_set[i] = frontier[i] = vertices[frontier_indices[i]]; - std::vector degrees(m); + for (int i = 0; i < m; i++) + frontier[i] = vertices[frontier_indices[i]]; + vertex_set.insert(frontier.begin(), frontier.end()); + int *degrees = new int[m]; galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { - degrees[i] = getDegree(g, frontier[i]); + degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); @@ -138,20 +64,56 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList v auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); frontier[pos] = dst; degrees[pos] = getDegree(g, frontier[pos]); - vertex_set.push_back(u); + vertex_set.insert(u); } + assert(n == vertex_set.size()); } -void update_masks(size_t n, VertexList vertices, mask_t *masks) { +void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { std::fill(masks, masks+n, 0); for (auto v : vertices) masks[v] = 1; } +inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { + VertexList new_ids(vertex_set.size(), 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; +} + +// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g +void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { + //auto n = g.size(); // old graph size + auto nv = vertex_set.size(); // new graph (subgraph) size + VertexList new_ids = reindexing_vertice(vertex_set); + std::vector degrees(nv, 0); // degrees of vertices in the subgraph + for (auto v : vertex_set) { + degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); + } + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[nv]; + sub.allocateFrom(nv, ne); + sub.constructNodes(); + VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + g.fixEndEdge(i, offsets[i+1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e : g.edges(old_id)) { + g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + j ++; + } + }, galois::loopname("compute_degrees")); +} + void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { - VertexList vertex_set(n); - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); + VertexSet vertex_set; // n = 9000 by default + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set + generate_masked_graph(n, masks, masked_graph, sg); // remove edges whose destination is not masked generate_subgraph(vertex_set, masked_graph, sg); - update_masks(graph->size(), vertex_set, masks); } } // end namespace diff --git a/libdeepgalois/src/utils.cpp 
b/libdeepgalois/src/utils.cpp index b2b65c9582..dedb9c225a 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -3,8 +3,41 @@ namespace deepgalois { -#define NUM_DATASETS 8 -const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +// parallel prefix sum +template +OutTy* parallel_prefix_sum(const std::vector &in) { + const size_t block_size = 1<<20; + const size_t num_blocks = (in.size() + block_size - 1) / block_size; + std::vector local_sums(num_blocks); + // count how many bits are set on each thread + galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i=block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks+1); + OutTy total = 0; + for (size_t block=0; block < num_blocks; block++) { + bulk_prefix[block] = total; + total += local_sums[block]; + } + bulk_prefix[num_blocks] = total; + OutTy *prefix = new OutTy[in.size() + 1]; + galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i=block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); + prefix[in.size()] = bulk_prefix[num_blocks]; + return prefix; +} + +template uint32_t* parallel_prefix_sum(const std::vector &in); // Compute the F1 score, also known as balanced F-score or F-measure // The F1 score can be interpreted as a weighted average of the precision and recall, @@ -62,92 +95,4 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, return f1_micro; } -#ifndef GALOIS_USE_DIST -//! Get masks from datafile where first line tells range of -//! 
set to create mask from -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} -#else -size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, - mask_t* masks, Graph* dGraph) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - // only bother if it's local - if (dGraph->isLocal(i)) { - masks[dGraph->getLID(i)] = 1; - sample_count++; - } - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} -#endif - -} +} // end namespace From 5202c7adffc275670e988907320c3242f5bc7c95 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 12:47:49 -0500 Subject: [PATCH 195/660] fix dist --- .../include/deepgalois/DistContext.h | 34 ++++++++++++------- libdeepgalois/include/deepgalois/sampler.h | 4 +-- libdeepgalois/src/DistContext.cpp | 6 ++-- libdeepgalois/src/sampler.cpp | 25 ++++++++------ lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/DistributedGraphLoader.h | 2 +- 6 files changed, 44 insertions(+), 29 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 3054915ded..c0ee3ec704 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,18 +10,25 @@ namespace deepgalois { class DistContext { +protected: size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - std::vector labels; // labels for classification: N x 1 - vec_t h_feats; // input features: N x D galois::graphs::GluonSubstrate* syncSubstrate; -public: - // TODO why are these public - float_t* norm_factor; // normalization constant based on graph structure - Graph* graph_cpu; // the 
input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N + Graph* subgraph_cpu; + label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE + label_t *h_labels_subg; // labels for subgraph + float_t* h_feats; // input features: N x D + float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t *d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factor; // normalization constant based on graph structure +public: DistContext(); ~DistContext(); @@ -42,18 +49,21 @@ class DistContext { // TODO this is a distributed operation void norm_factor_counting(); + float_t* get_norm_factor_ptr() { return norm_factor; } + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer() { return subgraph_cpu; }; + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg; } + void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - Graph* getGraphPointer() { - return graph_cpu; - } //! return label for some node //! NOTE: this is LID, not GID - label_t get_label(size_t i) { - return labels[i]; - } + label_t get_label(size_t i) { return h_labels[i]; } //! returns pointer to the features of each local node float_t* get_in_ptr(); diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 9f57ed53da..14342c1c6d 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -40,12 +40,12 @@ class Sampler { size_t end_; VertexList vertices_; mask_t *masks_; - Graph masked_graph; + Graph *masked_graph; Graph *graph; // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); - void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &mg); + void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *mg); void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); void update_masks(size_t n, VertexSet vertices, mask_t *masks); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7c4fd00a46..2a9ad81575 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -26,7 +26,7 @@ size_t DistContext::read_labels(std::string dataset_str) { in >> m >> num_classes >> std::ws; assert(m == dGraph->globalSize()); // size of labels should be # local nodes - labels.resize(dGraph->size(), 0); + h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for each vertex: N x 1 uint32_t foundVertices = 0; unsigned v = 0; @@ -42,7 +42,7 @@ size_t DistContext::read_labels(std::string dataset_str) { label_stream >> x; if (x != 0) { // set local id - labels[dGraph->getLID(v)] = idx; + h_labels[dGraph->getLID(v)] = idx; foundVertices++; break; } @@ -76,7 +76,7 @@ size_t DistContext::read_features(std::string dataset_str) { // header read in >> m >> feat_len >> std::ws; // use local size, not global size - h_feats.resize(dGraph->size() * feat_len, 0); + h_feats = new float_t[dGraph->size() * feat_len]; // loop through all features while (std::getline(in, line)) { diff --git a/libdeepgalois/src/sampler.cpp 
b/libdeepgalois/src/sampler.cpp index a86fa110c2..6ee47a452e 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -11,7 +11,7 @@ namespace deepgalois { void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(g), [&](const GNode src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { if (masks[src] == 1) { for (const auto e : g.edges(src)) { const auto dst = g.getEdgeDst(e); @@ -21,14 +21,16 @@ void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector< }, galois::loopname("update_degrees")); } -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &sub) { +void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; - sub.allocateFrom(n, ne); - sub.constructNodes(); - galois::do_all(galois::iterate(sub), [&](const GNode src) { +#ifndef GALOIS_USE_DIST + sub = new Graph(); + sub->allocateFrom(n, ne); + sub->constructNodes(); + galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { g.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -38,6 +40,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph &su } } }, galois::loopname("gen_subgraph")); +#endif } // !API function for user-defined selection strategy @@ -53,7 +56,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList v frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); int *degrees = new int[m]; - galois::do_all(galois::iterate(g.begin(), g.end()), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), g.size()), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { @@ -94,6 +97,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; +#ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping @@ -105,15 +109,16 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); j ++; } - }, galois::loopname("compute_degrees")); + }, galois::loopname("construct_graph")); +#endif } void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(count_, n, m_, *masked_graph, vertices_, vertex_set); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set - generate_masked_graph(n, masks, masked_graph, sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, masked_graph, sg); + generate_masked_graph(n, masks, *masked_graph, &sg); // remove edges whose destination is not masked + generate_subgraph(vertex_set, *masked_graph, sg); } } // end namespace diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index ba9cbe3529..de999a095e 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -21,7 +21,7 @@ int main(int 
argc, char** argv) { Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; - Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif // read network, features, ground truth, initialize metadata diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index b7da4faa54..247ad0763c 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -31,7 +31,7 @@ #define D_GRAPH_LOADER_SYM #include "galois/graphs/CuSPPartitioner.h" -#include "deepgalois/utils.h" +#include "deepgalois/configs.h" /******************************************************************************* * Supported partitioning schemes From b183f2b1d05868e492ee10400471be85c3f71e6a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 15:12:34 -0500 Subject: [PATCH 196/660] fix some bugs --- .../include/deepgalois/DistContext.h | 1 + libdeepgalois/include/deepgalois/context.h | 1 + .../deepgalois/layers/graph_conv_layer.h | 5 +- .../include/deepgalois/layers/layer.h | 1 + .../deepgalois/layers/sigmoid_loss_layer.h | 5 +- .../deepgalois/layers/softmax_loss_layer.h | 1 + libdeepgalois/include/deepgalois/sampler.h | 27 ++---- libdeepgalois/src/context.cpp | 4 + libdeepgalois/src/layers/graph_conv_layer.cpp | 17 ++-- libdeepgalois/src/layers/graph_conv_layer.cu | 16 +++- .../src/layers/sigmoid_loss_layer.cpp | 5 +- .../src/layers/sigmoid_loss_layer.cu | 5 +- .../src/layers/softmax_loss_layer.cpp | 5 +- .../src/layers/softmax_loss_layer.cu | 5 +- libdeepgalois/src/net.cpp | 14 ++- libdeepgalois/src/sampler.cpp | 93 +++++++++++++------ 16 files changed, 132 insertions(+), 73 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c0ee3ec704..4444143f09 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -48,6 +48,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_counting(); + void createSubgraph() {} float_t* get_norm_factor_ptr() { return norm_factor; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index d995a41c8c..15b9605cec 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -39,6 +39,7 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; + void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 8a6992e30c..7f0aa5a9a3 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -33,7 +33,7 @@ class graph_conv_layer : public layer { std::vector out_dims) : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} - void init(); + void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } @@ -65,9 +65,6 @@ class graph_conv_layer : public layer { const float_t dropout_rate_; float_t scale_; net_phase phase_; - size_t x; - size_t y; - size_t z; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index d0bfac6e16..c604f6ffbe 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -55,6 +55,7 @@ class layer : public deepgalois::node { virtual ~layer() = default; virtual std::string layer_type() const = 0; void print_layer_info(); //! debug print function + virtual void malloc_and_init() {} // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index 760b6f0ab1..c8b1241acc 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -7,9 +7,8 @@ class sigmoid_loss_layer : public layer { sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~sigmoid_loss_layer(); - std::string layer_type() const override { - return std::string("sigmoid_loss"); - } + std::string layer_type() const override { return std::string("sigmoid_loss"); } + void malloc_and_init(); inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 060698e3d9..43f07728cd 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -8,6 +8,7 @@ class softmax_loss_layer : public layer { std::vector out_dims); ~softmax_loss_layer(); std::string layer_type() const override { return std::string("softmax_loss"); } + void malloc_and_init(); inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 14342c1c6d..01616d01f5 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -8,10 +8,10 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void 
subgraph_sample(size_t n, Graph &sg, mask_t *masks); + void subgraph_sample(size_t n, Graph &sg, mask_t* masks); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set); + virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); @@ -19,19 +19,7 @@ class Sampler { Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - begin_ = begin; - end_ = end; - count_ = count; - masks_ = masks; - graph = g; - generate_masked_graph(count, masks, *g, masked_graph); - size_t idx = 0; - vertices_.resize(count); - for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) vertices_[idx++] = i; - } - } + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); protected: int m_; @@ -45,11 +33,10 @@ class Sampler { // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); - void generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *mg); - - void get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees); - void update_masks(size_t n, VertexSet vertices, mask_t *masks); - inline VertexList reindexing_vertice(VertexSet vertex_set); + void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); + void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); + void update_masks(size_t n, VertexSet vertices, mask_t* masks); + inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 38ee7543c0..f6d443f4f1 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -27,6 +27,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } +void Context::createSubgraph() { + subgraph_cpu = new Graph(); +} + size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7616bfa6c6..9903768070 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,12 +10,8 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices - x = input_dims[0]; - y = input_dims[1]; - z = output_dims[1]; trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); - init(); assert(dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -52,7 +48,10 @@ void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } -void graph_conv_layer::init() { +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, @@ -81,6 +80,9 @@ void graph_conv_layer::init() { // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W (not implemented yet) @@ -106,6 +108,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying @@ -144,7 +149,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - return math::l2_norm(y*z, &layer::W[0]); + return math::l2_norm(input_dims[1]*output_dims[1], &layer::W[0]); } #endif // end if CPU_ONLY diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 3702a0d709..ed89089450 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -2,7 +2,11 @@ namespace deepgalois { -void graph_conv_layer::init() { +void graph_conv_layer::malloc_and_init() { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); @@ -47,6 +51,10 @@ void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (z > MAX_NUM_CLASSES) { std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; // currently only support feature length <= 128 @@ -70,6 +78,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); @@ -88,7 +100,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } acc_t graph_conv_layer::get_weight_decay_loss() { - return l2_norm_gpu(y*z, d_W); + return 
l2_norm_gpu(input_dims[1]*output_dims[1], d_W); } } // namespace diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index a5ec7eef49..19606eec6c 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -9,13 +9,16 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - loss = new float_t[in_dims[0]]; // error for each sample } sigmoid_loss_layer::~sigmoid_loss_layer() { delete loss; } +void sigmoid_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { //return context->get_label(i, j); return labels[i*input_dims[1]+j]; diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 1fcc55e207..4159569601 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -10,13 +10,16 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - float_malloc_device(in_dims[0], loss); } sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } +void sigmoid_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 2f944656de..9e4fda933e 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -9,13 +9,16 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - loss = new float_t[in_dims[0]]; // error for each sample } softmax_loss_layer::~softmax_loss_layer() { delete loss; } +void softmax_loss_layer::malloc_and_init() { + loss = new float_t[input_dims[0]]; // error for each sample +} + inline label_t softmax_loss_layer::get_label(size_t i) { return labels[i]; //return context->get_label(i); diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 3eb5065edd..fd3fc11140 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -10,13 +10,16 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, : layer(level, in_dims, out_dims) { trainable_ = false; name_ = layer_type() + "_" + std::to_string(level); - float_malloc_device(in_dims[0], loss); } softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } +void softmax_loss_layer::malloc_and_init() { + float_malloc_device(input_dims[0], loss); +} + void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 4d73752436..e9ed3b4fd4 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -111,6 +111,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, // generate labels for the subgraph void Net::lookup_labels(size_t n, const mask_t *masks, 
const label_t *labels, label_t *sg_labels) { + if (sg_labels == NULL) sg_labels = new label_t[subgraph_sample_size]; size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { @@ -127,6 +128,7 @@ void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, la void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { size_t count = 0; size_t len = feature_dims[0]; + if (sg_feats == NULL) sg_feats = new float_t[subgraph_sample_size*len]; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); @@ -144,7 +146,6 @@ void Net::train(optimizer* opt, bool need_validate) { seperator = "\n"; #endif - galois::gPrint("\nStart training...\n"); galois::StatTimer Tupdate("Train-WeightUpdate"); galois::StatTimer Tfw("Train-Forward"); galois::StatTimer Tbw("Train-Backward"); @@ -154,10 +155,12 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY if (subgraph_sample_size) { + galois::gPrint("\nConstruct training vertex set induced graph...\n"); subgraph_masks = new mask_t[num_samples]; sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } #endif + galois::gPrint("\nStart training...\n"); Timer t_epoch; // run epochs for (unsigned ep = 0; ep < num_epochs; ep++) { @@ -167,7 +170,9 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size && num_subg_remain == 0) { #ifdef CPU_ONLY // generate subgraph - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer()), subgraph_masks); + context->createSubgraph(); + auto subgraph_ptr = context->getSubgraphPointer(); + sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); for (size_t i = 0; i < num_conv_layers-1; i++) { layers[i]->set_graph_ptr(context->getSubgraphPointer()); } @@ -351,9 +356,10 @@ void Net::construct_layers() { layers[i]->update_dim_size(subgraph_sample_size); layers[i]->add_edge(); } - for (size_t i = 1; i < num_layers; i++) { + for (size_t i = 1; i < num_layers; i++) connect(layers[i - 1], layers[i]); - } + for (size_t i = 0; i < num_layers; i++) + layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure context->norm_factor_counting(); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 6ee47a452e..c126660fb4 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -3,40 +3,58 @@ #include #include -inline unsigned getDegree(Graph &g, GNode v) { - return std::distance(g.edge_begin(v), g.edge_end(v)); +inline unsigned getDegree(Graph *g, GNode v) { + return std::distance(g->edge_begin(v), g->edge_end(v)); } namespace deepgalois { -void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph &g, std::vector °rees) { +void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { + galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); + begin_ = begin; + end_ = end; + count_ = count; + masks_ = masks; + graph = g; +#ifndef GALOIS_USE_DIST + masked_graph = new Graph(); +#endif + generate_masked_graph(g->size(), masks, g, *masked_graph); + size_t idx = 0; + vertices_.resize(count); + for (size_t i = begin; i < end; i++) { + if (masks_[i] == 1) vertices_[idx++] = i; + } +} + +void Sampler::get_masked_degrees(size_t n, mask_t 
*masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { if (masks[src] == 1) { - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } }, galois::loopname("update_degrees")); } -void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *sub) { +void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; + galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST - sub = new Graph(); - sub->allocateFrom(n, ne); - sub->constructNodes(); + sub.allocateFrom(n, ne); + sub.constructNodes(); galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { - g.fixEndEdge(src, offsets[src+1]); + sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (const auto e : g.edges(src)) { - const auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) g.constructEdge(idx++, dst, 0); + for (const auto e : g->edges(src)) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); @@ -48,37 +66,48 @@ void Sampler::generate_masked_graph(size_t n, mask_t *masks, Graph &g, Graph *su // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. -void Sampler::select_vertices(size_t nv, size_t n, int m, Graph &g, VertexList vertices, VertexSet &vertex_set) { +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { + galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); assert(nv == vertices.size()); auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); + galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; - galois::do_all(galois::iterate(size_t(0), g.size()), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; auto degree = degrees[pos]; - auto neighbor_id = rand() % degree; // randomly select a neighbor - auto dst = g.getEdgeDst(g.edge_begin(u) + neighbor_id); - frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); - vertex_set.insert(u); + int j =0; + for (; j < degree; j ++) { + auto neighbor_id = rand() % degree; // randomly select a neighbor + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + if (vertex_set.find(dst) == vertex_set.end()) { + frontier[pos] = dst; + degrees[pos] = getDegree(g, frontier[pos]); + vertex_set.insert(dst); + break; + } + } + if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), "\n"); assert(n == vertex_set.size()); } void 
Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - std::fill(masks, masks+n, 0); - for (auto v : vertices) masks[v] = 1; + galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + std::fill(masks, masks+n, 0); + for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { - VertexList new_ids(vertex_set.size(), 0); +inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { new_ids[v] = vid++; // reindex @@ -90,23 +119,24 @@ inline VertexList Sampler::reindexing_vertice(VertexSet vertex_set) { void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { //auto n = g.size(); // old graph size auto nv = vertex_set.size(); // new graph (subgraph) size - VertexList new_ids = reindexing_vertice(vertex_set); + VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; + galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { - g.fixEndEdge(i, offsets[i+1]); + sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; for (auto e : g.edges(old_id)) { - g.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + sub.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); j ++; } }, galois::loopname("construct_graph")); @@ -115,10 +145,13 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, *masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set - generate_masked_graph(n, masks, *masked_graph, &sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, *masked_graph, sg); +#ifndef GALOIS_USE_DIST + Graph masked_sg; + generate_masked_graph(graph->size(), masks, masked_graph, masked_sg); // remove edges whose destination is not masked + generate_subgraph(vertex_set, masked_sg, sg); +#endif } } // end namespace From 4217994068865950c702f2a6f8fef00f53c8dd70 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 19:40:04 -0500 Subject: [PATCH 197/660] fix pointers --- .../include/deepgalois/DistContext.h | 2 ++ libdeepgalois/include/deepgalois/context.h | 2 ++ .../include/deepgalois/layers/layer.h | 1 + libdeepgalois/include/deepgalois/net.h | 3 -- libdeepgalois/src/context.cpp | 28 +++++++++++++++ libdeepgalois/src/net.cpp | 34 ++----------------- 6 files changed, 36 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 4444143f09..21dd025aec 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -49,6 +49,8 @@ class DistContext { // TODO this is a distributed operation void 
norm_factor_counting(); void createSubgraph() {} + void gen_subgraph_labels(size_t m, const mask_t *masks) {} + void gen_subgraph_feats(size_t m, const mask_t *masks) {} float_t* get_norm_factor_ptr() { return norm_factor; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 15b9605cec..eb41fdf200 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -48,6 +48,8 @@ class Context { float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } + void gen_subgraph_labels(size_t m, const mask_t *masks); + void gen_subgraph_feats(size_t m, const mask_t *masks); #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c604f6ffbe..b49c8797a9 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -73,6 +73,7 @@ class layer : public deepgalois::node { virtual void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index d0adf2d55f..820367bef5 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -113,9 +113,6 @@ class Net { deepgalois::DistContext* context; #endif - void lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sub_labels); - void lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats); - #ifdef CPU_ONLY // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f6d443f4f1..8d779c0b80 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -31,6 +31,34 @@ void Context::createSubgraph() { subgraph_cpu = new Graph(); } +// generate labels for the subgraph, m is subgraph size +void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { + if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + size_t count = 0; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + if (is_single_class) { + h_labels_subg[count] = h_labels[i]; + } else { + std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, h_labels_subg+count*num_classes); + } + count ++; + } + } +} + +// generate input features for the subgraph, m is subgraph size +void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { + size_t count = 0; + if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + for (size_t i = 0; i < n; i++) { + if (masks[i] == 1) { + std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, h_feats_subg+count*feat_len); + count ++; + } + } +} + size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); diff --git 
a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index e9ed3b4fd4..a991e5fe17 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -109,34 +109,6 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } -// generate labels for the subgraph -void Net::lookup_labels(size_t n, const mask_t *masks, const label_t *labels, label_t *sg_labels) { - if (sg_labels == NULL) sg_labels = new label_t[subgraph_sample_size]; - size_t count = 0; - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - if (is_single_class) { - sg_labels[count] = labels[i]; - } else { - std::copy(labels+i*num_classes, labels+(i+1)*num_classes, sg_labels+count*num_classes); - } - count ++; - } - } -} - -void Net::lookup_feats(size_t n, const mask_t *masks, const float_t *feats, float_t *sg_feats) { - size_t count = 0; - size_t len = feature_dims[0]; - if (sg_feats == NULL) sg_feats = new float_t[subgraph_sample_size*len]; - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - std::copy(feats+i*len, feats+(i+1)*len, sg_feats+count*len); - count ++; - } - } -} - void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -180,12 +152,12 @@ void Net::train(optimizer* opt, bool need_validate) { layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); // update labels for subgraph - lookup_labels(num_samples, subgraph_masks, context->get_labels_ptr(), context->get_labels_subg_ptr()); + context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - lookup_feats(num_samples, subgraph_masks, context->get_feats_ptr(), context->get_feats_subg_ptr()); - layers[0]->set_in_data(context->get_feats_subg_ptr()); // feed input data + context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); + layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data #endif num_subg_remain += 1; // num_threads } From db20360b31885992ec70c290f7c9d351d6f1883a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 27 Apr 2020 23:19:52 -0500 Subject: [PATCH 198/660] fix norm_factor --- .../include/deepgalois/DistContext.h | 3 +-- libdeepgalois/include/deepgalois/context.h | 4 +++- .../deepgalois/layers/graph_conv_layer.h | 7 ++---- .../include/deepgalois/layers/layer.h | 2 +- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/context.cpp | 24 ++++++++++--------- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/layers/aggregator.cpp | 8 +++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 14 ++++++++--- libdeepgalois/src/layers/graph_conv_layer.cu | 2 ++ libdeepgalois/src/net.cpp | 5 +++- 11 files changed, 43 insertions(+), 30 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 21dd025aec..04aca5fc9e 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -47,7 +47,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_counting(); + void norm_factor_counting(size_t g_size); void createSubgraph() {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} @@ -63,7 +63,6 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); - //! 
return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index eb41fdf200..e368319dff 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -33,8 +33,9 @@ class Context { float_t* get_norm_factor_ptr() { return norm_factor; } void set_label_class(bool is_single = true) { is_single_class = is_single; } + void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_counting(); + void norm_factor_counting(size_t g_size); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N @@ -71,6 +72,7 @@ class Context { size_t feat_len; // input feature length: D bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 7f0aa5a9a3..dc38642330 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -36,7 +36,6 @@ class graph_conv_layer : public layer { void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } void set_netphase(net_phase ctx) override { phase_ = ctx; } - void set_context(layer::ContextType* ctx) { context = ctx; norm_factor = ctx->get_norm_factor_ptr(); } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data @@ -48,14 +47,13 @@ class graph_conv_layer : public layer { // user-defined aggregate function #ifdef CPU_ONLY virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); -#ifndef CPU_ONLY - void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); -#endif private: bool act_; // whether to use activation function at the end @@ -70,7 +68,6 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x unsigned* dropout_mask; // x*y - float_t* norm_factor; // normalization constant based on graph structure // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b49c8797a9..79196172ca 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -70,7 +70,7 @@ class layer : public deepgalois::node { // set methods virtual void set_netphase(net_phase phase) {} - virtual void set_context(ContextType* ctx) { context = ctx; } + void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2a9ad81575..174e7eb210 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,7 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_counting() { +void DistContext::norm_factor_counting(size_t g_size) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 8d779c0b80..b17f6d7eaa 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -8,8 +8,9 @@ namespace deepgalois { #ifdef CPU_ONLY -Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), +Context::Context() : n(0), num_classes(0), + feat_len(0), is_single_class(true), + is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_labels_subg(NULL), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), @@ -119,15 +120,16 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -void Context::norm_factor_counting() { - norm_factor = new float_t[n]; - galois::do_all(galois::iterate((size_t)0, n), - [&](auto v) { - auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); +void Context::norm_factor_counting(size_t g_size) { + Graph *g = graph_cpu; + if (use_subgraph) g = subgraph_cpu; + if (norm_factor == NULL) norm_factor = new float_t[g_size]; + galois::do_all(galois::iterate((size_t)0, 
g_size), [&](auto v) { + auto degree = std::distance(g->edge_begin(v), g->edge_end(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) norm_factor[v] = 0.0; + else norm_factor[v] = 1.0 / temp; + }, galois::loopname("NormCounting")); } void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index bdef92b52a..23abd3f1c2 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -97,7 +97,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_counting() { +void Context::norm_factor_counting(size_t g_size) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 40a8fdcf8f..65308172f1 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -6,12 +6,11 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou bool norm, const float_t* norm_factor) { // zero out the output data #ifndef GALOIS_USE_DIST - galois::do_all(galois::iterate(g), + galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); - galois::do_all(galois::iterate(rangeObj), + galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif - [&](const GNode src) { deepgalois::math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -29,10 +28,11 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // use scaled data to update deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); // out[src] += in[dst] - } else + } else { // add embeddings from neighbors together deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); // out[src] += in[dst] + } } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9903768070..5e3b6aa320 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,7 +37,15 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t #ifdef CPU_ONLY void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + // normalization constant based on graph structure + float_t* norm_consts = context->get_norm_factor_ptr(); + update_all(len, g, in, out, norm_, norm_consts); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + float_t* norm_consts = context->get_norm_factor_ptr(); + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { @@ -83,6 +91,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for 
aggregation // else: aggregate first then mult W (not implemented yet) @@ -115,9 +124,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying - // x*y NOTE: since graph is symmetric, the derivative is the same // this is the aggregate call - deepgalois::update_all(z, *graph_cpu, out_grad, out_temp, norm_, norm_factor); // x*x; x*z -> x*z + graph_conv_layer::d_aggregate(z, *graph_cpu, out_grad, out_temp); #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ed89089450..c3f97a49d4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -30,6 +30,7 @@ void graph_conv_layer::malloc_and_init() { } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else @@ -38,6 +39,7 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { + float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index a991e5fe17..7c9d049fc2 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -32,6 +32,7 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_label_class(is_single_class); + context->set_use_subgraph(subgraph_sample_size > 0); num_samples = context->read_graph(dataset_str, selfloop); if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else @@ -158,6 +159,8 @@ void Net::train(optimizer* opt, bool need_validate) { // update features for subgraph context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + + context->norm_factor_counting(subgraph_sample_size); #endif num_subg_remain += 1; // num_threads } @@ -334,7 +337,7 @@ void Net::construct_layers() { layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - context->norm_factor_counting(); + if (!subgraph_sample_size) context->norm_factor_counting(num_samples); set_contexts(); } From d242fab483a4a07fcb2959351d1941462876eba9 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 08:31:24 -0500 Subject: [PATCH 199/660] udapte g_conv --- libdeepgalois/src/layers/graph_conv_layer.cpp | 64 +++++++++++-------- 1 file changed, 36 insertions(+), 28 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 5e3b6aa320..c7d0307fd4 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -36,6 +36,7 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } #ifdef CPU_ONLY +// aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { 
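// A rough per-vertex sketch of what the aggregate call below computes (the
// real work happens in update_all / update_all_csrmm; loop variables in this
// sketch are illustrative only, and the scale w is simply 1 when norm_ is off):
//
//   for (GNode src = 0; src < g.size(); src++) {
//     math::clear_cpu(len, &out[src * len]);
//     for (auto e : g.edges(src)) {
//       auto dst  = g.getEdgeDst(e);
//       float_t w = norm_consts[src] * norm_consts[dst];  // 1/sqrt(deg) per endpoint
//       for (size_t k = 0; k < len; k++)
//         out[src * len + k] += w * in[dst * len + k];    // out[src] += w * in[dst]
//     }
//   }
//
// With norm_consts[v] = 1/sqrt(degree(v)) from norm_factor_counting(), this is
// the standard GCN propagation out = D^{-1/2} A D^{-1/2} * in, evaluated row by row.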
// normalization constant based on graph structure float_t* norm_consts = context->get_norm_factor_ptr(); @@ -84,6 +85,7 @@ void graph_conv_layer::malloc_and_init() { in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x + if (y <= z) in_temp1 = new float_t[x * y]; } // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) @@ -92,22 +94,26 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t y = input_dims[1]; size_t z = output_dims[1]; //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; + // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation - // else: aggregate first then mult W (not implemented yet) - if (dropout_ && phase_ == net_phase::train) { + // else: aggregate first then mult W + if (dropout_ && phase_ == net_phase::train) math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - } else math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_data, &layer::W[0], 0.0, out_temp); + else math::copy_cpu(x*y, in_data, in_temp); - // aggregate based on graph topology - graph_conv_layer::aggregate(z, *graph_cpu, out_temp, out_data); + if (y > z) { + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + aggregate(z, *graph_cpu, out_temp, out_data); + } else { + aggregate(y, *graph_cpu, in_temp, in_temp1); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); + } #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync("AggSync"); + layer::context->getSyncSubstrate()->sync("AggSync"); #endif // run relu activation on output if specified if (act_) math::relu_cpu(x*z, out_data, out_data); @@ -122,34 +128,36 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? 
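// Chain-rule recap for the code below, written for the y > z branch where the
// forward pass computed out = A_hat * (in * W), with A_hat the normalized
// adjacency applied by aggregate():
//   d(loss)/d(in) = A_hat^T * d(loss)/d(out) * W^T
//   d(loss)/d(W)  = in^T * (A_hat^T * d(loss)/d(out))
// Since the input graph is symmetric, A_hat^T == A_hat, which is why
// d_aggregate() reuses the same kernel as aggregate(); the W^T and in^T factors
// appear below as the CblasTrans arguments to sgemm_cpu.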
if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); - //else deepgalois::math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + //else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + + if (y > z) { + d_aggregate(z, *graph_cpu, out_grad, out_temp); + // at this point, out_temp has the derivative of data from last step to + // use for both updating gradients for features and gradients for weights + // this calculates gradients for the node predictions + if (level_ != 0) // no need to calculate in_grad for the first layer + // derivative of matmul needs transposed matrix + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + // calculate weight gradients using input data; multiplied by gradients from last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } else { + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], 0.0, in_temp); + d_aggregate(y, *graph_cpu, in_temp, in_grad); + } + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); + } - // this is the aggregate call - graph_conv_layer::d_aggregate(z, *graph_cpu, out_grad, out_temp); #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync("AggSyncBack"); + layer::context->getSyncSubstrate()->sync("AggSyncBack"); #endif - // at this point, out_temp has the derivative of data from last step to - // use for both updating gradients for features and gradients for weights - // this calculates gradients for the node predictions - if (level_ != 0) { // no need to calculate in_grad for the first layer - // derivative of matmul needs transposed matrix - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, - out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - if (dropout_) { - math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); - } - } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); - // calculate weight gradients using input data - // multiplied by gradients from last back prop step - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, - out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); From 11f7ce4f910aa79a5911ceb2311a901c30e12119 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 16:06:06 -0500 Subject: [PATCH 200/660] add MKL --- CMakeLists.txt | 14 +++++++ cmake/Modules/FindMKL.cmake | 24 ++++++++++++ libdeepgalois/CMakeLists.txt | 30 ++++++++------ .../include/deepgalois/layers/layer.h | 1 - .../include/deepgalois/math_functions.hh | 11 +++--- libdeepgalois/include/deepgalois/optimizer.h | 16 +------- libdeepgalois/src/layers/aggregator.cpp | 20 +++++----- libdeepgalois/src/layers/graph_conv_layer.cpp | 9 +++++ libdeepgalois/src/layers/graph_conv_layer.cu | 1 + libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + libdeepgalois/src/layers/l2_norm_layer.cu | 1 + libdeepgalois/src/layers/leaky_relu_layer.cpp | 1 + libdeepgalois/src/layers/leaky_relu_layer.cu | 1 + libdeepgalois/src/layers/relu_layer.cpp | 1 + libdeepgalois/src/layers/relu_layer.cu | 1 + .../src/layers/sigmoid_loss_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cu | 1 + 
.../src/layers/softmax_loss_layer.cpp | 1 + .../src/layers/softmax_loss_layer.cu | 1 + libdeepgalois/src/math_functions.cpp | 39 ++++++++++--------- libdeepgalois/src/net.cpp | 9 ++++- libdeepgalois/src/net.cu | 1 + libdeepgalois/src/optimizer.cpp | 1 + libdeepgalois/src/optimizer.cu | 17 +++++++- libdeepgalois/src/sampler.cpp | 9 ++++- .../include/galois/graphs/LC_CSR_Graph.h | 3 ++ lonestargnn/CMakeLists.txt | 14 +++---- 27 files changed, 157 insertions(+), 72 deletions(-) create mode 100644 cmake/Modules/FindMKL.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 4be9753f54..1f1b853aef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -16,6 +16,8 @@ set(GALOIS_COPYRIGHT_YEAR "2018") # Also in COPYRIGHT #set(GCC_MPIP_LINK_FLAGS "-L${MPIP_DIR} -L${LIBUNWIND_DIR} -lmpiP -lbfd -liberty -lm -lunwind") #link_directories(LIBUNWIND_DIR MPIP_DIR) +SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) if(NOT CMAKE_BUILD_TYPE) message(STATUS "No build type selected, default to release") @@ -37,6 +39,7 @@ set(NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a single mach ###### General features ###### set(USE_GPROF OFF CACHE BOOL "Enable GCC profiling") set(USE_VTUNE OFF CACHE BOOL "Use VTune for profiling") +set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") set(USE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(USE_HPCTK OFF CACHE BOOL "Use HPCToolKit for profiling") set(USE_STRICT_CONFIG OFF CACHE BOOL "Instead of falling back gracefully, fail") @@ -320,6 +323,17 @@ if(USE_VTUNE) endif() endif() +if(USE_MKL_BLAS) + SET(MKL_ROOT /opt/apps/sysnet/intel/17.0/mkl) + find_package(MKL) + message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") + if (MKL_FOUND) + include_directories(${MKL_INCLUDE_DIRS}) + else() + message(WARNING "MKL not found") + endif() +endif() + if(USE_PAPI) if (PAPI_ROOT STREQUAL "") set(PAPI_ROOT /usr) diff --git a/cmake/Modules/FindMKL.cmake b/cmake/Modules/FindMKL.cmake new file mode 100644 index 0000000000..d87020f770 --- /dev/null +++ b/cmake/Modules/FindMKL.cmake @@ -0,0 +1,24 @@ +# Find MKL libraries +# Once done this will define +# MKL_FOUND - System has MKL +# MKL_INCLUDE_DIRS - The MKL include directories +# MKL_LIBRARIES - The libraries needed to use MKL + +set(MKL_LIBRARIES) # Include-only library + +if(MKL_INCLUDE_DIRS) + set(MKL_FIND_QUIETLY TRUE) +endif() + +find_path(MKL_INCLUDE_DIRS mkl.h PATHS ${MKL_ROOT} PATH_SUFFIXES include) +message(STATUS "MKL_INCLUDE_DIRS: ${MKL_INCLUDE_DIRS}") +find_library(MKL_LIBRARY NAMES mkl_rt PATHS ${MKL_ROOT} PATH_SUFFIXES lib/intel64) +message(STATUS "MKL_LIBRARY: ${MKL_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(MKL DEFAULT_MSG MKL_LIBRARY MKL_INCLUDE_DIRS) +if(MKL_FOUND) + set(MKL_FOUND on) +endif() + +mark_as_advanced(MKL_INCLUDE_DIRS) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3f592f0d18..de0cd30dc9 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,10 +1,20 @@ cmake_minimum_required(VERSION 2.8) -# open blas -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +set(BLAS_LIB -lopenblas) +if(USE_MKL_BLAS) + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + 
set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) +message(STATUS "BLAS_INC_DIR: ${BLAS_INC_DIR}") +message(STATUS "BLAS_LIB_DIR: ${BLAS_LIB_DIR}") # galois base libs include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) @@ -19,9 +29,7 @@ else() include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers include_directories("${MGPU_ROOT}/src") - - SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) - include_directories(${CUDA_INC}) + include_directories(${CUDA_HOME}/include) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) find_package(CUDA REQUIRED) @@ -32,9 +40,7 @@ else() #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_61,code=sm_61) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -gencode arch=compute_70,code=sm_70) #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) - #set(CUDA_INCLUDE_DIRS /org/centers/cdgc/cuda/cuda-10.0/include ${CUDA_INCLUDE_DIRS}) - SET(CUDA_LIB /org/centers/cdgc/cuda/cuda-10.0/lib64/) - link_directories(${CUDA_LIB}) + link_directories(${CUDA_HOME}/lib64) link_directories(${CMAKE_SOURCE_DIR}/libgpu) set(CUDA_SOURCES @@ -100,7 +106,7 @@ endif() add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem gllvm) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) -target_link_libraries(dg_cpu -lopenblas) +target_link_libraries(dg_cpu ${BLAS_LIB}) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 79196172ca..7f1c05ce60 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -16,7 +16,6 @@ #include "deepgalois/DistContext.h" #endif #include "deepgalois/optimizer.h" -#include "deepgalois/math_functions.hh" #include "deepgalois/layers/node.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/GluonSubstrate.h" diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 9c0e58dc45..72b836da64 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -10,15 +10,16 @@ #include #include "deepgalois/types.h" +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header extern "C" { #include -//#include } - -// TODO namespace - +#endif namespace deepgalois { + namespace math { //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); @@ -27,6 +28,7 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); void axpy(size_t n, const float_t a, float_t *x, float_t *y); +int argmax(const size_t n, const float_t* x); // the arguments of the maxima //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); //! 
clear n elements of a vector @@ -118,7 +120,6 @@ void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, void transpose2D(const tensor_t& in, tensor_t& out); void transpose2D1D(const tensor_t& in, vec_t& out); int argmax(const size_t n, const vec_t& x); // the arguments of the maxima -int argmax(const size_t n, const float_t* x); // the arguments of the maxima // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index b6a90917ff..b745f12cb6 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -16,9 +16,6 @@ #include #include #include "deepgalois/types.h" -#ifndef CPU_ONLY -#include "deepgalois/math_functions.hh" -#endif namespace deepgalois { @@ -41,10 +38,8 @@ struct optimizer { template struct stateful_optimizer : public optimizer { void reset() override { - for (auto& e : E_) - e.clear(); + for (auto& e : E_) e.clear(); } - protected: template vec_t& get(const vec_t& key) { @@ -56,14 +51,7 @@ struct stateful_optimizer : public optimizer { std::unordered_map E_[N]; #ifndef CPU_ONLY template - float_t *get_gpu(const size_t n, const float_t *key) { - static_assert(Index < N, "index out of range"); - if (!is_allocated_device(dE_[Index][key])) { - float_malloc_device(n, dE_[Index][key]); - init_const_gpu(n, 0.0, dE_[Index][key]); - } - return dE_[Index][key]; - } + float_t *get_gpu(const size_t n, const float_t *key); std::unordered_map dE_[N]; #endif }; diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 65308172f1..b374dd9d91 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -11,7 +11,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif - deepgalois::math::clear_cpu(len , &out[src * len]); + math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed @@ -24,14 +24,12 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou b = a * norm_factor[dst]; vec_t neighbor(len); // scale the neighbor's data using the normalization factor - deepgalois::math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); - // use scaled data to update - deepgalois::math::vadd_cpu(len, &out[src * len], &neighbor[0], - &out[src * len]); // out[src] += in[dst] + math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { - // add embeddings from neighbors together - deepgalois::math::vadd_cpu(len, &out[src * len], &in[dst * len], - &out[src * len]); // out[src] += in[dst] + // add embeddings from neighbors together; out[src] += in[dst] + math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); } } }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); @@ -40,8 +38,8 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); - deepgalois::math::clear_cpu(n*len, out); - //csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, - // (const 
int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + math::clear_cpu(n*len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c7d0307fd4..dae3d14ce5 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" namespace deepgalois { @@ -40,13 +41,21 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure float_t* norm_consts = context->get_norm_factor_ptr(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else update_all(len, g, in, out, norm_, norm_consts); +#endif } // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { float_t* norm_consts = context->get_norm_factor_ptr(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif } void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index c3f97a49d4..41f6e30a0f 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/graph_conv_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 46379aed60..3e12a1d603 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu index 56128eb0d3..e600b6fbbb 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cu +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/l2_norm_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index 0d5a7f66fb..f7cfe375cc 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu index 43e7f93d04..6fe4d005ac 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cu +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/leaky_relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 2e89af1bd5..aee6e29a07 100644 --- 
a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu index f3a45936b4..0d39a9dab2 100644 --- a/libdeepgalois/src/layers/relu_layer.cu +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/relu_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 19606eec6c..ca34389127 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index 4159569601..f00689dfc9 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 9e4fda933e..f1c1aa27e4 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,4 +1,5 @@ #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index fd3fc11140..59a955526b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -1,4 +1,5 @@ #include "deepgalois/layers/softmax_loss_layer.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 555eb7bfca..aa41ffc41f 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -4,10 +4,13 @@ #include #include "deepgalois/utils.h" +#ifdef USE_MKL +#include +#else // If use MKL, simply include the MKL header extern "C" { #include -//#include } +#endif #define NOT_IMPLEMENTED \ do { \ @@ -38,9 +41,11 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #ifdef USE_MKL const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; - mkl_scsrmm (&transa, &M , &N, &K, &alpha , matdescra, - A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, - B, &N, &beta , C, &N); + printf("Calling Intel MKL\n"); + exit(1); + mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, + A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, + B, &N, &beta , C, &N); #else NOT_IMPLEMENTED; #endif @@ -126,6 +131,18 @@ void axpy(size_t n, const float_t a, float_t *x, float_t *y) { cblas_saxpy(n, a, x, 1, y, 1); } +int argmax(const size_t n, const float_t* x) { + float_t max = x[0]; + int max_ind = 0; + for (size_t i = 1; i < n; i++) { + if (x[i] > max) { + max_ind = i; + max = x[i]; + } + } + return max_ind; +} + float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } @@ -549,20 +566,6 @@ int argmax(const size_t n, const vec_t& x) { return max_ind; } -int argmax(const size_t n, const 
float_t* x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } - } - return max_ind; -} - - - void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7c9d049fc2..7da9fcbb18 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -4,6 +4,7 @@ #include "deepgalois/net.h" #include "deepgalois/utils.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { @@ -84,6 +85,10 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, #endif } + if (subgraph_sample_size > train_count) { + galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); + exit(1); + } // NOTE: train_begin/train_end are global IDs, train_masks is a local id // train count and val count are LOCAL counts @@ -440,7 +445,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks #ifndef GALOIS_USE_DIST if (masks[i] == 1) { // get prediction - int preds = argmax(num_classes, + int preds = math::argmax(num_classes, &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); // check prediction if ((label_t)preds == context->get_label(i)) @@ -455,7 +460,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - int preds = argmax(num_classes, + int preds = math::argmax(num_classes, &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); // check prediction if ((label_t)preds == context->get_label(localID)) diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 900ba1a762..3077566512 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,5 +1,6 @@ #include "deepgalois/net.h" #include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" #include "gg.h" #include "ggcuda.h" #include diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index c3267f282e..0f00b4da33 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/optimizer.h" #include "galois/Galois.h" +#include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 7628c3aeba..355d959254 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -14,7 +14,20 @@ __global__ void update_kernel(const int n, float_t alpha, float_t b1, } } -void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { +namespace deepgalois { + +template +template +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { + static_assert(Index < N, "index out of range"); + if (!is_allocated_device(dE_[Index][key])) { + float_malloc_device(n, dE_[Index][key]); + init_const_gpu(n, 0.0, dE_[Index][key]); + } + return dE_[Index][key]; +} + +void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); float_t* cache = get_gpu<0>(n, W); @@ -25,3 +38,5 @@ void deepgalois::adam::update_gpu(const size_t n, const float_t* dW, float_t* W) b1_t *= b1; b2_t *= b2; } + +} diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 
c126660fb4..a0816f4cea 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -96,8 +96,15 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), "\n"); assert(n == vertex_set.size()); + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); + int counter = 0; + for (int i : vertex_set) { + counter ++; + if (counter > 16 && counter < n-16) continue; + galois::gPrint(i, " "); + } + galois::gPrint(" )\n"); } void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 5516b22a92..a786b1b6aa 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -323,6 +323,9 @@ class LC_CSR_Graph : ar >> edgeData; } + // cxh + uint64_t* row_start_ptr() { return &edgeIndData[0]; } + uint32_t* edge_dst_ptr() { return &edgeDst[0]; } /** * Accesses the "prefix sum" of this graph; takes advantage of the fact * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed + diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index a06dd1907b..b551fa8acb 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -5,17 +5,17 @@ include_directories(BEFORE include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) - -SET(CUDA_INC /org/centers/cdgc/cuda/cuda-10.0/include) -include_directories(${CUDA_INC}) +include_directories(${CUDA_HOME}/include) if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(OPENBLAS_INC /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/include) -SET(OPENBLAS_LIB /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build/lib) -include_directories(${OPENBLAS_INC}) -link_directories(${OPENBLAS_LIB}) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +if(USE_MKL_BLAS) + SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") +endif() +link_directories(${BLAS_LIB_DIR}) + if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") endif() From a2583e8cde5e0805a1f8680349981f7a1b8a612f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:34:23 -0500 Subject: [PATCH 201/660] app no longer used by cmake: gcn use new syntax --- lonestargnn/gcn/CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestargnn/gcn/CMakeLists.txt index 48c7156dcc..eff742aa69 100644 --- a/lonestargnn/gcn/CMakeLists.txt +++ b/lonestargnn/gcn/CMakeLists.txt @@ -1,9 +1,13 @@ -app(gcn gcn.cpp) -target_link_libraries(gcn dg_cpu) +#app(gcn gcn.cpp) +add_executable(gcn gcn.cpp) +target_link_libraries(gcn PRIVATE Galois::shmem lonestar) + +target_link_libraries(gcn PRIVATE dg_cpu) if(ENABLE_DIST_GALOIS) - target_link_libraries(gcn distgraphloader) + target_link_libraries(gcn PRIVATE distgraphloader) endif() + if(ENABLE_HETERO_GALOIS) - target_link_libraries(gcn dg_gpu) - target_link_libraries(gcn -lcudart -lcublas -lcurand -lcudadevrt) + target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) endif() From 3cd4f8e14189f060d84086e8faac549e6a26fa27 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:36:53 -0500 Subject: 
[PATCH 202/660] signed vs unsigned comparison warning fix --- libdeepgalois/src/sampler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a0816f4cea..257cf1edef 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -98,7 +98,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } assert(n == vertex_set.size()); galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); - int counter = 0; + unsigned counter = 0; for (int i : vertex_set) { counter ++; if (counter > 16 && counter < n-16) continue; From 623809922ee8727ea3df86af1ba379890309cae5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:37:18 -0500 Subject: [PATCH 203/660] initializing endbyte in numamem to var to avoid warning --- libgalois/src/NumaMem.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgalois/src/NumaMem.cpp b/libgalois/src/NumaMem.cpp index cd86a970a0..01bdef4545 100644 --- a/libgalois/src/NumaMem.cpp +++ b/libgalois/src/NumaMem.cpp @@ -99,7 +99,7 @@ static void pageInSpecified(void* _ptr, size_t len, size_t pageSize, // first place if (beginLocation != endLocation) { size_t beginByte = beginLocation * elementSize; - size_t endByte; + size_t endByte = 0; if (endLocation != 0) { // -1 since end * element will result in the first From a36c9a2a8e024fe9a8768cc6b56fae717a32bc69 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:55:25 -0500 Subject: [PATCH 204/660] fixing build gcc 8.1 for lonestargnn boiler --- lonestargnn/include/lonestargnn.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 324f5a31ba..77a2777d5f 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -63,9 +63,10 @@ llvm::cl::opt statFile( llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); -static void LonestarGnnPrintVersion() { - std::cout << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" - << galois::getRevision() << ")\n"; +static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; + out.flush(); } //! 
initialize lonestargnn benchmark @@ -80,7 +81,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { #endif - LonestarGnnPrintVersion(); + LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; From 70dece5548eb0cb6914b007a2879ed8eea495860 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 28 Apr 2020 17:55:47 -0500 Subject: [PATCH 205/660] cmake fixes to build gcn after merge TODO openblas needs to be recomopiled with 8.1 --- CMakeLists.txt | 6 +++--- libdeepgalois/CMakeLists.txt | 3 +-- lonestargnn/CMakeLists.txt | 5 ----- 3 files changed, 4 insertions(+), 10 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 715ecbc8a9..ef921b9e2d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,9 +6,6 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) -# TODO; this is GNN related; find better way to do than hardcode -SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) @@ -47,6 +44,9 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") +# TODO; this is GNN related; find better way to do than hardcode +SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. 
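The hunk above leaves CUDA_HOME as a plain hard-coded SET(), and the TODO comment it carries concedes that a better mechanism is wanted. A minimal sketch of the cache-variable alternative that TODO points at, mirroring how USE_DEEPGALOIS and USE_MKL_BLAS are already declared (illustration only, not part of any commit in this series):

    # Hypothetical alternative to the hard-coded path above: a cache variable
    # keeps the same default but can be overridden per machine at configure time.
    set(CUDA_HOME "/org/centers/cdgc/cuda/cuda-10.0"
        CACHE PATH "Root directory of the CUDA toolkit")

Declared this way, a different toolkit location can be selected with -DCUDA_HOME=... on the cmake command line instead of editing CMakeLists.txt; the same treatment would apply to OPENBLAS_ROOT.
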
diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index de0cd30dc9..e36e5784bd 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -104,11 +104,10 @@ set(sources endif() add_library(dg_cpu STATIC ${sources}) -target_link_libraries(dg_cpu galois_shmem gllvm) +target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) target_link_libraries(dg_cpu ${BLAS_LIB}) target_include_directories(dg_cpu PUBLIC - ${CMAKE_SOURCE_DIR}/libllvm/include ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include ) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index b551fa8acb..1ae1c63d78 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -1,8 +1,3 @@ -include_directories(BEFORE - ${CMAKE_SOURCE_DIR}/libllvm/include - ${CMAKE_CURRENT_BINARY_DIR}/../libllvm/include -) - include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) From 47684cea96c339211f2e067511b93109882689dd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 19:42:00 -0500 Subject: [PATCH 206/660] fix openblas --- CMakeLists.txt | 12 +++++++++++- cmake/Modules/FindOpenBLAS.cmake | 24 ++++++++++++++++++++++++ libdeepgalois/CMakeLists.txt | 6 +++--- lonestargnn/CMakeLists.txt | 2 +- 4 files changed, 39 insertions(+), 5 deletions(-) create mode 100644 cmake/Modules/FindOpenBLAS.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index ef921b9e2d..5a0d440a3c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,6 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(OPENBLAS_ROOT /net/ohm/export/cdgc/cxh/OpenBLAS-faraday/build) SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) # This option is automatically handled by CMake. 
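The next hunk of this CMakeLists.txt adds an optional find_package(OpenBLAS) path, backed by the new cmake/Modules/FindOpenBLAS.cmake introduced further down in this patch; that module reports OPENBLAS_FOUND, OPENBLAS_INCLUDE_DIRS and OPENBLAS_LIBRARY. A minimal consumer-side sketch of how a target such as dg_cpu could pick up those results (assumed usage for illustration only; the commit itself keeps linking through the BLAS_LIB flags set in libdeepgalois/CMakeLists.txt):

    # Hypothetical usage of the FindOpenBLAS module added by this patch.
    find_package(OpenBLAS)                # resolves cblas.h and libopenblas
    if(OPENBLAS_FOUND)
      target_include_directories(dg_cpu PUBLIC ${OPENBLAS_INCLUDE_DIRS})
      target_link_libraries(dg_cpu ${OPENBLAS_LIBRARY})
    endif()

The target-scoped form shown here avoids the directory-wide include_directories()/link_directories() calls used elsewhere in these patches, but either style works once the module is on CMAKE_MODULE_PATH.
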
@@ -270,6 +269,17 @@ if(USE_MKL_BLAS) endif() endif() +SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +if(USE_OPENBLAS) + find_package(OpenBLAS) + message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") + if (OPENBLAS_FOUND) + include_directories(${OPENBLAS_INCLUDE_DIRS}) + else() + message(WARNING "OpenBLAS not found") + endif() +endif() + if(USE_PAPI) if (PAPI_ROOT STREQUAL "") set(PAPI_ROOT /usr) diff --git a/cmake/Modules/FindOpenBLAS.cmake b/cmake/Modules/FindOpenBLAS.cmake new file mode 100644 index 0000000000..3f595744d0 --- /dev/null +++ b/cmake/Modules/FindOpenBLAS.cmake @@ -0,0 +1,24 @@ +# Find OpenBLAS libraries +# Once done this will define +# OpenBLAS_FOUND - System has OpenBLAS +# OpenBLAS_INCLUDE_DIRS - The OpenBLAS include directories +# OpenBLAS_LIBRARIES - The libraries needed to use OpenBLAS + +set(OPENBLAS_LIBRARIES) # Include-only library + +if(OPENBLAS_INCLUDE_DIRS) + set(OPENBLAS_FIND_QUIETLY TRUE) +endif() + +find_path(OPENBLAS_INCLUDE_DIRS cblas.h PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES include/openblas) +message(STATUS "OPENBLAS_INCLUDE_DIRS: ${OPENBLAS_INCLUDE_DIRS}") +find_library(OPENBLAS_LIBRARY NAMES openblas PATHS ${OPENBLAS_ROOT} PATH_SUFFIXES lib64) +message(STATUS "OPENBLAS_LIBRARY: ${OPENBLAS_LIBRARY}") + +include(FindPackageHandleStandardArgs) +find_package_handle_standard_args(OPENBLAS DEFAULT_MSG OPENBLAS_LIBRARY OPENBLAS_INCLUDE_DIRS) +if(OPENBLAS_FOUND) + set(OPENBLAS_FOUND on) +endif() + +mark_as_advanced(OPENBLAS_INCLUDE_DIRS) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e36e5784bd..fffe49af1a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,8 +1,8 @@ cmake_minimum_required(VERSION 2.8) -SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include) -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) -set(BLAS_LIB -lopenblas) +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") if(USE_MKL_BLAS) SET(BLAS_INC_DIR ${MKL_ROOT}/include) SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 1ae1c63d78..24c9c6a726 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -5,7 +5,7 @@ if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) endif() -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() From 0c5e36596674ef3f213c5fd77da967f93d79c4ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 28 Apr 2020 21:41:44 -0500 Subject: [PATCH 207/660] add lgraph --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/context.h | 2 ++ libdeepgalois/include/deepgalois/lgraph.h | 33 ++++++++++++++++++++++ libdeepgalois/src/context.cpp | 3 ++ libdeepgalois/src/lgraph.cpp | 31 ++++++++++++++++++++ 5 files changed, 70 insertions(+) create mode 100644 libdeepgalois/include/deepgalois/lgraph.h create mode 100644 libdeepgalois/src/lgraph.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index fffe49af1a..9c6bc0a88f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -97,6 +97,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e368319dff..affe48ace0 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ 
b/libdeepgalois/include/deepgalois/context.h @@ -7,6 +7,7 @@ #include #include "deepgalois/types.h" #ifdef CPU_ONLY +#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -39,6 +40,7 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N + LearningGraph* lgraph; Graph* subgraph_cpu; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h new file mode 100644 index 0000000000..dbe66c0092 --- /dev/null +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -0,0 +1,33 @@ +#pragma once +#include "deepgalois/types.h" +#include +#include + +namespace deepgalois { + +typedef uint32_t index_t; + +class LearningGraph { +protected: + index_t num_vertices_; + index_t num_edges_; + index_t *rowptr_; + index_t *colidx_; + index_t *degrees_; +public: + //typedef index_t* iterator; + using iterator = boost::counting_iterator; + LearningGraph(); + ~LearningGraph(); + void readGraph(std::string path, std::string dataset); + index_t getDegree(index_t vid) { return degrees_[vid]; } + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid+1]; } + index_t* row_start_ptr() { return rowptr_; } + index_t* edge_dst_ptr() { return colidx_; } + iterator begin() const { return iterator(0); } + iterator end() const { return iterator(num_vertices_); } +}; + +} diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index b17f6d7eaa..4320df1bc6 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -68,6 +68,9 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize + } else if (filetype == "bin") { + lgraph = new LearningGraph(); + lgraph->readGraph(path, dataset_str); } else if (filetype == "gr") { std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp new file mode 100644 index 0000000000..390ba87488 --- /dev/null +++ b/libdeepgalois/src/lgraph.cpp @@ -0,0 +1,31 @@ +#include "deepgalois/lgraph.h" +#include + +namespace deepgalois { + +LearningGraph::LearningGraph() : num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL) {} + +void LearningGraph::readGraph(std::string path, std::string dataset) { + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + std::ifstream ifs; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); +} + +} From d0ada11e4bd5511091e35ee30810223ed08c81ee Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 
10:29:09 -0500 Subject: [PATCH 208/660] add lgraph --- CMakeLists.txt | 4 +- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/cutils.h | 6 ++ libdeepgalois/include/deepgalois/lgraph.h | 16 +++- libdeepgalois/include/deepgalois/types.h | 4 + libdeepgalois/src/lgraph.cpp | 96 ++++++++++++++++++++++- libdeepgalois/src/lgraph.cu | 32 ++++++++ 7 files changed, 151 insertions(+), 8 deletions(-) create mode 100644 libdeepgalois/src/lgraph.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 5a0d440a3c..dc0250a3f4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -45,7 +45,7 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.0) +SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. @@ -356,14 +356,12 @@ if (ENABLE_HETERO_GALOIS) add_subdirectory(libgpu) endif() -add_subdirectory(libpangolin) # Applications (apps) add_subdirectory(lonestar) if (ENABLE_DIST_GALOIS) add_subdirectory(lonestardist) endif() -add_subdirectory(lonestarmine) add_subdirectory(scripts) add_subdirectory(inputs) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9c6bc0a88f..be5853f987 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -54,6 +54,7 @@ else() src/math_functions.cu src/optimizer.cu src/context.cu + src/lgraph.cu src/node.cu src/net.cu ) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 7be873a183..5181408363 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -53,6 +53,8 @@ inline const char* cublasGetErrorString(cublasStatus_t error) { case CUBLAS_STATUS_LICENSE_ERROR: return "CUBLAS_STATUS_LICENSE_ERROR"; #endif + default: + break; } return "Unknown cublas status"; } @@ -79,6 +81,8 @@ inline const char* cusparseGetErrorString(cusparseStatus_t error) { return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; case CUSPARSE_STATUS_ZERO_PIVOT: return "CUSPARSE_STATUS_ZERO_PIVOT"; + default: + break; } return "Unknown cusparse status"; } @@ -111,6 +115,8 @@ inline const char* curandGetErrorString(curandStatus_t error) { return "CURAND_STATUS_ARCH_MISMATCH"; case CURAND_STATUS_INTERNAL_ERROR: return "CURAND_STATUS_INTERNAL_ERROR"; + default: + break; } return "Unknown curand status"; } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index dbe66c0092..126802e07a 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -5,29 +5,41 @@ namespace deepgalois { -typedef uint32_t index_t; - class LearningGraph { protected: + bool is_device; index_t num_vertices_; index_t num_edges_; index_t *rowptr_; index_t *colidx_; index_t *degrees_; + vdata_t *vertex_data_; + edata_t *edge_data_; + public: //typedef index_t* iterator; using iterator = boost::counting_iterator; LearningGraph(); ~LearningGraph(); + void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); + index_t size() { return num_vertices_; } + index_t sizeEdges() { return num_edges_; } index_t getDegree(index_t vid) { 
return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } index_t* row_start_ptr() { return rowptr_; } index_t* edge_dst_ptr() { return colidx_; } + index_t* degrees_ptr() { return degrees_; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } + void progressPrint(unsigned maxii, unsigned ii); + void allocOnDevice(bool no_edge_data_); + void copy_to_cpu(LearningGraph ©graph); + void copy_to_gpu(LearningGraph ©graph); }; } diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 9c6c79c6e5..3a579a9c5c 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -28,6 +28,10 @@ typedef std::vector VertexList; typedef std::set VertexSet; typedef std::vector dims_t; // dimentions type +typedef uint32_t index_t; // index type +typedef float_t edata_t; // edge data type +typedef float_t vdata_t; // vertex data type + enum class net_phase { train, test }; #define CHUNK_SIZE 256 diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 390ba87488..799812ac1d 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,17 +1,102 @@ #include "deepgalois/lgraph.h" +#include "deepgalois/utils.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ #include +#include +#include namespace deepgalois { -LearningGraph::LearningGraph() : num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL) {} +LearningGraph::LearningGraph() : is_device(false), num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL), + vertex_data_(NULL), edge_data_(NULL) {} + +void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { + const unsigned nsteps = 10; + unsigned ineachstep = (maxii / nsteps); + if(ineachstep == 0) ineachstep = 1; + if (ii % ineachstep == 0) { + int progress = ((size_t) ii * 100) / maxii + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} void LearningGraph::readGraph(std::string path, std::string dataset) { + std::string filename = path + dataset + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t numNodes = le64toh(*fptr++); + uint64_t numEdges = le64toh(*fptr++); + uint64_t *outIdx = fptr; + fptr += numNodes; + uint32_t *fptr32 = (uint32_t*)fptr; + uint32_t *outs = fptr32; + fptr32 += numEdges; + if (numEdges % 2) fptr32 += 1; + num_vertices_ = numNodes; + num_edges_ = numEdges; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data 
not supported.\n"; + exit(1); + } + + printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + rowptr_[0] = 0; + for (unsigned ii = 0; ii < num_vertices_; ++ii) { + rowptr_[ii+1] = le64toh(outIdx[ii]); + degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; + for (unsigned jj = 0; jj < degrees_[ii]; ++jj) { + unsigned eid = rowptr_[ii] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= num_vertices_) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + exit(0); + } + colidx_[eid] = dst; + } + progressPrint(num_vertices_, ii); + } + ifs.close(); + +/* std::string file_dims = path + dataset + "-dims.bin"; std::string file_rowptr = path + dataset + "-rowptr.bin"; std::string file_colidx = path + dataset + "-colidx.bin"; index_t dims[2]; - std::ifstream ifs; ifs.open(file_dims, std::ios::binary|std::ios::in); ifs.read((char*)dims, sizeof(index_t) * 2); ifs.close(); @@ -26,6 +111,11 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { ifs.open(file_colidx, std::ios::binary|std::ios::in); ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); ifs.close(); +*/ + t.Stop(); + double runtime = t.Millisecs(); + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength/1000.0/runtime << " MB/s)\n\n"; } } diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu new file mode 100644 index 0000000000..afd4ced9dc --- /dev/null +++ b/libdeepgalois/src/lgraph.cu @@ -0,0 +1,32 @@ + +void LearningGraph::allocOnDevice(bool no_edge_data_) { + if (colidx_ != NULL) return true; + CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); + CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_type))); + CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_type))); + //if (!no_edge_data_) CUDA_CHECK(cudaMalloc((void **) &edge_data_, num_edges_ * sizeof(edge_data__t))); + //CUDA_CHECK(cudaMalloc((void **) &vertex_data_, num_vertices_ * sizeof(vdata_t))); + is_device = true; +} + +void LearningGraph::copy_to_gpu(LearningGraph ©graph) { + copygraph.init(num_vertices_, num_edges_); + copygraph.allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_type), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyHostToDevice)); + //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_, edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_, vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); +} + +void LearningGraph::copy_to_cpu(LearningGraph ©graph) { + assert(is_device); + assert(copygraph.size() = num_vertices_); + assert(copygraph.sizeEdges() = num_edges_); + CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_type), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyDeviceToHost)); + //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_ptr(), 
edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_ptr(), vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); +} + From ecb5de88d766572399639442780d77e22c1c5f5b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 12:33:38 -0500 Subject: [PATCH 209/660] add NodeIndexTy and EdgeIndexTy in LC_CSR_Graph --- libdeepgalois/include/deepgalois/context.h | 5 +- libdeepgalois/include/deepgalois/gtypes.h | 4 +- libdeepgalois/include/deepgalois/lgraph.h | 14 ++++- libdeepgalois/src/context.cpp | 27 +++++---- libdeepgalois/src/lgraph.cpp | 43 ++++++++++++--- libdeepgalois/src/lgraph.cu | 9 +++ .../include/galois/graphs/LC_CSR_Graph.h | 55 ++++++++++--------- 7 files changed, 106 insertions(+), 51 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index affe48ace0..6d3d5a884c 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -40,13 +40,16 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N - LearningGraph* lgraph; Graph* subgraph_cpu; + LearningGraph* lgraph; + LearningGraph* lsubgraph; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; + //LearningGraph* getGraphPointer() { return lgraph; } + //LearningGraph* getSubgraphPointer() { return lsubgraph; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index dfc2e1d8c6..9aa405507c 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -21,8 +21,8 @@ using AccuracyAccum = galois::DGAccumulator; typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< true>::type ::with_no_lockable::type Graph; #else -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type Graph; +typedef galois::graphs::LC_CSR_Graph:: + with_numa_alloc::type ::with_no_lockable::type Graph; #endif #else using Graph = galois::graphs::DistGraph; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 126802e07a..03ae92cbff 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -19,14 +19,18 @@ class LearningGraph { public: //typedef index_t* iterator; using iterator = boost::counting_iterator; - LearningGraph(); - ~LearningGraph(); + LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), + rowptr_(NULL), colidx_(NULL), degrees_(NULL), + vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph() : LearningGraph(false) {} + ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); index_t size() { return num_vertices_; } index_t sizeEdges() { return num_edges_; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } index_t* row_start_ptr() { return rowptr_; } @@ -40,6 
+44,12 @@ class LearningGraph { void allocOnDevice(bool no_edge_data_); void copy_to_cpu(LearningGraph ©graph); void copy_to_gpu(LearningGraph ©graph); + void dealloc(); + void degree_counting(); + void allocateFrom(index_t nv, index_t ne); + void constructNodes(); + void fixEndEdge(index_t vid, index_t row_end); + void constructEdge(index_t eid, index_t dst, edata_t edata); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 4320df1bc6..9013e563f5 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -30,6 +30,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::createSubgraph() { subgraph_cpu = new Graph(); + lsubgraph = new LearningGraph(); } // generate labels for the subgraph, m is subgraph size @@ -63,7 +64,6 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - graph_cpu = new Graph(); if (filetype == "el") { std::string filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); @@ -72,6 +72,7 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo lgraph = new LearningGraph(); lgraph->readGraph(path, dataset_str); } else if (filetype == "gr") { + graph_cpu = new Graph(); std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { @@ -86,9 +87,10 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo exit(1); } Tread.stop(); - std::cout << "num_vertices " << graph_cpu->size() << " num_edges " - << graph_cpu->sizeEdges() << "\n"; - return graph_cpu->size(); + auto g = getGraphPointer(); + std::cout << "num_vertices " << g->size() << " num_edges " + << g->sizeEdges() << "\n"; + return g->size(); } void Context::add_selfloop(Graph &og, Graph &g) { @@ -124,11 +126,13 @@ void Context::add_selfloop(Graph &og, Graph &g) { } void Context::norm_factor_counting(size_t g_size) { - Graph *g = graph_cpu; - if (use_subgraph) g = subgraph_cpu; + auto g = getGraphPointer(); + auto subg = getSubgraphPointer(); + g->degree_counting(); + if (use_subgraph) g = subg; if (norm_factor == NULL) norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { - auto degree = std::distance(g->edge_begin(v), g->edge_end(v)); + auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; @@ -185,14 +189,15 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self colidx_[offsets[i]++] = dst; } - graph_cpu->allocateFrom(num_vertices_, num_edges_); - graph_cpu->constructNodes(); + auto g = getGraphPointer(); + g->allocateFrom(num_vertices_, num_edges_); + g->constructNodes(); for (size_t i = 0; i < num_vertices_; i++) { auto row_begin = rowptr_[i]; auto row_end = rowptr_[i+1]; - graph_cpu->fixEndEdge(i, row_end); + g->fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset++) - graph_cpu->constructEdge(offset, colidx_[offset], 0); + g->constructEdge(offset, colidx_[offset], 0); } } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 799812ac1d..4e65d838a5 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -10,11 +10,6 @@ #include namespace deepgalois { - 
-LearningGraph::LearningGraph() : is_device(false), num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL), - vertex_data_(NULL), edge_data_(NULL) {} - void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); @@ -26,6 +21,27 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } } +void LearningGraph::allocateFrom(index_t nv, index_t ne) { +} + +void LearningGraph::constructNodes() { +} + +void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { +} + +void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { +} + +void degree_counting() { +/* + degrees = new uint32_t[num_vertices_]; + galois::do_all(galois::iterate(begin(), end()), [&] (auto v) { + degrees[v] = std::distance(this->edge_begin(v), this->edge_end(v)); + }, galois::loopname("DegreeCounting")); +*/ +} + void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; std::ifstream ifs; @@ -56,15 +72,15 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { __attribute__((unused)) uint64_t version = le64toh(*fptr++); assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t numNodes = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); uint64_t numEdges = le64toh(*fptr++); uint64_t *outIdx = fptr; - fptr += numNodes; + fptr += nv; uint32_t *fptr32 = (uint32_t*)fptr; uint32_t *outs = fptr32; fptr32 += numEdges; if (numEdges % 2) fptr32 += 1; - num_vertices_ = numNodes; + num_vertices_ = nv; num_edges_ = numEdges; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; @@ -118,4 +134,15 @@ void LearningGraph::readGraph(std::string path, std::string dataset) { << masterLength/1000.0/runtime << " MB/s)\n\n"; } +#ifdef CPU_ONLY +void LearningGraph::dealloc() { + assert (!is_device); + free(rowptr_); + free(colidx_); + free(degrees_); + if (vertex_data_ != NULL) free(vertex_data_); + if (edge_data_ != NULL) free(edge_data_); } +#endif + +} // end namespace diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index afd4ced9dc..14b7239358 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,4 +1,13 @@ +void LearningGraph::dealloc() { + assert(is_device); + CUDA_CHECK(cudaFree(colidx_)); + CUDA_CHECK(cudaFree(rowptr_)); + CUDA_CHECK(cudaFree(degrees_)); + if (edge_data != NULL) CUDA_CHECK(cudaFree(edge_data)); + if (vertex_data != NULL) CUDA_CHECK(cudaFree(vertex_data)); +} + void LearningGraph::allocOnDevice(bool no_edge_data_) { if (colidx_ != NULL) return true; CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 903354f83a..ff6f7b9caf 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -61,7 +61,8 @@ namespace graphs { template numa-blocked, false => numa-interleaved - bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy> + bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy, + typename NodeIndexTy = uint32_t, typename EdgeIndexTy = uint64_t > class LC_CSR_Graph : //! 
[doxygennuma] private boost::noncopyable, @@ -134,18 +135,18 @@ class LC_CSR_Graph : protected: typedef LargeArray EdgeData; - typedef LargeArray EdgeDst; + typedef LargeArray EdgeDst; typedef internal::NodeInfoBaseTypes NodeInfoTypes; typedef internal::NodeInfoBase NodeInfo; - typedef LargeArray EdgeIndData; + typedef LargeArray EdgeIndData; typedef LargeArray NodeData; public: - typedef uint32_t GraphNode; + typedef NodeIndexTy GraphNode; typedef EdgeTy edge_data_type; typedef FileEdgeTy file_edge_data_type; typedef NodeTy node_data_type; @@ -333,8 +334,8 @@ class LC_CSR_Graph : } // cxh - uint64_t* row_start_ptr() { return &edgeIndData[0]; } - uint32_t* edge_dst_ptr() { return &edgeDst[0]; } + EdgeIndexTy* row_start_ptr() { return &edgeIndData[0]; } + NodeIndexTy* edge_dst_ptr() { return &edgeDst[0]; } /** * Accesses the "prefix sum" of this graph; takes advantage of the fact * that edge_end(n) is basically prefix_sum[n] (if a prefix sum existed + @@ -349,7 +350,7 @@ class LC_CSR_Graph : uint64_t operator[](uint64_t n) { return *(edge_end(n)); } template - LC_CSR_Graph(uint32_t _numNodes, uint64_t _numEdges, EdgeNumFnTy edgeNum, + LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, EdgeNumFnTy edgeNum, EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData) : numNodes(_numNodes), numEdges(_numEdges) { // std::cerr << "\n**" << numNodes << " " << numEdges << "\n\n"; @@ -552,7 +553,7 @@ class LC_CSR_Graph : } } - void allocateFrom(uint32_t nNodes, uint64_t nEdges) { + void allocateFrom(NodeIndexTy nNodes, EdgeIndexTy nEdges) { numNodes = nNodes; numEdges = nEdges; @@ -571,7 +572,7 @@ class LC_CSR_Graph : } } - void destroyAndAllocateFrom(uint32_t nNodes, uint64_t nEdges) { + void destroyAndAllocateFrom(NodeIndexTy nNodes, EdgeIndexTy nEdges) { numNodes = nNodes; numEdges = nEdges; @@ -595,7 +596,7 @@ class LC_CSR_Graph : void constructNodes() { #ifndef GALOIS_GRAPH_CONSTRUCT_SERIAL - for (uint32_t x = 0; x < numNodes; ++x) { + for (NodeIndexTy x = 0; x < numNodes; ++x) { nodeData.constructAt(x); this->outOfLineConstructAt(x); } @@ -623,15 +624,15 @@ class LC_CSR_Graph : edgeData.destroy(); } - void constructEdge(uint64_t e, uint32_t dst, + void constructEdge(EdgeIndexTy e, NodeIndexTy dst, const typename EdgeData::value_type& val) { edgeData.set(e, val); edgeDst[e] = dst; } - void constructEdge(uint64_t e, uint32_t dst) { edgeDst[e] = dst; } + void constructEdge(EdgeIndexTy e, NodeIndexTy dst) { edgeDst[e] = dst; } - void fixEndEdge(uint32_t n, uint64_t e) { edgeIndData[n] = e; } + void fixEndEdge(NodeIndexTy n, EdgeIndexTy e) { edgeIndData[n] = e; } /** * Perform an in-memory transpose of the graph, replacing the original @@ -681,7 +682,7 @@ class LC_CSR_Graph : // TODO is it worth doing parallel prefix sum? 
// prefix sum calculation of the edge index array - for (uint32_t n = 1; n < numNodes; ++n) { + for (NodeIndexTy n = 1; n < numNodes; ++n) { edgeIndData_temp[n] += edgeIndData_temp[n - 1]; } @@ -735,15 +736,15 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new, - uint64_t e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, + EdgeIndexTy e, typename std::enable_if::type* = 0) { edgeData_new[e_new] = edgeData[e]; } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, uint64_t e_new, - uint64_t e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, + EdgeIndexTy e, typename std::enable_if::type* = 0) { // does nothing } @@ -793,7 +794,7 @@ class LC_CSR_Graph : * Adding for Louvain clustering * TODO: Find better way to do this */ - void constructFrom(uint32_t numNodes, uint64_t numEdges, std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { + void constructFrom(NodeIndexTy numNodes, EdgeIndexTy numEdges, std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { //allocateFrom(numNodes, numEdges); /* * Deallocate if reusing the graph @@ -801,13 +802,13 @@ class LC_CSR_Graph : destroyAndAllocateFrom(numNodes, numEdges); constructNodes(); - galois::do_all(galois::iterate((uint32_t)0, numNodes), - [&](uint32_t n) { + galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), + [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); - galois::do_all(galois::iterate((uint32_t)0, numNodes), - [&](uint32_t n) { + galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), + [&](NodeIndexTy n) { if( n == 0){ if(edgeIndData[n] > 0){ std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin()); @@ -879,11 +880,11 @@ class LC_CSR_Graph : readPosition = ((4 + numNodes) * sizeof(uint64_t)); graphFile.seekg(readPosition); if(version == 1) { - graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint32_t)*numEdges); - readPosition = ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(uint32_t)); + graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(NodeIndexTy)*numEdges); + readPosition = ((4 + numNodes) * sizeof(uint64_t) + numEdges * sizeof(NodeIndexTy)); // version 1 padding TODO make version agnostic if (numEdges% 2) { - readPosition += sizeof(uint32_t); + readPosition += sizeof(NodeIndexTy); } } else if(version == 2) { graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint64_t)*numEdges); @@ -955,7 +956,7 @@ void readGraphFromGRFile(const std::string& filename) { readPosition = ((4 + numNodes) * sizeof(uint64_t)); graphFile.seekg(readPosition); if(version == 1) { - graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint32_t)*numEdges); + graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(NodeIndexTy)*numEdges); } else if(version == 2) { graphFile.read(reinterpret_cast(edgeDst.data()), sizeof(uint64_t)*numEdges); } else { From e7fe5d859f5e00a6168226bc82abb1ee69454215 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 13:24:49 -0500 Subject: [PATCH 210/660] fix types --- libdeepgalois/include/deepgalois/context.h | 5 ----- libdeepgalois/include/deepgalois/gtypes.h | 18 ++++++++++-------- libdeepgalois/include/deepgalois/lgraph.h | 5 +++-- libdeepgalois/include/deepgalois/sampler.h | 6 +++--- libdeepgalois/src/context.cpp | 13 +++++++------ libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/lgraph.cpp | 5 ++++- libdeepgalois/src/sampler.cpp | 20 
++++++++------------ lonestargnn/gcn/gcn.cpp | 2 +- 9 files changed, 37 insertions(+), 39 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 6d3d5a884c..e368319dff 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,7 +7,6 @@ #include #include "deepgalois/types.h" #ifdef CPU_ONLY -#include "deepgalois/lgraph.h" #include "deepgalois/gtypes.h" #else #include "graph_gpu.h" @@ -41,15 +40,11 @@ class Context { #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N Graph* subgraph_cpu; - LearningGraph* lgraph; - LearningGraph* lsubgraph; void createSubgraph(); void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; - //LearningGraph* getGraphPointer() { return lgraph; } - //LearningGraph* getSubgraphPointer() { return lsubgraph; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 9aa405507c..fe759803e2 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,14 +1,14 @@ -#ifndef __DG_GTYPES__ -#define __DG_GTYPES__ +#pragma once #include "galois/Galois.h" #include "galois/graphs/LCGraph.h" #include "deepgalois/types.h" +#include "deepgalois/lgraph.h" #ifdef GALOIS_USE_DIST #include "galois/graphs/NewGeneric.h" #endif -// TODO namespace +namespace deepgalois { typedef galois::GAccumulator AccumF; typedef galois::GAccumulator AccumU; @@ -19,15 +19,17 @@ using AccuracyAccum = galois::DGAccumulator; #ifndef GALOIS_USE_DIST #ifdef EDGE_LABEL typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type Graph; + true>::type ::with_no_lockable::type LCGraph; #else typedef galois::graphs::LC_CSR_Graph:: - with_numa_alloc::type ::with_no_lockable::type Graph; + with_numa_alloc::type ::with_no_lockable::type LCGraph; #endif +//typedef LCGraph Graph; +//typedef Graph::edge_iterator edge_iterator; +typedef LearningGraph Graph; +typedef index_t edge_iterator; #else using Graph = galois::graphs::DistGraph; #endif -typedef Graph::GraphNode GNode; - -#endif +} diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 03ae92cbff..bf3ace2470 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -26,8 +26,9 @@ class LearningGraph { ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } void readGraph(std::string path, std::string dataset); - index_t size() { return num_vertices_; } - index_t sizeEdges() { return num_edges_; } + void readGraphFromGRFile(const std::string& filename); + size_t size() { return (size_t)num_vertices_; } + size_t sizeEdges() { return (size_t)num_edges_; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t get_degree(index_t vid) { return degrees_[vid]; } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 01616d01f5..15c82ffa12 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -13,11 +13,11 @@ class Sampler { // !API function for 
user-defined selection strategy virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); - galois::runtime::iterable > neighbor_sampler(Graph &g, GNode v); + galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); - Graph::edge_iterator sampled_edge_begin(Graph &g, GNode v) { return g.edge_begin(v); } + edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } - Graph::edge_iterator sampled_edge_end(Graph &g, GNode v) { return g.edge_end(v); } + edge_iterator sampled_edge_end(Graph &g, VertexID v) { return g.edge_end(v); } void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9013e563f5..9b5b858206 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -30,7 +30,6 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { void Context::createSubgraph() { subgraph_cpu = new Graph(); - lsubgraph = new LearningGraph(); } // generate labels for the subgraph, m is subgraph size @@ -62,25 +61,27 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { + std::string filename = path + dataset_str + ".csgr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { - std::string filename = path + dataset_str + ".el"; + filename = path + dataset_str + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { - lgraph = new LearningGraph(); - lgraph->readGraph(path, dataset_str); + graph_cpu->readGraphFromGRFile(filename); } else if (filetype == "gr") { graph_cpu = new Graph(); std::string filename = path + dataset_str + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; - galois::graphs::readGraph(graph_temp, filename); + //galois::graphs::readGraph(graph_temp, filename); + graph_temp.readGraphFromGRFile(filename); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; - } else galois::graphs::readGraph(*graph_cpu, filename); + //} else galois::graphs::readGraph(*graph_cpu, filename); + } else graph_cpu->readGraphFromGRFile(filename); // TODO dist version of self loop } else { printf("Unkown file format\n"); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index b374dd9d91..7dc1752436 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -17,7 +17,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // get normalization factor if needed if (norm) a = norm_factor[src]; // gather neighbors' embeddings - for (const auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { const auto dst = g.getEdgeDst(e); if (norm) { // normalize b as well diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 4e65d838a5..ba4432aca9 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -33,7 +33,7 @@ void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { } -void degree_counting() { +void LearningGraph::degree_counting() { /* degrees = new uint32_t[num_vertices_]; galois::do_all(galois::iterate(begin(), end()), [&] 
(auto v) { @@ -44,6 +44,9 @@ void degree_counting() { void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; +} + +void LearningGraph::readGraphFromGRFile(const std::string& filename) { std::ifstream ifs; ifs.open(filename); int masterFD = open(filename.c_str(), O_RDONLY); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 257cf1edef..a1b1e4feba 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -3,10 +3,6 @@ #include #include -inline unsigned getDegree(Graph *g, GNode v) { - return std::distance(g->edge_begin(v), g->edge_end(v)); -} - namespace deepgalois { void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { @@ -29,9 +25,9 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const GNode src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { if (masks[src] == 1) { - for (const auto e : g->edges(src)) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } @@ -48,11 +44,11 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); - galois::do_all(galois::iterate((size_t)0, n), [&](const GNode src) { + galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (const auto e : g->edges(src)) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } @@ -77,7 +73,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { - degrees[i] = (int)getDegree(g, frontier[i]); + degrees[i] = (int)g->get_degree(frontier[i]); }, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); @@ -89,7 +85,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); + degrees[pos] = g->get_degree(frontier[pos]); vertex_set.insert(dst); break; } @@ -129,7 +125,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = std::distance(g.edge_begin(v), g.edge_end(v)); + degrees[new_ids[v]] = g.get_degree(v); } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; @@ -142,7 +138,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e : g.edges(old_id)) { + for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { sub.constructEdge(offsets[i]+j, 
g.getEdgeDst(e), 0); j ++; } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index de999a095e..fa492172a5 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,7 +18,7 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train - Graph* dGraph = NULL; + deepgalois::Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; dGraph = galois::graphs::constructSymmetricGraph(dummyVec); From c63e46440efe915474aedafcb626f9f9047d9530 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 16:41:05 -0500 Subject: [PATCH 211/660] fix mkl csrmm --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/deepgalois/context.h | 27 ++ .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/math_functions.hh | 68 ++--- libdeepgalois/include/deepgalois/utils.h | 4 - libdeepgalois/src/context.cpp | 28 +- libdeepgalois/src/layers/aggregator.cpp | 4 + libdeepgalois/src/layers/graph_conv_layer.cpp | 12 +- libdeepgalois/src/math_functions.cpp | 281 ++++-------------- 9 files changed, 140 insertions(+), 289 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index be5853f987..2ede00abbc 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -105,10 +105,11 @@ set(sources ) endif() +set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) -target_link_libraries(dg_cpu ${BLAS_LIB}) +target_link_libraries(dg_cpu ${BLAS_LIB} ${BOOST_LIBRARIES}) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libgalois/include ${CMAKE_CURRENT_SOURCE_DIR}/include diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index e368319dff..ffbaecb0d3 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,6 +6,7 @@ #include #include #include "deepgalois/types.h" +#include #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -15,10 +16,13 @@ namespace deepgalois { +using boost::shared_ptr; + class Context { public: Context(); ~Context(); + static Context& Get(); size_t read_graph(std::string dataset_str, bool selfloop); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); @@ -66,6 +70,28 @@ class Context { inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif + // This random number generator facade hides boost and CUDA rng + // implementation from one another (for cross-platform compatibility). 
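The RNG class declared next is a facade: only the nested Generator type, defined in the .cpp, knows whether the underlying engine is a boost Mersenne Twister or a curand generator, so this header stays free of both dependencies. The Generator definition is not part of this hunk; a minimal CPU-side sketch of what it might look like (boost::mt19937 and the std::random_device seeding are assumptions, not taken from the patch):

    // Hypothetical definition in context.cpp; not part of this patch.
    class Context::RNG::Generator {
    public:
      Generator() : rng_(new boost::mt19937(std::random_device{}())) {}
      explicit Generator(unsigned seed) : rng_(new boost::mt19937(seed)) {}
      boost::mt19937* rng() { return rng_.get(); }
    private:
      boost::shared_ptr<boost::mt19937> rng_;
    };

    // Callers receive a type-erased pointer and cast it back to the engine they expect.
    void* Context::RNG::generator() { return static_cast<void*>(generator_->rng()); }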
+ class RNG { + public: + RNG(); + explicit RNG(unsigned int seed); + explicit RNG(const RNG&); + RNG& operator=(const RNG&); + void* generator(); + private: + class Generator; + shared_ptr generator_; + }; + + // Getters for boost rng, curand, and cublas handles + inline static RNG& rng_stream() { + if (!Get().random_generator_) { + Get().random_generator_.reset(new RNG()); + } + return *(Get().random_generator_); + } + protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -82,6 +108,7 @@ class Context { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factor; // normalization constant based on graph structure + shared_ptr random_generator_; #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index dc38642330..6cc40c266d 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -67,7 +67,7 @@ class graph_conv_layer : public layer { float_t* in_temp; float_t* in_temp1; float_t* trans_data; // y*x - unsigned* dropout_mask; // x*y + mask_t* dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 72b836da64..05a63ee9ca 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -12,7 +12,7 @@ #ifdef USE_MKL #include -#else // If use MKL, simply include the MKL header +#else extern "C" { #include } @@ -23,26 +23,33 @@ namespace deepgalois { namespace math { //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); + //! multiply n elements of vector by scalar void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); + //! do dot product of 2 vectors float_t dot(const vec_t& x, const vec_t& y); void axpy(size_t n, const float_t a, float_t *x, float_t *y); int argmax(const size_t n, const float_t* x); // the arguments of the maxima + //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 float_t l2_norm(size_t n, const float_t* a); + //! clear n elements of a vector void clear_cpu(size_t n, float_t* in); + // dropout functions randomly remove weights -void dropout_cpu(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* mask, float_t* out); +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); + // dropout derivative: use existing dropouts in masks instead of generating them; -void d_dropout_cpu(size_t n, const float scale, const float_t* in_diff, - unsigned* mask, float_t* out_diff); +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); + //! ReLU = keep if positive void relu_cpu(size_t n, const float_t* in, float_t* out); + //! 
ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); + void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); @@ -65,61 +72,28 @@ void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const //! copy vector from in -> out; first len elements void copy_cpu(size_t len, const float_t* in, float_t* out); + // single-precision dense matrix multiply void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); + // single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse void csrmm_cpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nonzero_idx, const float* B, const float beta, float* C); -} // deepgalois -} // math -//! clear entire vector -void clear(vec_t& in); -//! multiply vector by scalar -void mul_scalar(const float_t alpha, vec_t& Y); -//! add two same size vectors into out -void vadd(const vec_t& a, const vec_t& b, vec_t& out); // vector add // dropout functions randomly remove weights -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, vec_t& out); // dropout -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& mask, float_t* out); -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& mask, vec_t& out_diff); -//! ReLU = keep if positive -void relu(const vec_t& in, vec_t& out); -//! copy vector from in -> out -void copy1D1D(const vec_t& in, vec_t& out); -//! matrix multiply -void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, - const float_t* A, const float_t* B, - float_t* C); // matrix multiply -//! transposes a matrix (vector) -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out); +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); + //! 
transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); -void vsub(const vec_t& a, const vec_t& b, vec_t& out); -void vmul(const vec_t& a, const vec_t& b, vec_t& out); -void vdiv(const vec_t& a, const vec_t& b, vec_t& out); -void add_scalar(const float_t alpha, vec_t& Y); -void sub_scalar(const float_t alpha, vec_t& Y); -void div_scalar(const float_t alpha, vec_t& Y); -//void mvmul(const vec_t& matrix, const vec_t& in_vector, vec_t& out_vector); -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector); -void vvmul(const vec_t& a, const vec_t& b, tensor_t& out); -void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, - tensor_t& C); -void copy2D1D(const tensor_t& in, vec_t& out); -void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C); -void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, - vec_t& C); -void transpose2D(const tensor_t& in, tensor_t& out); -void transpose2D1D(const tensor_t& in, vec_t& out); -int argmax(const size_t n, const vec_t& x); // the arguments of the maxima +void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in, float_t *out); + +} // math +} // deepgalois // GPU operators bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 60974b9f8a..c8bb1d4e41 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -96,10 +96,6 @@ uniform_rand(T min, T max) { return dst(random_generator::get_instance()()); } -inline bool bernoulli(float_t p) { - return uniform_rand(float_t(0), float_t(1)) > p; -} - // sequential prefix sum template inline std::vector prefix_sum(const std::vector &in) { diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9b5b858206..efbc525a32 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,9 +4,20 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include namespace deepgalois { +// Make sure each thread can have different values. 
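The boost::thread_specific_ptr declared next gives every thread that touches it its own lazily constructed Context, owned by that thread; the intent is to keep random state out of shared memory when Galois runs work in parallel. A usage sketch, reconstructed from the commented-out helper that appears further below in this series, and therefore illustrative only (rng_t = boost::mt19937 and the float probability type are assumptions):

    // Assumes the usual <boost/random/...> headers are available.
    typedef boost::mt19937 rng_t;

    // Returns the calling thread's engine; the first call on a thread builds its Context.
    inline rng_t* deepgalois_rng() {
      return static_cast<rng_t*>(deepgalois::Context::rng_stream().generator());
    }

    // Fills r with n Bernoulli(p) draws, e.g. a dropout keep-mask with p = 1 - dropout_rate.
    inline void rng_bernoulli(size_t n, float p, uint8_t* r) {
      boost::bernoulli_distribution<float> dist(p);
      boost::variate_generator<rng_t*, boost::bernoulli_distribution<float> > gen(deepgalois_rng(), dist);
      for (size_t i = 0; i < n; ++i) r[i] = gen();
    }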
+static boost::thread_specific_ptr thread_instance_; + +Context& Context::Get() { + if (!thread_instance_.get()) { + thread_instance_.reset(new Context()); + } + return *(thread_instance_.get()); +} + #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), @@ -129,15 +140,28 @@ void Context::add_selfloop(Graph &og, Graph &g) { void Context::norm_factor_counting(size_t g_size) { auto g = getGraphPointer(); auto subg = getSubgraphPointer(); - g->degree_counting(); if (use_subgraph) g = subg; + g->degree_counting(); +#ifdef USE_MKL + if (norm_factor == NULL) norm_factor = new float_t[g->sizeEdges()]; + galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { + float_t c_i = std::sqrt(float_t(g->get_degree(i))); + for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { + const auto j = g->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(g->get_degree(j))); + if (c_i == 0.0 || c_j == 0.0) norm_factor[e] = 0.0; + else norm_factor[e] = 1.0 / (c_i * c_j); + } + }, galois::loopname("NormCountingEdge")); +#else if (norm_factor == NULL) norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) norm_factor[v] = 0.0; else norm_factor[v] = 1.0 / temp; - }, galois::loopname("NormCounting")); + }, galois::loopname("NormCountingVertex")); +#endif } void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 7dc1752436..bd76b8b99b 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -37,9 +37,13 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + galois::StatTimer Tcsrmm("CSRMM-MKL"); + //galois::gPrint("csrmm mkl\n"); + Tcsrmm.start(); unsigned n = g.size(); math::clear_cpu(n*len, out); math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + Tcsrmm.stop(); } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index dae3d14ce5..c5c73b1f36 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -13,7 +13,7 @@ graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = true; name_ = layer_type() + "_" + std::to_string(level); - assert(dropout_rate_ < 1.); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); scale_ = 1. / (1. 
- dropout_rate_); } @@ -61,8 +61,8 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { float_t *a = new float_t[len]; float_t *b = new float_t[len]; - mvmul(n, len, &Q[0], self, a); - mvmul(n, len, &W[0], neighbors, b); + math::mvmul(n, len, &Q[0], self, a); + math::mvmul(n, len, &W[0], neighbors, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } @@ -90,7 +90,7 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) dropout_mask = new unsigned[x * y]; + if (dropout_) dropout_mask = new mask_t[x * y]; in_temp = new float_t[x * y]; out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x @@ -108,7 +108,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W if (dropout_ && phase_ == net_phase::train) - math::dropout_cpu(x*y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); else math::copy_cpu(x*y, in_data, in_temp); if (y > z) { @@ -165,7 +165,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #endif if (level_ != 0 && dropout_) - math::d_dropout_cpu(x*y, scale_, in_grad, dropout_mask, in_grad); + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index aa41ffc41f..4c0354cccc 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,7 +1,9 @@ #include "deepgalois/math_functions.hh" #include "galois/Timer.h" #include "galois/Galois.h" +#include #include +#include #include "deepgalois/utils.h" #ifdef USE_MKL @@ -18,7 +20,29 @@ extern "C" { exit(1); \ } while(0); +std::default_random_engine generator; +std::uniform_real_distribution distribution(0.0,1.0); +/* +typedef boost::mt19937 rng_t; +inline rng_t* deepgalois_rng() { + return static_cast(Context::rng_stream().generator()); +} + +void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { + boost::bernoulli_distribution random_distribution(p); + boost::variate_generator > + variate_generator(deepgalois_rng(), random_distribution); + for (size_t i = 0; i < n; ++i) + r[i] = variate_generator(); +} +*/ namespace deepgalois { + +inline uint8_t bernoulli(float_t p) { + //return uniform_rand(float_t(0), float_t(1)) > p ? 1 : 0; + return distribution(generator) > p ? 1 : 0; +} + namespace math { //! 
wrapper function to call cblas_sgemm @@ -41,8 +65,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #ifdef USE_MKL const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; - printf("Calling Intel MKL\n"); - exit(1); + //printf("Calling Intel MKL\n"); exit(1); mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta , C, &N); @@ -168,55 +191,40 @@ void clear(vec_t& in) { } void clear_cpu(size_t n, float_t* in) { - for (size_t i = 0; i < n; i++) in[i] = 0; + //for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in+n, 0); // memset(in, 0, n*sizeof(float_t)); } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, vec_t& out) { - assert(masks.size() == out.size()); - // rng_bernoulli(1. - dropout_rate, masks); // Create random numbers - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; +void dropout(size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; } -void dropout(const float scale, const float dropout_rate, const vec_t& in, - std::vector& masks, float_t* out) { - for (size_t i = 0; i < in.size(); ++i) - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - for (size_t i = 0; i < in.size(); ++i) - out[i] = in[i] * masks[i] * scale; -} - -void dropout_cpu(size_t n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - masks[i] = deepgalois::bernoulli(dropout_rate)?1:0; - out[i] = in[i] * masks[i] * scale; +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < n*m; ++i) + masks[i] = deepgalois::bernoulli(dropout_rate); + galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("dropout")); } -void d_dropout(const float scale, const vec_t& in_diff, - std::vector& masks, vec_t& out_diff) { - for (size_t i = 0; i < in_diff.size(); ++i) - out_diff[i] = in_diff[i] * masks[i] * scale; +void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { + for (size_t i = 0; i < m; ++i) + out[i] = in[i] * (float_t)masks[i] * scale; } -void d_dropout_cpu(size_t n, const float scale, const float_t* in, - unsigned* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] * masks[i] * scale; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* masks, float_t* out) { + galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("d_dropout")); } -void relu(const vec_t& in, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - out[i] = std::max(in[i], (float_t)0); - } -} - void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { @@ -373,15 +381,6 @@ void matmul1D1D(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_cpu(CblasNoTrans, CblasNoTrans, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } 
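Stepping back from the individual hunks: with MKL enabled, aggregation becomes a single sparse-times-dense product C = A_hat * X, where the graph's CSR structure supplies the sparsity pattern and the per-edge norm_factor values supply the nonzeros. Since norm_factor_counting (in the context.cpp hunk above) stores 1/sqrt(deg(i) * deg(j)) for edge (i, j), A_hat is the symmetric GCN normalization D^{-1/2} A D^{-1/2}; for example, an edge joining a degree-4 and a degree-1 vertex gets weight 1/sqrt(4 * 1) = 0.5. The vertex-based fallback keeps one 1/sqrt(deg(v)) per vertex and multiplies the two endpoint factors during the gather, which produces the same edge weights. The call shape used by update_all_csrmm in the aggregator hunk above is:

    // out = 1.0 * A_hat * in + 0.0 * out, with norm_factor as the nonzero values of A_hat
    math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor,
                    (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out);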
-// TODO make parallel -void transpose(size_t x, size_t y, const vec_t& in, vec_t& out) { - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j * y + i]; - } - } -} - // TODO make parallel void transpose(size_t x, size_t y, const float_t* in, float_t* out) { for (size_t i = 0; i < y; i++) { @@ -390,51 +389,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } } -} // deepgalois -} // math - - - -// vector subtract -void vsub(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = in_a[i] - in_b[i]; -} - -// vector multiply -void vmul(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) - out[i] = in_a[i] * in_b[i]; -} - -// vector divide -void vdiv(const vec_t& in_a, const vec_t& in_b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) { - assert(in_b[i] != 0); - out[i] = in_a[i] / in_b[i]; - } -} - -// vector add scalar -void add_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] += alpha; -} - -// vector subtract scalar -void sub_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) - Y[i] -= alpha; -} - - -// vector divide scalar -void div_scalar(const float_t alpha, vec_t& Y) { - assert(alpha != 0); - for (size_t i = 0; i < Y.size(); ++i) - Y[i] /= alpha; -} - // matrix-vector multiply void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { @@ -445,143 +399,14 @@ void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, } } -// vector-vector multiply -void vvmul(const vec_t& a, const vec_t& b, tensor_t& out) { - size_t m = a.size(); - size_t n = b.size(); - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out[i][j] += a[i] * b[j]; - } - } -} - -// matrix addition -void matadd(size_t x, size_t y, const tensor_t& A, const tensor_t& B, - tensor_t& C) { - for (size_t i = 0; i < x; ++i) - for (size_t j = 0; j < y; ++j) - C[i][j] = A[i][j] + B[i][j]; -} - -// TODO: vectorize -void copy2D1D(const tensor_t& in, vec_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - auto ptr = &out[0]; - for (size_t i = 0; i < x; i++) { - std::copy(in[i].begin(), in[i].end(), ptr); - ptr += y; - } -} - - - -void matmul2D(const tensor_t& A, const tensor_t& B, tensor_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(C.size() == dim_x); - assert(B.size() == dim_z); - assert(B[0].size() == dim_y); - - for (size_t i = 0; i < dim_x; ++i) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = 0; - for (size_t k = 0; k < dim_z; ++k) { - C[i][j] += A[i][k] * B[k][j]; - } - } - } -} - - -void matmul2D1D(const size_t dim_y, const tensor_t& A, const vec_t& B, - vec_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = A.size(); - size_t dim_z = A[0].size(); - assert(B.size() == dim_z * dim_y); - assert(C.size() == dim_x * dim_y); - vec_t A1D(dim_x * dim_z); - copy2D1D(A, A1D); - deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C[0]); -} - -void matmul(const tensor_t& A, const vec_t& B, tensor_t& C) { - // A: x*z; B: z*y; C: x*y - size_t dim_x = C.size(); - size_t dim_y = C[0].size(); - size_t dim_z = A[0].size(); - assert(A.size() == dim_x); - assert(B.size() == dim_y * dim_z); - vec_t A1D(dim_x * dim_z); - vec_t C1D(dim_x * dim_y, 0); - auto ptr = &A1D[0]; - for (size_t i = 0; i < dim_x; i++) { - 
std::copy(A[i].begin(), A[i].end(), ptr); - ptr += dim_z; - } - deepgalois::math::matmul1D1D(dim_x, dim_y, dim_z, &A1D[0], &B[0], &C1D[0]); - for (size_t i = 0; i < dim_x; i++) { - for (size_t j = 0; j < dim_y; ++j) { - C[i][j] = C1D[i * dim_y + j]; - } - } -} - -void transpose2D(const tensor_t& in, tensor_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i][j] = in[j][i]; - } - } -} - -// TODO: vectorize -void transpose2D1D(const tensor_t& in, vec_t& out) { - size_t x = in.size(); - size_t y = in[0].size(); - assert(out.size() == x * y); - for (size_t i = 0; i < y; i++) { - for (size_t j = 0; j < x; j++) { - out[i * x + j] = in[j][i]; - } - } -} - - -int argmax(const size_t n, const vec_t& x) { - float_t max = x[0]; - int max_ind = 0; - for (size_t i = 1; i < n; i++) { - if (x[i] > max) { - max_ind = i; - max = x[i]; - } +float reduce_mean(size_t n, const float_t* x) { + float_t sum = 0.; + for (size_t i = 0; i < n; i++) { + sum += (float_t)x[i]; } - return max_ind; -} - -void d_mvmul(vec_t& in_diff, vec_t& h_in, tensor_t& out_diff) { - vvmul(h_in, in_diff, out_diff); // transposed feature matrix X^T times in_diff + return sum / (float_t)n; } -void d_vadd(vec_t& in_diff, vec_t& out_diff) { - for (size_t i = 0; i < out_diff.size(); ++i) - out_diff[i] = in_diff[i]; -} - -float reduce_mean(const vec_t& x) { - size_t n = x.size(); - assert(n > 0); - float sum = (float)x[0]; - for (size_t i = 1; i < n; i++) { - sum += (float)x[i]; - } - return sum / (float)n; -} +} // end namespace math +} // end namespace deepgalois From 59e506f04fa2ad91e05c17dc81cb281ca0006d01 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 29 Apr 2020 20:20:03 -0500 Subject: [PATCH 212/660] clean math --- .../include/deepgalois/math_functions.hh | 58 +++---- libdeepgalois/src/context.cpp | 5 +- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +- libdeepgalois/src/lgraph.cpp | 28 ++-- libdeepgalois/src/math_functions.cpp | 143 +++++------------- libdeepgalois/src/sampler.cpp | 2 + 7 files changed, 94 insertions(+), 153 deletions(-) diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 05a63ee9ca..a39e463ecc 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -21,15 +21,37 @@ extern "C" { namespace deepgalois { namespace math { + +// single-precision dense matrix multiply +void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, + const int M, const int N, const int K, const float alpha, + const float* A, const float* B, const float beta, float* C); + +// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nonzero_idx, + const float* B, const float beta, float* C); + +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y); + //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); //! 
multiply n elements of vector by scalar -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out); +void scal(size_t n, const float_t alpha, float_t* x); +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y); +void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); //! do dot product of 2 vectors -float_t dot(const vec_t& x, const vec_t& y); +float_t dot(size_t n, const float_t* x, const float_t* y); + +// SAXPY stands for โ€œSingle-precision A*X Plus Y" void axpy(size_t n, const float_t a, float_t *x, float_t *y); + +// Returns the index of the maximum value int argmax(const size_t n, const float_t* x); // the arguments of the maxima //! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 @@ -38,60 +60,42 @@ float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector void clear_cpu(size_t n, float_t* in); +//! copy vector from in -> out; first len elements +void copy_cpu(size_t len, const float_t* in, float_t* out); + // dropout functions randomly remove weights void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); // dropout derivative: use existing dropouts in masks instead of generating them; void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); -//! ReLU = keep if positive +//! ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise void relu_cpu(size_t n, const float_t* in, float_t* out); - -//! ReLU derivative; generally, 1 if data > 0, 0 otherwise void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); +// Leaky ReLU void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax -void softmax(const vec_t& input, vec_t& output); void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); -float_t cross_entropy(const vec_t& y, const vec_t& p); +// Cross entropy float_t cross_entropy(size_t n, const float_t* y, const float_t* p); -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d); void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid -void sigmoid(const vec_t& input, vec_t& output); void sigmoid(size_t n, const float_t* input, float_t* output); -void d_sigmoid(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp); void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); -//! 
copy vector from in -> out; first len elements -void copy_cpu(size_t len, const float_t* in, float_t* out); - -// single-precision dense matrix multiply -void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, - const int M, const int N, const int K, const float alpha, - const float* A, const float* B, const float beta, float* C); - -// single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse -void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* C); - // dropout functions randomly remove weights void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in, float_t *out); - + } // math } // deepgalois diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index efbc525a32..37c9a33e04 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -142,8 +142,9 @@ void Context::norm_factor_counting(size_t g_size) { auto subg = getSubgraphPointer(); if (use_subgraph) g = subg; g->degree_counting(); + if (norm_factor != NULL) free(norm_factor); #ifdef USE_MKL - if (norm_factor == NULL) norm_factor = new float_t[g->sizeEdges()]; + norm_factor = new float_t[g->sizeEdges()]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { float_t c_i = std::sqrt(float_t(g->get_degree(i))); for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { @@ -154,7 +155,7 @@ void Context::norm_factor_counting(size_t g_size) { } }, galois::loopname("NormCountingEdge")); #else - if (norm_factor == NULL) norm_factor = new float_t[g_size]; + norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index bd76b8b99b..e3d6f12f78 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -22,9 +22,9 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou if (norm) { // normalize b as well b = a * norm_factor[dst]; - vec_t neighbor(len); + float_t* neighbor = new float_t[len]; // scale the neighbor's data using the normalization factor - math::mul_scalar(len, b, &in[dst * len], &neighbor[0]); + math::scale(len, b, &in[dst * len], neighbor); // use scaled data to update; out[src] += in[dst] math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c5c73b1f36..31622e0699 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -61,8 +61,8 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { float_t *a = new float_t[len]; float_t *b = new float_t[len]; - math::mvmul(n, len, &Q[0], self, a); - math::mvmul(n, len, &W[0], neighbors, b); + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 
0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors } @@ -72,8 +72,7 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon - layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, - y * z); + layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); layer::syncSub = new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index ba4432aca9..6c36eb464b 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,5 +1,6 @@ #include "deepgalois/lgraph.h" #include "deepgalois/utils.h" +#include "galois/Galois.h" #include #include #include @@ -22,24 +23,33 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } void LearningGraph::allocateFrom(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + rowptr_[0] = 0; } void LearningGraph::constructNodes() { } void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { + rowptr_[vid+1] = row_end; } void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; } void LearningGraph::degree_counting() { -/* - degrees = new uint32_t[num_vertices_]; - galois::do_all(galois::iterate(begin(), end()), [&] (auto v) { - degrees[v] = std::distance(this->edge_begin(v), this->edge_end(v)); + if (degrees_ != NULL) return; + degrees_ = new index_t[num_vertices_]; + galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { + degrees_[v] = rowptr_[v+1] - rowptr_[v]; }, galois::loopname("DegreeCounting")); -*/ } void LearningGraph::readGraph(std::string path, std::string dataset) { @@ -76,15 +86,15 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); uint64_t nv = le64toh(*fptr++); - uint64_t numEdges = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); uint64_t *outIdx = fptr; fptr += nv; uint32_t *fptr32 = (uint32_t*)fptr; uint32_t *outs = fptr32; - fptr32 += numEdges; - if (numEdges % 2) fptr32 += 1; + fptr32 += ne; + if (ne % 2) fptr32 += 1; num_vertices_ = nv; - num_edges_ = numEdges; + num_edges_ = ne; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 4c0354cccc..08ceaab76c 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -74,28 +74,25 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, #endif } +// matrix-vector multiply +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, + const float* A, const float* x, const float beta, float* y) { + cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); +} + +/* const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add -#if defined(__AVX__) || defined(__AVX2__) void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { +#ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) 
_mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -} - -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - size_t n = out.size(); - vadd_cpu(n, &a[0], &b[0], &out[0]); -} #else -void vadd(const vec_t& a, const vec_t& b, vec_t& out) { - for (size_t i = 0; i < out.size(); ++i) out[i] = a[i] + b[i]; -} -void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; -} #endif +} #if defined(__AVX__) || defined(__AVX2__) void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { @@ -107,7 +104,6 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) } // SAXPY stands for โ€œSingle-precision A*X Plus Y" -/* void axpy(size_t n, const float_t a, float_t *x, float_t *y) { const size_t alignedN = n - n % vec_len; const __m256 alpha = _mm256_set1_ps(a); @@ -128,27 +124,32 @@ float_t l2_norm(size_t n, const float_t* in) { __m256 sum = _mm256_hadd_ps(vsum, vsum); return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; } -*/ #else // vector multiply scalar -void mul_scalar(const float_t alpha, vec_t& Y) { - for (size_t i = 0; i < Y.size(); ++i) Y[i] *= alpha; -} - void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } -//void axpy(size_t n, const float_t a, float_t *x, float_t *y) { -// for (size_t i = 0; i < n; ++i) y[i] = a * x[i] + y[i]; -//} - -//float_t l2_norm(size_t n, const float_t* a) { -// float_t sum = 0.0; -// for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; -// return sum / 2.0; -//} +float_t l2_norm(size_t n, const float_t* a) { + float_t sum = 0.0; + for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; + return sum / 2.0; +} #endif +*/ + +void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { + vsAdd(n, a, b, y); +} + +void scal(size_t n, const float_t alpha, float_t* x) { + cblas_sscal(n, alpha, x, 1); +} + +void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { + cblas_scopy(n, x, 1, y, 1); + cblas_sscal(n, alpha, y, 1); +} void axpy(size_t n, const float_t a, float_t *x, float_t *y) { cblas_saxpy(n, a, x, 1, y, 1); @@ -166,28 +167,14 @@ int argmax(const size_t n, const float_t* x) { return max_ind; } +// l2 normalization float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } // dot product -float_t dot(const vec_t& x, const vec_t& y) { - float_t sum = 0; - for (size_t i = 0; i < x.size(); ++i) - sum += x[i] * y[i]; - return sum; -} - float_t dot(size_t n, const float_t* x, const float_t* y) { - float_t sum = 0; - for (size_t i = 0; i < n; ++i) - sum += x[i] * y[i]; - return sum; -} - -void clear(vec_t& in) { - for (size_t i = 0; i < in.size(); i++) - in[i] = 0; + return cblas_sdot(n, x, 1, y, 1); } void clear_cpu(size_t n, float_t* in) { @@ -255,17 +242,6 @@ void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } -void softmax(const vec_t& input, vec_t& output) { - const float_t max = *std::max_element(input.begin(), input.end()); - float_t denominator(0); - for (size_t i = 0; i < input.size(); i++) { - output[i] = std::exp(input[i] - max); - denominator += output[i]; - } - for (size_t i = 0; i < input.size(); i++) - output[i] /= denominator; -} - void softmax(size_t n, const float_t* input, float_t* output) { const float_t max = 
*std::max_element(input, input + n); float_t denominator(0); @@ -277,20 +253,6 @@ void softmax(size_t n, const float_t* input, float_t* output) { output[i] /= denominator; } -void d_softmax(const vec_t& y, const vec_t& p, vec_t& dy, const vec_t& dp) { - auto n = y.size(); - vec_t df(n, 0); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - // float_t delta_ij = i == j? 1 : 0; - // df[i] += p[j] * (delta_ij - p[i]); - df[j] = (j == i) ? p[i] * (float_t(1) - p[i]) : -p[j] * p[i]; - } - // dy = dp * (gradient of softmax) - dy[i] = dot(dp, df); - } -} - void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { vec_t df(n, 0); @@ -305,20 +267,6 @@ void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, // cross-entropy loss function for multi-class classification // y: ground truth // p: predicted probability -float_t cross_entropy(const vec_t& y, const vec_t& p) { - auto n = y.size(); - assert(n > 0); - float_t loss = 0.0; - for (size_t i = 0; i < n; i++) { - if (y[i] == float_t(0)) - continue; - if (p[i] == float_t(0)) - loss -= y[i] * std::log(float_t(1e-10)); - else loss -= y[i] * std::log(p[i]); - } - return loss; -} - float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { float_t loss = 0.0; for (size_t i = 0; i < n; i++) { @@ -332,13 +280,6 @@ float_t cross_entropy(size_t n, const float_t* y, const float_t* p) { return loss; } -void d_cross_entropy(const vec_t& y, const vec_t& p, vec_t& d) { - auto n = y.size(); - for (size_t i = 0; i < n; i++) { - d[i] = -y[i] / (p[i] + float_t(1e-10)); - } -} - void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { for (size_t i = 0; i < n; i++) { d[i] = -y[i] / (p[i] + float_t(1e-10)); @@ -350,11 +291,6 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } // Sigmoid -void sigmoid(const vec_t& in, vec_t &out) { - for (size_t i = 0; i < in.size(); ++i) - out[i] = sigmoid_func(in[i]); -} - void sigmoid(size_t n, const float_t* in, float_t* out) { for (size_t i = 0; i < n; i++) { out[i] = 1. / (1. 
+ expf(-in[i])); @@ -367,12 +303,10 @@ void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const } } -void copy1D1D(const vec_t& in, vec_t& out) { - std::copy(in.begin(), in.end(), &out[0]); -} - -void copy_cpu(size_t len, const float_t* in, float_t* out) { - std::copy(in, in + len, out); +void copy_cpu(size_t n, const float_t* in, float_t* out) { + //std::copy(in, in + n, out); + //memcpy(out, in, sizeof(float_t) * n); + cblas_scopy(n, in, 1, out, 1); } // num rows in A, C; num columns in B, C; num columns in A, rows in B @@ -390,15 +324,6 @@ void transpose(size_t x, size_t y, const float_t* in, float_t* out) { } } -// matrix-vector multiply -void mvmul(size_t m, size_t n, const float_t *matrix, const float_t *in_vector, float_t *out_vector) { - for (size_t i = 0; i < m; ++i) { - for (size_t j = 0; j < n; ++j) { - out_vector[i] += matrix[i * n + j] * in_vector[j]; - } - } -} - float reduce_mean(size_t n, const float_t* x) { float_t sum = 0.; for (size_t i = 0; i < n; i++) { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a1b1e4feba..f1e4238a84 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -54,6 +54,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su } } }, galois::loopname("gen_subgraph")); + sub.degree_counting(); #endif } @@ -143,6 +144,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { j ++; } }, galois::loopname("construct_graph")); + sub.degree_counting(); #endif } From 14461e39772ebbabea3920fae396e31a7facba50 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 15:56:14 -0500 Subject: [PATCH 213/660] udapte cmake --- CMakeLists.txt | 4 +++- libdeepgalois/CMakeLists.txt | 4 ++++ libdeepgalois/src/math_functions.cpp | 1 + lonestargnn/CMakeLists.txt | 1 + 4 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index dc0250a3f4..a56a1702e9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,9 @@ if(USE_VTUNE) endif() if(USE_MKL_BLAS) - SET(MKL_ROOT /opt/apps/sysnet/intel/17.0/mkl) + SET(INTEL_ROOT /opt/apps/sysnet/intel/17.0) + SET(MKL_ROOT ${INTEL_ROOT}/mkl) + SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) find_package(MKL) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2ede00abbc..3c41d945cb 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -1,12 +1,16 @@ cmake_minimum_required(VERSION 2.8) +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) set(BLAS_LIB "-lopenblas -lpthread") if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") SET(BLAS_INC_DIR ${MKL_ROOT}/include) SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") endif() diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 08ceaab76c..e254839bed 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -63,6 +63,7 @@ void csrmm_cpu(const int M, const int N, const int K, const int nnz, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float* C) 
{ #ifdef USE_MKL + mkl_set_num_threads(56); const char *matdescra = "GXXCX";//6 bytes const char transa = 'N'; //printf("Calling Intel MKL\n"); exit(1); diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 24c9c6a726..6db3877a6f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -10,6 +10,7 @@ if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() link_directories(${BLAS_LIB_DIR}) +link_directories(${INTEL_LIBS_DIR}) if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") From 6efe3bbb663fae737d4c19592ae7fecf9e751d9e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 16:10:05 -0500 Subject: [PATCH 214/660] fix compile --- libdeepgalois/src/math_functions.cpp | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index e254839bed..968f477f63 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -81,8 +81,8 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float a cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } +const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 /* -const size_t vec_len = 8; // for 32-bit floating point in AVX2 // vector add void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { #ifdef __AVX2__ @@ -140,7 +140,18 @@ float_t l2_norm(size_t n, const float_t* a) { */ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { +#ifdef USE_MKL vsAdd(n, a, b, y); +#else +#ifdef __AVX2__ + const size_t alignedN = n - n % vec_len; + for (size_t i = 0; i < alignedN; i += vec_len) + _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) y[i] = a[i] + b[i]; +#else + for (size_t i = 0; i < n; ++i) y[i] = a[i] + b[i]; +#endif +#endif } void scal(size_t n, const float_t alpha, float_t* x) { From e1e037832b03a94048938ab9fdbfad6f911b0c4a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:47:20 -0500 Subject: [PATCH 215/660] fix gpu --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/context.h | 14 ++-- libdeepgalois/include/deepgalois/lgraph.h | 6 +- .../include/deepgalois/math_functions.hh | 8 +- libdeepgalois/include/deepgalois/net.h | 16 ++-- libdeepgalois/src/context.cpp | 6 +- libdeepgalois/src/layers/graph_conv_layer.cu | 2 +- libdeepgalois/src/lgraph.cpp | 1 + libdeepgalois/src/lgraph.cu | 48 ++++++----- libdeepgalois/src/math_functions.cpp | 8 +- libdeepgalois/src/math_functions.cu | 39 +++++---- libdeepgalois/src/net.cpp | 82 +++++++++++-------- lonestargnn/gcn/gcn.cpp | 12 +-- 13 files changed, 134 insertions(+), 110 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3c41d945cb..7afa6c9169 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -109,7 +109,7 @@ set(sources ) endif() -set(BOOST_LIBRARIES "-lboost_system -lboost_thread") +#set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ffbaecb0d3..ff324ef60f 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 
+6,7 @@ #include #include #include "deepgalois/types.h" -#include +//#include #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -16,13 +16,10 @@ namespace deepgalois { -using boost::shared_ptr; - class Context { public: Context(); ~Context(); - static Context& Get(); size_t read_graph(std::string dataset_str, bool selfloop); size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); @@ -69,7 +66,7 @@ class Context { inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif - +/* // This random number generator facade hides boost and CUDA rng // implementation from one another (for cross-platform compatibility). class RNG { @@ -81,9 +78,10 @@ class Context { void* generator(); private: class Generator; - shared_ptr generator_; + boost::shared_ptr generator_; }; + static Context& Get(); // Getters for boost rng, curand, and cublas handles inline static RNG& rng_stream() { if (!Get().random_generator_) { @@ -91,7 +89,7 @@ class Context { } return *(Get().random_generator_); } - +*/ protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -108,7 +106,7 @@ class Context { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factor; // normalization constant based on graph structure - shared_ptr random_generator_; + //boost::shared_ptr random_generator_; #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index bf3ace2470..733c6620d8 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,7 +1,7 @@ #pragma once #include "deepgalois/types.h" #include -#include +//#include namespace deepgalois { @@ -17,8 +17,8 @@ class LearningGraph { edata_t *edge_data_; public: - //typedef index_t* iterator; - using iterator = boost::counting_iterator; + typedef size_t iterator; + //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index a39e463ecc..53baa2ff0f 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -112,10 +112,10 @@ void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, float_t* out); // Leaky ReLU void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, const float_t* data, float_t* out_diff); // Leaky ReLU derivative -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out); // dropout -void d_dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, const unsigned* masks, float_t* out); // dropout derivative +void dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, const mask_t* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, 
const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 820367bef5..ba34687e22 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -5,18 +5,19 @@ #define _MODEL_H_ #include -#include "galois/Timer.h" #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" #include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/layers/graph_conv_layer.h" #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#ifdef CPU_ONLY #include "deepgalois/sampler.h" +#endif #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else +#include "deepgalois/gtypes.h" #include "deepgalois/DistContext.h" #endif @@ -40,8 +41,10 @@ class Net { void init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_size = 0, unsigned subg_sample = 0, - Graph* dGraph = NULL); + unsigned neigh_sample_size = 0, unsigned subg_sample = 0); +#ifdef GALOIS_USE_DIST + void dist_init(Graph* dGraph); +#endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -55,7 +58,7 @@ class Net { void train(optimizer* opt, bool need_validate); // training double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference - void read_test_masks(std::string dataset, Graph* dGraph); + void read_test_masks(std::string dataset); acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation void bprop(); // back propogation void normalize(); // Scale gradient to counterbalance accumulation @@ -106,14 +109,15 @@ class Net { mask_t* subgraph_masks; // masks for subgraph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network - Sampler *sampler; #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else deepgalois::DistContext* context; + Graph* dGraph; #endif #ifdef CPU_ONLY + Sampler *sampler; // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 37c9a33e04..ffc2069024 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,10 +4,10 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" -#include +//#include namespace deepgalois { - +/* // Make sure each thread can have different values. 
static boost::thread_specific_ptr thread_instance_; @@ -17,7 +17,7 @@ Context& Context::Get() { } return *(thread_instance_.get()); } - +*/ #ifdef CPU_ONLY Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 41f6e30a0f..7edb4ab10c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,7 +8,7 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(unsigned))); + if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); init_const_gpu(x*y, 0.0, in_temp); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 6c36eb464b..891973e612 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -11,6 +11,7 @@ #include namespace deepgalois { + void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 14b7239358..0a925bbbdb 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,41 +1,47 @@ +#include "deepgalois/lgraph.h" +#include "deepgalois/cutils.h" +#include + +namespace deepgalois { void LearningGraph::dealloc() { assert(is_device); CUDA_CHECK(cudaFree(colidx_)); CUDA_CHECK(cudaFree(rowptr_)); CUDA_CHECK(cudaFree(degrees_)); - if (edge_data != NULL) CUDA_CHECK(cudaFree(edge_data)); - if (vertex_data != NULL) CUDA_CHECK(cudaFree(vertex_data)); + if (edge_data_ != NULL) CUDA_CHECK(cudaFree(edge_data_)); + if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(vertex_data_)); } -void LearningGraph::allocOnDevice(bool no_edge_data_) { - if (colidx_ != NULL) return true; - CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_type))); - CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_type))); - CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_type))); - //if (!no_edge_data_) CUDA_CHECK(cudaMalloc((void **) &edge_data_, num_edges_ * sizeof(edge_data__t))); - //CUDA_CHECK(cudaMalloc((void **) &vertex_data_, num_vertices_ * sizeof(vdata_t))); +void LearningGraph::allocOnDevice(bool no_edge_data__) { + if (colidx_ != NULL) return; + CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_t))); + //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); + //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; } void LearningGraph::copy_to_gpu(LearningGraph ©graph) { copygraph.init(num_vertices_, num_edges_); copygraph.allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_type), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyHostToDevice)); - //if (edge_data_ != NULL) 
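
The copy_to_gpu fix above follows the usual error-checked mirror-to-device pattern for a CSR graph. A self-contained sketch of that pattern is below; the CUDA_CHECK_SKETCH macro and function name are local to this example and are not the library's cutils.h versions.

#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>
#include <cstdint>

#define CUDA_CHECK_SKETCH(call)                                   \
  do {                                                            \
    cudaError_t err = (call);                                     \
    if (err != cudaSuccess) {                                     \
      fprintf(stderr, "CUDA error %s at %s:%d\n",                 \
              cudaGetErrorString(err), __FILE__, __LINE__);       \
      exit(1);                                                    \
    }                                                             \
  } while (0)

// Mirror host CSR arrays (rowptr has nv+1 entries, colidx has ne entries) on the device.
void copy_csr_to_device_sketch(size_t nv, size_t ne, const uint32_t* rowptr,
                               const uint32_t* colidx, uint32_t** d_rowptr,
                               uint32_t** d_colidx) {
  CUDA_CHECK_SKETCH(cudaMalloc((void**)d_rowptr, (nv + 1) * sizeof(uint32_t)));
  CUDA_CHECK_SKETCH(cudaMalloc((void**)d_colidx, ne * sizeof(uint32_t)));
  CUDA_CHECK_SKETCH(cudaMemcpy(*d_rowptr, rowptr, (nv + 1) * sizeof(uint32_t),
                               cudaMemcpyHostToDevice));
  CUDA_CHECK_SKETCH(cudaMemcpy(*d_colidx, colidx, ne * sizeof(uint32_t),
                               cudaMemcpyHostToDevice));
}
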
CUDA_CHECK(cudaMemcpy(copygraph.edge_data_, edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_, vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); + //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu(LearningGraph ©graph) { assert(is_device); - assert(copygraph.size() = num_vertices_); - assert(copygraph.sizeEdges() = num_edges_); - CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_type), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_type), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_type), cudaMemcpyDeviceToHost)); - //if (edge_data_ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data_ptr(), edge_data_, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data_ptr(), vertex_data_, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); + assert(copygraph.size() == num_vertices_); + assert(copygraph.sizeEdges() == num_edges_); + CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } +} diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 968f477f63..45fccaea04 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,10 +1,9 @@ -#include "deepgalois/math_functions.hh" -#include "galois/Timer.h" -#include "galois/Galois.h" #include #include -#include +#include "galois/Timer.h" +#include "galois/Galois.h" #include "deepgalois/utils.h" +#include "deepgalois/math_functions.hh" #ifdef USE_MKL #include @@ -23,6 +22,7 @@ extern "C" { std::default_random_engine generator; std::uniform_real_distribution distribution(0.0,1.0); /* +#include typedef boost::mt19937 rng_t; inline rng_t* deepgalois_rng() { return static_cast(Context::rng_stream().generator()); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index c1746d9075..62a7af3849 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -28,8 +28,8 @@ bool isnan_gpu(int n, const float_t *array) { return h_result; } -void gpu_rng_uniform(const int n, unsigned* r) { - CURAND_CHECK(curandGenerate(deepgalois::Context::curand_generator(), r, n)); 
+void gpu_rng_uniform(const int n, float_t* r) { + CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { @@ -78,30 +78,33 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } } -__global__ void dropout_kernel(const int n, const float scale, - const float threshold, const float_t* in, - unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } +__global__ void dropout_kernel(int n, float scale, float threshold, + float_t *rands, const float_t* in, + mask_t* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; + } } -void dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, unsigned* masks, float_t* out) { - gpu_rng_uniform(n, masks); - //std::cout << "[debug]: dropout_gpu\n"; +void dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, mask_t* masks, float_t* out) { + float_t *rands; + float_malloc_device(n, rands); + gpu_rng_uniform(n, rands); dropout_kernel<<>>( - n, scale, dropout_rate, in, masks, out); + n, scale, dropout_rate, rands, in, masks, out); CudaTest("solving dropout kernel failed"); - //std::cout << "[debug]: dropout_gpu done\n"; + float_free_device(rands); } -__global__ void d_dropout_kernel(const int n, const float scale, - const float threshold, const float_t* in, - const unsigned* masks, float_t* out) { - CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * (masks[i] > threshold) * scale; } +__global__ void d_dropout_kernel(int n, float scale, float threshold, + const float_t* in, const mask_t* masks, float_t* out) { + CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } -void d_dropout_gpu(const int n, const float scale, const float dropout_rate, - const float_t* in, const unsigned* masks, float_t* out) { +void d_dropout_gpu(int n, float scale, float dropout_rate, + const float_t* in, const mask_t* masks, float_t* out) { d_dropout_kernel<<>>( n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 7da9fcbb18..cc7ba738cc 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -2,6 +2,7 @@ * Based on the net.hpp file from Caffe deep learning framework. */ +#include "galois/Timer.h" #include "deepgalois/net.h" #include "deepgalois/utils.h" #include "deepgalois/math_functions.hh" @@ -11,7 +12,7 @@ namespace deepgalois { void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, Graph* dGraph) { + unsigned neigh_sz, unsigned subg_sz) { assert(num_conv > 0); num_conv_layers = num_conv; num_epochs = epochs; @@ -32,21 +33,14 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - context->set_label_class(is_single_class); - context->set_use_subgraph(subgraph_sample_size > 0); num_samples = context->read_graph(dataset_str, selfloop); - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); -#else - context = new deepgalois::DistContext(); - num_samples = dGraph->size(); - context->saveGraph(dGraph); - // TODO self loop setup? 
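
A host-side sketch of the semantics the rewritten dropout_gpu / d_dropout_gpu kernels above implement: each unit is kept when its uniform draw exceeds the dropout rate (so keep probability is 1 - rate), kept activations are multiplied by the scale factor (typically 1/(1 - rate) for inverted dropout, an assumption here), and the backward pass reuses the saved mask. Names below are illustrative only.

#include <cstdint>
#include <cstddef>
#include <random>

// Forward: mask[i] = 1 with probability (1 - rate); out = in * mask * scale.
void dropout_forward_sketch(size_t n, float rate, float scale, const float* in,
                            uint8_t* mask, float* out, std::mt19937& gen) {
  std::uniform_real_distribution<float> uniform(0.0f, 1.0f);
  for (size_t i = 0; i < n; ++i) {
    mask[i] = uniform(gen) > rate ? 1 : 0;   // keep with probability 1 - rate
    out[i]  = in[i] * mask[i] * scale;
  }
}

// Backward: gradients flow only through kept units, with the same scaling.
void dropout_backward_sketch(size_t n, float scale, const float* grad_out,
                             const uint8_t* mask, float* grad_in) {
  for (size_t i = 0; i < n; ++i)
    grad_in[i] = grad_out[i] * mask[i] * scale;
}
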
- context->initializeSyncSubstrate(); + context->set_label_class(is_single_class); #endif // read graph, get num nodes num_classes = context->read_labels(dataset_str); +#ifndef GALOIS_USE_DIST //std::cout << "Reading label masks ... "; train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; @@ -59,31 +53,13 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below -#ifndef GALOIS_USE_DIST for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; -#else - // find local ID from global ID, set if it exists - for (size_t i = train_begin; i < train_end; i++) { - if (dGraph->isLocal(i)) { - train_masks[dGraph->getLID(i)] = 1; - } - } - for (size_t i = val_begin; i < val_end; i++) { - if (dGraph->isLocal(i)) { - val_masks[dGraph->getLID(i)] = 1; - } - } -#endif } else { -#ifndef GALOIS_USE_DIST train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); -#else - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); -#endif } +#endif if (subgraph_sample_size > train_count) { galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); @@ -108,13 +84,53 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); -#ifndef CPU_ONLY +#ifdef CPU_ONLY + context->set_use_subgraph(subgraph_sample_size > 0); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); +#else copy_masks_device(num_samples, train_masks, d_train_masks); copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif } +#ifdef GALOIS_USE_DIST +void Net::dist_init(Graph* graph) { + dGraph = graph; + context = new deepgalois::DistContext(); + num_samples = dGraph->size(); + context->saveGraph(dGraph); + // TODO self loop setup? + context->initializeSyncSubstrate(); + + //std::cout << "Reading label masks ... 
"; + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); + + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + // find local ID from global ID, set if it exists + for (size_t i = train_begin; i < train_end; i++) { + if (dGraph->isLocal(i)) { + train_masks[dGraph->getLID(i)] = 1; + } + } + for (size_t i = val_begin; i < val_end; i++) { + if (dGraph->isLocal(i)) { + val_masks[dGraph->getLID(i)] = 1; + } + } + } else { + train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + } +} +#endif + void Net::train(optimizer* opt, bool need_validate) { std::string header = ""; std::string seperator = " "; @@ -396,7 +412,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, layers[layer_id]->set_graph_ptr(context->getGraphPointer()); } -void Net::read_test_masks(std::string dataset, Graph* dGraph) { +void Net::read_test_masks(std::string dataset) { test_masks = new mask_t[num_samples]; if (dataset == "reddit") { test_begin = 177262; @@ -430,7 +446,7 @@ void Net::read_test_masks(std::string dataset, Graph* dGraph) { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -479,7 +495,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph) { +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { auto preds = layers[num_conv_layers]->next()->get_data(); auto ground_truth = context->get_labels_ptr(); return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index fa492172a5..62a8067294 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -18,17 +18,17 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); deepgalois::Net network; // the neural network to train - deepgalois::Graph* dGraph = NULL; #ifdef GALOIS_USE_DIST std::vector dummyVec; - dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph); #endif // read network, features, ground truth, initialize metadata network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, dGraph); + neighbor_sample_sz, subgraph_sample_sz); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); @@ -47,11 +47,7 @@ int main(int argc, char** argv) { if (do_test) { // test using test samples galois::gPrint("\n"); -#ifndef GALOIS_USE_DIST - network.read_test_masks(dataset, NULL); -#else - 
network.read_test_masks(dataset, dGraph); -#endif + network.read_test_masks(dataset); galois::StatTimer Ttest("Test"); Ttest.start(); acc_t test_loss = 0.0, test_acc = 0.0; From c487a8d4ee76f69fc35a59c6cdbd16b89e4bac81 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:55:16 -0500 Subject: [PATCH 216/660] fix cpu --- libdeepgalois/include/deepgalois/net.h | 4 ++-- libdeepgalois/src/net.cpp | 5 ++--- libdeepgalois/src/net.cu | 4 ++-- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index ba34687e22..5c32292430 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -119,8 +119,8 @@ class Net { #ifdef CPU_ONLY Sampler *sampler; // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, Graph* dGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #else acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index cc7ba738cc..86bd0f6340 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -270,11 +270,10 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); - auto g = context->getGraphPointer(); if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, g); + acc = masked_accuracy(begin, end, count, masks); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, g); + acc = masked_multi_class_accuracy(begin, end, count, masks); } t_eval.Stop(); return t_eval.Millisecs(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 3077566512..6ead99d31a 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -144,14 +144,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, CSRGraph *g) { + mask_t* masks) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers - 1]->next()->get_data(), context->get_labels_ptr()); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, CSRGraph* g) { + mask_t* masks) { return masked_f1_score_gpu(num_classes, begin, end, count, masks, layers[num_conv_layers]->next()->get_data(), context->get_labels_ptr()); From 15209f42111b76068278ad188596bc1ef94a4919 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Thu, 30 Apr 2020 19:58:52 -0500 Subject: [PATCH 217/660] fix gpu --- libdeepgalois/include/deepgalois/net.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 5c32292430..fe5eaa8aac 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -122,8 +122,8 @@ class Net { acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, 
mask_t* masks); #else - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, CSRGraph *gGraph); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #endif }; From 2491650e212dab17fe0b4e5c05ad27a4e7f043d2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 11:51:08 -0500 Subject: [PATCH 218/660] fix gpu --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/math_functions.hh | 2 +- libdeepgalois/src/math_functions.cu | 4 ++-- lonestargnn/CMakeLists.txt | 3 ++- 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7afa6c9169..69a6e7fa40 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -46,6 +46,7 @@ else() #set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; -G -Xcompiler -rdynamic) link_directories(${CUDA_HOME}/lib64) link_directories(${CMAKE_SOURCE_DIR}/libgpu) + message(STATUS "CUDA_LIB_DIR: ${CUDA_HOME}/lib64") set(CUDA_SOURCES src/layers/graph_conv_layer.cu diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 53baa2ff0f..fc9e798633 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -142,7 +142,7 @@ void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const float_t* out_data, float_t* diff); void scal_gpu(const int n, const float alpha, float* X); void add_scalar_gpu(const int n, const float_t alpha, float_t* Y); -void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r); +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r); bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 62a7af3849..1f9c020676 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -28,11 +28,11 @@ bool isnan_gpu(int n, const float_t *array) { return h_result; } -void gpu_rng_uniform(const int n, float_t* r) { +void gpu_rng_uniform(size_t n, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } -void rng_uniform_gpu(const int n, const float_t a, const float_t b, float_t* r) { +void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 6db3877a6f..0c313d742c 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -1,8 +1,10 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) +link_directories(${CUDA_HOME}/lib64) if(ENABLE_HETERO_GALOIS) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) + link_directories(${INTEL_LIBS_DIR}) endif() SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) @@ -10,7 +12,6 @@ if(USE_MKL_BLAS) SET(BLAS_LIB_DIR "${MKL_ROOT}/lib/intel64") endif() link_directories(${BLAS_LIB_DIR}) 
-link_directories(${INTEL_LIBS_DIR}) if(NOT ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") From 9ac1e7be7e0886818518f5715de9887ae59c8018 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 12:25:03 -0500 Subject: [PATCH 219/660] fix agg --- libdeepgalois/src/layers/aggregator.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index e3d6f12f78..0dec25c019 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,13 +4,13 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - // zero out the output data #ifndef GALOIS_USE_DIST galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif + // zero out the output data math::clear_cpu(len , &out[src * len]); float_t a = 0.0; float_t b = 0.0; @@ -22,9 +22,10 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou if (norm) { // normalize b as well b = a * norm_factor[dst]; - float_t* neighbor = new float_t[len]; + //float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst * len], neighbor); + math::scale(len, b, &in[dst * len], &neighbor[0]); // use scaled data to update; out[src] += in[dst] math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); } else { From 24292beef51cebd4343e95cfa801aed0261ae4ea Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 14:51:52 -0500 Subject: [PATCH 220/660] update dropout --- libdeepgalois/src/math_functions.cpp | 63 ++++++++++++++++++++++++---- 1 file changed, 54 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 45fccaea04..ec43be8656 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include #include #include "galois/Timer.h" #include "galois/Galois.h" @@ -19,8 +22,6 @@ extern "C" { exit(1); \ } while(0); -std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0,1.0); /* #include typedef boost::mt19937 rng_t; @@ -36,15 +37,18 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { r[i] = variate_generator(); } */ + +std::default_random_engine generator; +std::uniform_real_distribution distribution(0.0,1.0); + namespace deepgalois { +namespace math { + inline uint8_t bernoulli(float_t p) { - //return uniform_rand(float_t(0), float_t(1)) > p ? 1 : 0; return distribution(generator) > p ? 1 : 0; } -namespace math { - //! 
wrapper function to call cblas_sgemm void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, @@ -81,6 +85,26 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float a cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } +inline void rng_uniform_cpu(size_t n, float_t* r) { +#ifdef USE_MKL + VSLStreamStatePtr stream; + // Initializing the streams + vslNewStream(&stream, VSL_BRNG_SOBOL, 1); + // Generating + vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0f, 1.0f); + // Deleting the streams + vslDeleteStream(&stream); +#else + for (size_t i = 0; i < n; ++i) { + r[i] = distribution(generator); + } + //galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + // unsigned short xi[3]; + // r[i] = erand48(xi); + //}, galois::loopname("randomMaskGen")); +#endif +} + const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 /* // vector add @@ -198,16 +222,37 @@ void clear_cpu(size_t n, float_t* in) { void dropout(size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); + masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < m; ++i) out[i] = in[i] * (float_t)masks[i] * scale; } void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { - for (size_t i = 0; i < n*m; ++i) - masks[i] = deepgalois::bernoulli(dropout_rate); - galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { + size_t len = n * m; +/* +#ifdef USE_MKL + vec_t rands(len); + rng_uniform_cpu(len, &rands[0]); + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { + masks[i] = rands[i] > dropout_rate ? 1 : 0; + }, galois::loopname("randomMaskGen")); +*/ +/* + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + auto idx = i * m; + vec_t rands(m); + rng_uniform_cpu(m, &rands[0]); + for (size_t j = 0; j < m; ++j) + masks[idx+j] = rands[j] > dropout_rate ? 
1 : 0; + }, galois::loopname("dropout")); +#else +*/ + for (size_t i = 0; i < len; ++i) { + masks[i] = bernoulli(dropout_rate); + } +//#endif + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, galois::loopname("dropout")); } From 4bafd7274a7231f26b9016bdd377abaa1672fb3d Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 15:32:56 -0500 Subject: [PATCH 221/660] udapte lgraph --- libdeepgalois/CMakeLists.txt | 1 + libdeepgalois/include/deepgalois/lgraph.h | 8 ++++++++ libdeepgalois/src/lgraph.cpp | 14 ++++++++++++++ 3 files changed, 23 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 69a6e7fa40..157e0151ad 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -85,6 +85,7 @@ set(sources src/DistContext.cpp src/optimizer.cpp src/sampler.cpp + src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 733c6620d8..315ec1145a 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -15,6 +15,7 @@ class LearningGraph { index_t *degrees_; vdata_t *vertex_data_; edata_t *edge_data_; + std::vector> mirrorNodes; public: typedef size_t iterator; @@ -51,6 +52,13 @@ class LearningGraph { void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); + + bool isLocal(index_t vid); + index_t getLID(index_t vid); + bool is_vertex_cut(); + std::vector>& getMirrorNodes(); + uint64_t numMasters(); + uint64_t globalSize(); }; } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 891973e612..3573a9627a 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,6 +12,20 @@ namespace deepgalois { +bool LearningGraph::isLocal(index_t vid) { return true; } + +index_t LearningGraph::getLID(index_t vid) { return 0; } + +bool LearningGraph::is_vertex_cut() {return true; } + +std::vector>& LearningGraph::getMirrorNodes() { + return mirrorNodes; +} + +uint64_t LearningGraph::numMasters() { return 0; } + +uint64_t LearningGraph::globalSize() { return 0; } + void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { const unsigned nsteps = 10; unsigned ineachstep = (maxii / nsteps); From f6fd899e1421b706aaab8d8095d9a4bf1b574cb9 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 1 May 2020 21:29:22 -0500 Subject: [PATCH 222/660] fix sampler bug --- .../include/deepgalois/DistContext.h | 6 +- libdeepgalois/include/deepgalois/context.h | 34 ++-------- .../include/deepgalois/layers/layer.h | 13 +++- libdeepgalois/include/deepgalois/net.h | 9 +-- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/context.cpp | 55 ++++++++++++---- libdeepgalois/src/context.cu | 12 ++-- libdeepgalois/src/layers/aggregator.cpp | 17 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 3 +- libdeepgalois/src/layers/graph_conv_layer.cu | 2 - .../src/layers/softmax_loss_layer.cpp | 9 +-- libdeepgalois/src/lgraph.cpp | 10 +-- libdeepgalois/src/net.cpp | 64 ++++++++++++------- libdeepgalois/src/sampler.cpp | 4 +- 14 files changed, 138 insertions(+), 102 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 04aca5fc9e..d7e368965a 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -26,7 +26,7 
@@ class DistContext { label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factor; // normalization constant based on graph structure + float_t* norm_factors; // normalization constant based on graph structure public: DistContext(); @@ -47,12 +47,12 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_counting(size_t g_size); + void norm_factor_computing(size_t g_size); void createSubgraph() {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} - float_t* get_norm_factor_ptr() { return norm_factor; } + float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer() { return subgraph_cpu; }; float_t* get_feats_ptr() { return h_feats; } diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index ff324ef60f..fc9748d952 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -31,12 +31,13 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label - float_t* get_norm_factor_ptr() { return norm_factor; } + float_t* get_norm_factors_ptr() { return norm_factors; } + float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_counting(size_t g_size); + void norm_factor_computing(bool is_subgraph); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N @@ -66,30 +67,7 @@ class Context { inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif -/* - // This random number generator facade hides boost and CUDA rng - // implementation from one another (for cross-platform compatibility). 
- class RNG { - public: - RNG(); - explicit RNG(unsigned int seed); - explicit RNG(const RNG&); - RNG& operator=(const RNG&); - void* generator(); - private: - class Generator; - boost::shared_ptr generator_; - }; - static Context& Get(); - // Getters for boost rng, curand, and cublas handles - inline static RNG& rng_stream() { - if (!Get().random_generator_) { - Get().random_generator_.reset(new RNG()); - } - return *(Get().random_generator_); - } -*/ protected: size_t n; // number of samples: N size_t num_classes; // number of classes: E @@ -105,8 +83,10 @@ class Context { label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factor; // normalization constant based on graph structure - //boost::shared_ptr random_generator_; + float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph + void alloc_norm_factor(); + void alloc_subgraph_norm_factor(); #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7f1c05ce60..0ffab6de41 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -72,6 +72,7 @@ class layer : public deepgalois::node { void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY @@ -79,7 +80,7 @@ class layer : public deepgalois::node { #else void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } #endif - void update_dim_size(size_t sg_size) { input_dims[0] = output_dims[0] = sg_size; } + void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { @@ -93,11 +94,15 @@ class layer : public deepgalois::node { begin_ = sample_begin; end_ = sample_end; count_ = sample_count; + use_mask = false; + if (masks != NULL) { + use_mask = true; #ifdef CPU_ONLY - masks_ = masks; + masks_ = masks; #else - d_masks_ = masks; + d_masks_ = masks; #endif + } } void add_edge() { @@ -151,6 +156,7 @@ class layer : public deepgalois::node { std::vector output_dims; // output dimentions std::string name_; // name of this layer bool trainable_; // is this layer trainable + bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x // 16, layer1: 16 x E @@ -162,6 +168,7 @@ class layer : public deepgalois::node { float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; + float_t* norm_consts; #ifdef CPU_ONLY Graph *graph_cpu; #else diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index fe5eaa8aac..f87b4e549a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -118,13 +118,10 @@ class Net { #ifdef CPU_ONLY Sampler *sampler; - // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); -#else - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks); #endif + // comparing outputs with the ground truth (labels) + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 174e7eb210..e53dc1c118 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,7 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_counting(size_t g_size) { +void DistContext::norm_factor_computing(size_t g_size) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index ffc2069024..bb0e67c818 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,12 +26,15 @@ Context::Context() : n(0), num_classes(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factor(NULL) {} + norm_factors(NULL) {} Context::~Context() { if (h_labels) delete h_labels; + if (h_labels_subg) delete h_labels_subg; if (h_feats) delete h_feats; - if (norm_factor) delete norm_factor; + if (h_feats_subg) delete h_feats_subg; + if (norm_factors) delete norm_factors; + if (norm_factors_subg) delete norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -137,30 +140,56 @@ void Context::add_selfloop(Graph &og, Graph &g) { //*/ } -void Context::norm_factor_counting(size_t g_size) { - auto g = getGraphPointer(); - auto subg = getSubgraphPointer(); - if (use_subgraph) g = subg; +void Context::alloc_norm_factor() { + Graph* g = getGraphPointer(); + if (norm_factors == NULL) +#ifdef USE_MKL + norm_factors = new float_t[g->sizeEdges()]; +#else + norm_factors = new float_t[g->size()]; +#endif +} + +void Context::alloc_subgraph_norm_factor() { + Graph* g = getSubgraphPointer(); + if (norm_factors_subg == NULL) +#ifdef USE_MKL + norm_factors_subg = new float_t[g->sizeEdges()]; +#else + norm_factors_subg = new float_t[g->size()]; +#endif +} + +void Context::norm_factor_computing(bool is_subgraph) { + Graph* g; + float_t *constants; + if (!is_subgraph) { + g = getGraphPointer(); + alloc_norm_factor(); + constants = norm_factors; + } else { + g = getSubgraphPointer(); + alloc_subgraph_norm_factor(); + constants = norm_factors_subg; + } + auto g_size = 
g->size(); g->degree_counting(); - if (norm_factor != NULL) free(norm_factor); #ifdef USE_MKL - norm_factor = new float_t[g->sizeEdges()]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { float_t c_i = std::sqrt(float_t(g->get_degree(i))); for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { const auto j = g->getEdgeDst(e); float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) norm_factor[e] = 0.0; - else norm_factor[e] = 1.0 / (c_i * c_j); + if (c_i == 0.0 || c_j == 0.0) constants[e] = 0.0; + else constants[e] = 1.0 / (c_i * c_j); } }, galois::loopname("NormCountingEdge")); #else - norm_factor = new float_t[g_size]; galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { auto degree = g->get_degree(v); float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) norm_factor[v] = 0.0; - else norm_factor[v] = 1.0 / temp; + if (temp == 0.0) constants[v] = 0.0; + else constants[v] = 1.0 / temp; }, galois::loopname("NormCountingVertex")); #endif } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 23abd3f1c2..0042f5420e 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -25,7 +25,7 @@ int64_t cluster_seedgen(void) { } // computing normalization factor for each vertex -__global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); if (temp == 0.0) norm_fac[i] = 0.0; @@ -35,7 +35,7 @@ __global__ void norm_factor_counting_node(int n, CSRGraph graph, float_t* norm_f // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_counting_edge(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); @@ -97,7 +97,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_counting(size_t g_size) { +void Context::norm_factor_computing(bool is_subgraph) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; @@ -107,14 +107,14 @@ void Context::norm_factor_counting(size_t g_size) { int nnz = graph_gpu.nedges; CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factor); - norm_factor_counting_edge<<>>( + norm_factor_computing_edge<<>>( n, graph_gpu, norm_factor); #else CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_counting_node<<>>( + norm_factor_computing_node<<>>( n, graph_gpu, norm_factor); #endif - CudaTest("solving norm_factor_counting kernel failed"); + CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } /* diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 0dec25c019..fc841a6361 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,14 +4,17 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + size_t n = g.size(); + //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST - galois::do_all(galois::iterate(size_t(0), g.size()),[&](const auto src) { + galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); galois::do_all(galois::iterate(rangeObj), [&](const auto src) { #endif + auto src_idx = src * len; // zero out the output data - math::clear_cpu(len , &out[src * len]); + math::clear_cpu(len , &out[src_idx]); float_t a = 0.0; float_t b = 0.0; // get normalization factor if needed @@ -19,21 +22,23 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou // gather neighbors' embeddings for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; if (norm) { // normalize b as well b = a * norm_factor[dst]; //float_t* neighbor = new float_t[len]; // this is super slow vec_t neighbor(len); // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst * len], &neighbor[0]); + math::scale(len, b, &in[dst_idx], &neighbor[0]); // use scaled data to update; out[src] += in[dst] - math::vadd_cpu(len, &out[src * len], &neighbor[0], &out[src * len]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); } else { // add embeddings from neighbors together; out[src] += in[dst] - math::vadd_cpu(len, &out[src * len], &in[dst * len], &out[src * len]); + math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); } } - }, galois::steal(), galois::no_stats(), galois::loopname("update_all")); + }, galois::steal(), galois::loopname("update_all")); } void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 31622e0699..e50d66f5ae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -40,7 +40,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure - float_t* norm_consts = context->get_norm_factor_ptr(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -50,7 +49,6 @@ void 
graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { - float_t* norm_consts = context->get_norm_factor_ptr(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else @@ -101,6 +99,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + //std::cout << "layer: " << name_ << "\n"; //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index 7edb4ab10c..ef62725da2 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -31,7 +31,6 @@ void graph_conv_layer::malloc_and_init() { } void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else @@ -40,7 +39,6 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { - float_t* norm_factor = context->get_norm_factor_ptr(); #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); #else diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f1c1aa27e4..4ae9c6364b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -31,11 +31,11 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked // output is normalized input for this layer math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax // one hot encoded vector for the labels - std::vector groundTruth(output_dims[1], 0.0); // ground truth + vec_t groundTruth(output_dims[1], 0.0); // ground truth groundTruth[get_label(i)] = 1.0; // one-hot // loss calculation loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); @@ -52,7 +52,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked vec_t norm_grad(len); std::vector groundTruth(len, 0.0); groundTruth[get_label(i)] = 1.0; @@ -74,11 +74,12 @@ acc_t softmax_loss_layer::get_prediction_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i]) { + if (!use_mask || masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); + //std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; 
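
For the masked softmax-loss layer above, a compact serial sketch of what is computed per masked row: a numerically stabilized softmax, the cross-entropy against a one-hot label, and the gradient with respect to the input logits, which mathematically reduces to softmax(x) - onehot(label) (the same quantity the layered d_cross_entropy/d_softmax chain produces). Function names are illustrative.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Numerically stable softmax over one row of logits (assumes x is non-empty).
void softmax_sketch(const std::vector<float>& x, std::vector<float>& p) {
  float m = *std::max_element(x.begin(), x.end());
  float sum = 0.0f;
  p.resize(x.size());
  for (size_t j = 0; j < x.size(); ++j) { p[j] = std::exp(x[j] - m); sum += p[j]; }
  for (size_t j = 0; j < x.size(); ++j) p[j] /= sum;
}

// Cross-entropy of predicted distribution p against a one-hot label.
float cross_entropy_sketch(const std::vector<float>& p, size_t label) {
  const float eps = 1e-10f;                 // guard against log(0)
  return -std::log(std::max(p[label], eps));
}

// Gradient of the loss w.r.t. the logits: softmax(x) - onehot(label).
void softmax_xent_grad_sketch(const std::vector<float>& p, size_t label,
                              std::vector<float>& grad) {
  grad = p;
  grad[label] -= 1.0f;
}
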
assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 3573a9627a..dd3390d331 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -165,11 +165,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { assert (!is_device); - free(rowptr_); - free(colidx_); - free(degrees_); - if (vertex_data_ != NULL) free(vertex_data_); - if (edge_data_ != NULL) free(edge_data_); + if (rowptr_ != NULL) delete rowptr_; + if (colidx_ != NULL) delete colidx_; + if (degrees_ != NULL) delete degrees_; + if (vertex_data_ != NULL) delete vertex_data_; + if (edge_data_ != NULL) delete edge_data_; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 86bd0f6340..b5d0e36197 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -162,17 +162,17 @@ void Net::train(optimizer* opt, bool need_validate) { t_epoch.Start(); if (subgraph_sample_size && num_subg_remain == 0) { + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(subgraph_sample_size); #ifdef CPU_ONLY // generate subgraph context->createSubgraph(); auto subgraph_ptr = context->getSubgraphPointer(); sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); - for (size_t i = 0; i < num_conv_layers-1; i++) { + context->norm_factor_computing(1); + for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getSubgraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } - // update masks for subgraph - layers[num_layers - 1]->set_sample_mask(train_begin, train_end, train_count, subgraph_masks); - // update labels for subgraph context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); @@ -180,8 +180,6 @@ void Net::train(optimizer* opt, bool need_validate) { // update features for subgraph context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data - - context->norm_factor_counting(subgraph_sample_size); #endif num_subg_remain += 1; // num_threads } @@ -215,6 +213,17 @@ void Net::train(optimizer* opt, bool need_validate) { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; if (need_validate && ep % val_interval == 0) { + if (subgraph_sample_size) { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); +#ifdef CPU_ONLY + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data +#endif + } // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -247,7 +256,13 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { end = train_end; count = train_count; masks = train_masks; - if (subgraph_sample_size) masks = subgraph_masks; + if (subgraph_sample_size) { + // update masks for subgraph + masks = NULL; + begin = 0; + end = subgraph_sample_size; + count = subgraph_sample_size; + } } else if (type == "val") { begin = val_begin; end = val_end; @@ -270,10 +285,17 @@ double Net::evaluate(std::string type, acc_t& 
loss, acc_t& acc) { #endif loss = fprop(begin, end, count, masks); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + label_t* labels; + if (subgraph_sample_size) { + labels = context->get_labels_subg_ptr(); + } else { + labels = context->get_labels_ptr(); + } if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks); + acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks); + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); } t_eval.Stop(); return t_eval.Millisecs(); @@ -347,8 +369,6 @@ void Net::construct_layers() { // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { - if (subgraph_sample_size) - layers[i]->update_dim_size(subgraph_sample_size); layers[i]->add_edge(); } for (size_t i = 1; i < num_layers; i++) @@ -357,7 +377,9 @@ void Net::construct_layers() { layers[i]->malloc_and_init(); layers[0]->set_in_data(context->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - if (!subgraph_sample_size) context->norm_factor_counting(num_samples); + context->norm_factor_computing(0); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); set_contexts(); } @@ -445,7 +467,7 @@ void Net::read_test_masks(std::string dataset) { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { #ifndef GALOIS_USE_DIST AccumF accuracy_all; #else @@ -458,12 +480,11 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks galois::do_all(galois::iterate(begin, end), [&](const auto& i) { #ifndef GALOIS_USE_DIST - if (masks[i] == 1) { + if (masks == NULL || masks[i] == 1) { // use sampled graph when masks is NULL // get prediction - int preds = math::argmax(num_classes, - &(layers[num_conv_layers - 1]->next()->get_data()[i * num_classes])); + auto pred = math::argmax(num_classes, preds+i*num_classes); // check prediction - if ((label_t)preds == context->get_label(i)) + if ((label_t)pred == ground_truth[i]) accuracy_all += 1.0; } #else @@ -475,10 +496,9 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - int preds = math::argmax(num_classes, - &(layers[num_conv_layers - 1]->next()->get_data()[localID * num_classes])); + auto preds = math::argmax(num_classes, preds+localID*num_classes); // check prediction - if ((label_t)preds == context->get_label(localID)) + if ((label_t)preds == ground_truth[localID]) accuracy_all += 1.0; } } @@ -494,9 +514,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks) { - auto preds = layers[num_conv_layers]->next()->get_data(); - auto ground_truth = context->get_labels_ptr(); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } #endif diff --git a/libdeepgalois/src/sampler.cpp 
b/libdeepgalois/src/sampler.cpp index f1e4238a84..c7d5639330 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -140,7 +140,9 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { unsigned j = 0; auto old_id = old_ids[i]; for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - sub.constructEdge(offsets[i]+j, g.getEdgeDst(e), 0); + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < nv); + sub.constructEdge(offsets[i]+j, dst, 0); j ++; } }, galois::loopname("construct_graph")); From c0083d4cfa3396b4abb5b0da0f05284c6a761def Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 2 May 2020 13:03:52 -0500 Subject: [PATCH 223/660] update sampler --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 14 ++-- libdeepgalois/include/deepgalois/net.h | 9 ++- libdeepgalois/src/context.cpp | 18 +++--- libdeepgalois/src/context.cu | 9 +++ libdeepgalois/src/lgraph.cpp | 2 +- libdeepgalois/src/net.cpp | 64 +++++++++++++------ libdeepgalois/src/sampler.cpp | 16 +++-- lonestargnn/gcn/gcn.cpp | 6 +- 9 files changed, 89 insertions(+), 51 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index d7e368965a..32d95fcc99 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -48,7 +48,7 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(size_t g_size); - void createSubgraph() {} + void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index fc9748d952..0f6d96ec98 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -37,22 +37,22 @@ class Context { void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_computing(bool is_subgraph); + void norm_factor_computing(bool is_subgraph, int subg_id = 0); + void gen_subgraph_labels(size_t m, const mask_t *masks); + void gen_subgraph_feats(size_t m, const mask_t *masks); + void createSubgraphs(int num_subgraphs); #ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N - Graph* subgraph_cpu; - void createSubgraph(); + std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! 
returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer() { return subgraph_cpu; }; + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } - void gen_subgraph_labels(size_t m, const mask_t *masks); - void gen_subgraph_feats(size_t m, const mask_t *masks); #else CSRGraph graph_gpu; // the input graph, |V| = N CSRGraph subgraph_gpu; @@ -86,7 +86,7 @@ class Context { float_t* norm_factors; // normalization constant based on graph structure float_t* norm_factors_subg; // normalization constant for subgraph void alloc_norm_factor(); - void alloc_subgraph_norm_factor(); + void alloc_subgraph_norm_factor(int subg_id); #ifdef CPU_ONLY void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index f87b4e549a..6708e2ce63 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -31,14 +31,14 @@ class Net { public: Net() : is_single_class(true), has_l2norm(false), has_dense(false), neighbor_sample_size(0), subgraph_sample_size(0), - num_samples(0), num_classes(0), + num_threads(1), num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), test_begin(0), test_end(0), test_count(0), train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, unsigned num_conv, unsigned epochs, + void init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sample_size = 0, unsigned subg_sample = 0); @@ -87,6 +87,7 @@ class Net { bool has_dense; // whether the net contains an dense layer unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads size_t num_samples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -99,6 +100,8 @@ class Net { size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; int val_interval; + int num_subgraphs; + int num_vertices_sg; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -106,7 +109,7 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* test_masks; // masks for test mask_t* d_test_masks; // masks for test on device - mask_t* subgraph_masks; // masks for subgraph + mask_t* subgraphs_masks; // masks for subgraphs std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bb0e67c818..bbaa915e0f 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -26,7 +26,7 @@ Context::Context() : n(0), num_classes(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL) {} + norm_factors(NULL), 
norm_factors_subg(NULL) {} Context::~Context() { if (h_labels) delete h_labels; @@ -42,8 +42,10 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::createSubgraph() { - subgraph_cpu = new Graph(); +void Context::createSubgraphs(int num_subgraphs) { + subgraphs_cpu.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) + subgraphs_cpu[i] = new Graph(); } // generate labels for the subgraph, m is subgraph size @@ -150,8 +152,8 @@ void Context::alloc_norm_factor() { #endif } -void Context::alloc_subgraph_norm_factor() { - Graph* g = getSubgraphPointer(); +void Context::alloc_subgraph_norm_factor(int subg_id) { + Graph* g = getSubgraphPointer(subg_id); if (norm_factors_subg == NULL) #ifdef USE_MKL norm_factors_subg = new float_t[g->sizeEdges()]; @@ -160,7 +162,7 @@ void Context::alloc_subgraph_norm_factor() { #endif } -void Context::norm_factor_computing(bool is_subgraph) { +void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t *constants; if (!is_subgraph) { @@ -168,8 +170,8 @@ void Context::norm_factor_computing(bool is_subgraph) { alloc_norm_factor(); constants = norm_factors; } else { - g = getSubgraphPointer(); - alloc_subgraph_norm_factor(); + g = getSubgraphPointer(subg_id); + alloc_subgraph_norm_factor(subg_id); constants = norm_factors_subg; } auto g_size = g->size(); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 0042f5420e..531671c3c2 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -92,6 +92,15 @@ Context::~Context() { if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); } +void Context::createSubgraphs(int n_sg) { +} + +void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { +} + +void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { +} + size_t Context::read_graph(std::string dataset_str, bool selfloop) { n = read_graph_gpu(dataset_str, selfloop); return n; diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index dd3390d331..a2c4a9e4ca 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -40,7 +40,7 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { void LearningGraph::allocateFrom(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; - printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); rowptr_ = new index_t[num_vertices_+1]; colidx_ = new index_t[num_edges_]; rowptr_[0] = 0; diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index b5d0e36197..e9c64a0f7a 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -9,12 +9,13 @@ namespace deepgalois { -void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, +void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz) { - assert(num_conv > 0); - num_conv_layers = num_conv; + assert(n_conv > 0); + num_threads = nt; + num_conv_layers = n_conv; num_epochs = epochs; learning_rate = lr; dropout_rate = dropout; @@ -25,7 +26,9 @@ void Net::init(std::string dataset_str, unsigned num_conv, unsigned epochs, neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; val_interval = 1; - galois::gPrint("Configuration: num_conv_layers ", num_conv_layers, + 
num_subgraphs = num_threads; + galois::gPrint("Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, @@ -149,8 +152,9 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY if (subgraph_sample_size) { + context->createSubgraphs(num_subgraphs); + subgraphs_masks = new mask_t[num_samples*num_subgraphs]; galois::gPrint("\nConstruct training vertex set induced graph...\n"); - subgraph_masks = new mask_t[num_samples]; sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } #endif @@ -161,28 +165,46 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); - if (subgraph_sample_size && num_subg_remain == 0) { - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(subgraph_sample_size); -#ifdef CPU_ONLY - // generate subgraph - context->createSubgraph(); - auto subgraph_ptr = context->getSubgraphPointer(); - sampler->subgraph_sample(subgraph_sample_size, *(subgraph_ptr), subgraph_masks); - context->norm_factor_computing(1); + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + galois::gPrint("Generating subgraphs (mini-batches) ... "); + Timer t_subgen; + t_subgen.Start(); + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); + }//, galois::loopname("subgraph_gen")); + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); + } + for (int i = 0; i < num_subgraphs; i++) { + //auto sg_ptr = context->getSubgraphPointer(i); + //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + } + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraph_ptr = context->getSubgraphPointer(sg_id); + num_vertices_sg = subgraph_ptr->size(); + galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, + ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_vertices_sg); + context->norm_factor_computing(1, sg_id); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getSubgraphPointer()); + layers[i]->set_graph_ptr(subgraph_ptr); layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } // update labels for subgraph - context->gen_subgraph_labels(subgraph_sample_size, subgraph_masks); + context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(subgraph_sample_size, subgraph_masks); + context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data -#endif - num_subg_remain += 1; // num_threads - } + } + // training steps set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -260,8 +282,8 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { // update masks for subgraph masks = NULL; begin = 0; - end = subgraph_sample_size; - count = subgraph_sample_size; + end = 
num_vertices_sg; + count = num_vertices_sg; } } else if (type == "val") { begin = val_begin; diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index c7d5639330..a54cc145d9 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -6,7 +6,7 @@ namespace deepgalois { void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); + //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); begin_ = begin; end_ = end; count_ = count; @@ -40,7 +40,7 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su get_masked_degrees(n, masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[n]; - galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); @@ -64,14 +64,14 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su // n: number of vertices in the subgraph; // m: number of vertices in the frontier. void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { - galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); + //galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); assert(nv == vertices.size()); auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); - galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)g->get_degree(frontier[i]); @@ -93,7 +93,8 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); } - assert(n == vertex_set.size()); + /* + assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); unsigned counter = 0; for (int i : vertex_set) { @@ -102,10 +103,11 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v galois::gPrint(i, " "); } galois::gPrint(" )\n"); + */ } void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + //galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks+n, 0); for (auto v : vertices) masks[v] = 1; } @@ -130,7 +132,7 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { } auto offsets = deepgalois::parallel_prefix_sum(degrees); auto ne = offsets[nv]; - galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); + //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); diff --git 
a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 62a8067294..489553a689 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -25,9 +25,9 @@ int main(int argc, char** argv) { #endif // read network, features, ground truth, initialize metadata - network.init(dataset, num_conv_layers, epochs, hidden1, learning_rate, - dropout_rate, weight_decay, add_selfloop, - is_single_class, add_l2norm, add_dense, + network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz); // default setting for now; can be customized by the user network.construct_layers(); From fdce957d48ff4bcc413654389936aeb1369531af Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 2 May 2020 13:36:19 -0500 Subject: [PATCH 224/660] fix gpu --- libdeepgalois/include/deepgalois/context.h | 4 ++-- libdeepgalois/src/context.cu | 18 ++++++++---------- libdeepgalois/src/layers/graph_conv_layer.cu | 8 ++++---- libdeepgalois/src/net.cpp | 4 +++- libdeepgalois/src/net.cu | 14 +++++--------- libgpu/include/graph_gpu.h | 2 ++ 6 files changed, 24 insertions(+), 26 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 0f6d96ec98..4a0ce506b2 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -55,9 +55,9 @@ class Context { label_t* get_labels_subg_ptr() { return h_labels_subg; } #else CSRGraph graph_gpu; // the input graph, |V| = N - CSRGraph subgraph_gpu; + std::vector subgraphs_gpu; CSRGraph* getGraphPointer() { return &graph_gpu; } - CSRGraph* getSubgraphPointer() { return &subgraph_gpu; }; + CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 531671c3c2..6f42196428 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -68,7 +68,7 @@ Context::Context() : n(0), num_classes(0), feat_len(0), h_feats(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factor(NULL) { + norm_factors(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -89,7 +89,7 @@ Context::~Context() { CURAND_CHECK(curandDestroyGenerator(curand_generator_)); if (d_labels) CUDA_CHECK(cudaFree(d_labels)); if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factor) CUDA_CHECK(cudaFree(norm_factor)); + if (norm_factors) CUDA_CHECK(cudaFree(norm_factors)); } void Context::createSubgraphs(int n_sg) { @@ -106,7 +106,7 @@ size_t Context::read_graph(std::string dataset_str, bool selfloop) { return n; } -void Context::norm_factor_computing(bool is_subgraph) { +void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; @@ -114,14 +114,12 @@ void Context::norm_factor_computing(bool is_subgraph) { } #ifdef USE_CUSPARSE int nnz = graph_gpu.nedges; - CUDA_CHECK(cudaMalloc((void**)&norm_factor, nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, norm_factor); - norm_factor_computing_edge<<>>( - n, graph_gpu, norm_factor); + CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, norm_factors); + norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); #else - CUDA_CHECK(cudaMalloc((void**)&norm_factor, n * sizeof(float_t))); - norm_factor_computing_node<<>>( - n, graph_gpu, norm_factor); + CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); + norm_factor_computing_node<<>>(n, graph_gpu, norm_factors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index ef62725da2..f4282ced42 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -32,17 +32,17 @@ void graph_conv_layer::malloc_and_init() { void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_consts); #endif } void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE - deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_factor); + deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else - deepgalois::update_all(len, g, in, out, norm_, norm_factor); + deepgalois::update_all(len, g, in, out, norm_, norm_consts); #endif } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index e9c64a0f7a..b9fd931746 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -87,8 +87,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); -#ifdef CPU_ONLY context->set_use_subgraph(subgraph_sample_size > 0); +#ifdef CPU_ONLY if (subgraph_sample_size) sampler = new deepgalois::Sampler(); #else copy_masks_device(num_samples, train_masks, d_train_masks); @@ -173,7 +173,9 @@ void Net::train(optimizer* opt, bool need_validate) { // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { +#ifdef CPU_ONLY sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); +#endif }//, galois::loopname("subgraph_gen")); num_subg_remain = num_subgraphs; t_subgen.Stop(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 6ead99d31a..5cc8593647 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -143,18 +143,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, } namespace deepgalois { -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - return masked_accuracy_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers - 1]->next()->get_data(), - 
context->get_labels_ptr()); +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); } acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks) { - return masked_f1_score_gpu(num_classes, begin, end, count, masks, - layers[num_conv_layers]->next()->get_data(), - context->get_labels_ptr()); + mask_t* masks, float_t* preds, label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, ground_truth); } } // end namespace diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index e2057bf7af..f6f9c57643 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -175,6 +175,8 @@ struct CSRGraph { __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } + size_t size() { return size_t(nnodes); } + size_t sizeEdges() { return size_t(nedges); } index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at // 0, row_start[nnodes] = nedges From 61ec2c9a2acf6fdc0af5c1451568b7f89b3b854e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 3 May 2020 13:50:23 -0500 Subject: [PATCH 225/660] fix dist --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/src/DistContext.cpp | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 32d95fcc99..1304b631f8 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -47,7 +47,7 @@ class DistContext { //! 
find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_computing(size_t g_size); + void norm_factor_computing(bool is_subgraph, int subg_id); void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e53dc1c118..3f915ec062 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,22 +151,22 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_computing(size_t g_size) { +void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in - norm_factor = new float_t[localVertices]; + norm_factors = new float_t[localVertices]; galois::do_all(galois::iterate((size_t)0, localVertices), [&](auto v) { - norm_factor[v] = 1; + norm_factors[v] = 1; }, galois::loopname("NormCounting")); //galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); // float_t temp = std::sqrt(float_t(degree)); - // if (temp == 0.0) norm_factor[v] = 0.0; - // else norm_factor[v] = 1.0 / temp; + // if (temp == 0.0) norm_factors[v] = 0.0; + // else norm_factors[v] = 1.0 / temp; // }, galois::loopname("NormCounting")); return; From de8feacd310c9987e6317d8788b06a14fa7b4323 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 3 May 2020 20:28:47 -0500 Subject: [PATCH 226/660] update sampler --- libdeepgalois/include/deepgalois/sampler.h | 15 +- libdeepgalois/src/net.cpp | 20 +- libdeepgalois/src/sampler.cpp | 234 ++++++++++++++++++--- 3 files changed, 232 insertions(+), 37 deletions(-) diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index 15c82ffa12..c92e8471d3 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -2,16 +2,23 @@ #include "deepgalois/gtypes.h" namespace deepgalois { +#define ETA 1.5 // length factor of DB in sampling +#define SAMPLE_CLIP 3000 // clip degree in sampling +#define DEFAULT_SIZE_FRONTIER 3000 +#define DEFAULT_SIZE_SUBG 9000 + class Sampler { public: - Sampler() : m_(1000) {} + typedef int db_t; + Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &sg, mask_t* masks); + void subgraph_sample(size_t n, Graph &sg, mask_t* masks, unsigned tid = 0); // !API function for user-defined selection strategy virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); + virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); @@ -26,7 +33,10 @@ class Sampler { size_t count_; size_t begin_; size_t end_; + int avg_deg; + int subg_deg; VertexList vertices_; + std::vector node_train; mask_t *masks_; Graph *masked_graph; Graph *graph; @@ -37,6 +47,7 @@ class Sampler { void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); void update_masks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); + void check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size); }; } diff --git 
a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index b9fd931746..39909b4d34 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -167,30 +167,32 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint("Generating subgraphs (mini-batches) ... "); + //galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); Timer t_subgen; t_subgen.Start(); // generate subgraphs - for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + //for (int sid = 0; sid < num_subgraphs; sid++) { + galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + unsigned tid = 0; + tid = galois::substrate::ThreadPool::getTID(); #ifdef CPU_ONLY - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples]); + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); #endif - }//, galois::loopname("subgraph_gen")); + }, galois::loopname("subgraph_gen")); num_subg_remain = num_subgraphs; t_subgen.Stop(); - galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); + //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); } for (int i = 0; i < num_subgraphs; i++) { - //auto sg_ptr = context->getSubgraphPointer(i); + auto sg_ptr = context->getSubgraphPointer(i); + sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); num_vertices_sg = subgraph_ptr->size(); - galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, - ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); context->norm_factor_computing(1, sg_id); diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index a54cc145d9..2157d97dc5 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -4,6 +4,11 @@ #include namespace deepgalois { +inline unsigned getDegree(Graph *g, index_t v) { + //return g->get_degree(v); + //return std::distance(g->edge_begin(v), g->edge_end(v)); + return g->edge_end(v) - g->edge_begin(v); +} void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); @@ -15,7 +20,31 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m #ifndef GALOIS_USE_DIST masked_graph = new Graph(); #endif - generate_masked_graph(g->size(), masks, g, *masked_graph); + //generate_masked_graph(g->size(), masks, g, *masked_graph); + std::vector degrees(g->size(), 0); + get_masked_degrees(g->size(), masks, g, degrees); + auto offsets = deepgalois::parallel_prefix_sum(degrees); + size_t ne = offsets[g->size()]; + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) node_train.push_back(i); + } + masked_graph->allocateFrom(g->size(), ne); + masked_graph->constructNodes(); + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + masked_graph->fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx 
= offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) masked_graph->constructEdge(idx++, dst, 0); + } + } + }, galois::loopname("gen_subgraph")); + + masked_graph->degree_counting(); + avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + //galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); size_t idx = 0; vertices_.resize(count); for (size_t i = begin; i < end; i++) { @@ -25,26 +54,29 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + for (size_t src = 0; src < n; src++) { + //galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } - }, galois::loopname("update_degrees")); + }//, galois::loopname("update_degrees")); } void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); - auto offsets = deepgalois::parallel_prefix_sum(degrees); + //auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); - galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { + for (size_t src = 0; src < n; src++) { + //galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -53,11 +85,164 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } - }, galois::loopname("gen_subgraph")); - sub.degree_counting(); + }//, galois::loopname("gen_subgraph")); #endif } +void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity()*2); + DB1.reserve(DB1.capacity()*2); + DB2.reserve(DB2.capacity()*2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); +} + +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter ++; + if (counter > 16 && counter < n-16) continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { + //unsigned myseed = time(NULL); + unsigned myseed = tid; + //DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg*m*ETA); + DB1.reserve(subg_deg*m*ETA); + DB2.reserve(subg_deg*m*ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + //galois::gPrint("seed ", myseed, " m ", m, "\n"); + //galois::gPrint("node_train size: ", node_train.size(), "\n"); + //printf("( "); + 
//for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + //printf(")\n"); + for (int i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % node_train.size(); + db_t v = IA3[i] = node_train[rand_idx]; + st.insert(v); + IA0[i] = getDegree(masked_graph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for each frontier in DB + IA2[0] = IA0[0]; + for (int i = 1; i < m; i++) IA2[i] = IA2[i-1] + IA0[i]; + // now fill DB accordingly + check_DB(DB0, DB1, DB2, IA2[m-1]); + for (int i = 0; i < m; i++) { + db_t DB_start = (i==0) ? 0 : IA2[i-1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (size_t itr = 0; itr < n-m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(masked_graph, v); + neigh_v = (degree!=0) ? rand_r(&myseed)%degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v)+neigh_v); + st.insert(neigh_v); + IA1[DB2[choose]-1] = 0; + IA0[DB2[choose]-1] = 0; + for (auto i = choose; i < choose-DB1[choose]; i++) DB0[i] = db_t(-1); + newsize = getDegree(masked_graph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } + else newsize = 0; + //shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0]=IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) IA4[i] = IA4[i-1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) continue; + db_t DB_start = (i==0) ? 0 : IA4[i-1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i-1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) *i = IA4[*i - 1]; + db_t curr=0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr]=IA0[i]; + IA1[curr]=IA1[i]; + IA2[curr]=IA2[i]; + IA3[curr]=IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + check_DB(DB0, DB1, DB2, newsize+DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j==DB_start) ? 
(j-DB_end) : (j-DB_start); + DB2[j] = IA3.size(); + } + } + //galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + //print_vertex_set(st); +} + // !API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. // nv: number of vertices in the original graph; @@ -73,9 +258,10 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v vertex_set.insert(frontier.begin(), frontier.end()); //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int *degrees = new int[m]; - galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { - degrees[i] = (int)g->get_degree(frontier[i]); - }, galois::loopname("compute_degrees")); + for (int i = 0; i < m; i++) { + //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + degrees[i] = (int)getDegree(g, frontier[i]); + }//, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; @@ -86,7 +272,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = g->get_degree(frontier[pos]); + degrees[pos] = getDegree(g, frontier[pos]); vertex_set.insert(dst); break; } @@ -95,14 +281,8 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList v } /* assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: ( "); - unsigned counter = 0; - for (int i : vertex_set) { - counter ++; - if (counter > 16 && counter < n-16) continue; - galois::gPrint(i, " "); - } - galois::gPrint(" )\n"); + galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: "); + print_vertex_set(vertex_set); */ } @@ -128,16 +308,18 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = g.get_degree(v); + degrees[new_ids[v]] = getDegree(&g, v); } - auto offsets = deepgalois::parallel_prefix_sum(degrees); + //auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); auto ne = offsets[nv]; //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + for (size_t i = 0; i < nv; i++) { + //galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; @@ -147,14 +329,14 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.constructEdge(offsets[i]+j, dst, 0); j ++; } - }, galois::loopname("construct_graph")); - sub.degree_counting(); + }//, galois::loopname("construct_graph")); #endif } -void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks) { +void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks, unsigned tid) { VertexSet vertex_set; // n = 9000 by default - select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 
1000 by default + //select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + select_vertices(n, m_, vertex_set, tid); // m = 1000 by default update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set #ifndef GALOIS_USE_DIST Graph masked_sg; From 1488798572187bcee1137da0191e16e2b216168d Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 14:27:56 -0500 Subject: [PATCH 227/660] minor fix --- libdeepgalois/include/deepgalois/cutils.h | 2 +- libdeepgalois/src/context.cpp | 12 ++++----- .../src/layers/sigmoid_loss_layer.cpp | 8 +++--- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/lgraph.cpp | 27 ++++++++++++------- libdeepgalois/src/net.cpp | 10 +++---- libdeepgalois/src/net.cu | 8 +++--- libdeepgalois/src/sampler.cpp | 3 ++- libgpu/include/graph_gpu.h | 1 + 9 files changed, 42 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 5181408363..383c9d6325 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -177,6 +177,6 @@ inline void print_device_vector(size_t n, const float_t *d_x, std::string name = float_t *h_x = new float_t[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; - delete h_x; + delete[] h_x; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bbaa915e0f..9f2b306371 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -29,12 +29,12 @@ Context::Context() : n(0), num_classes(0), norm_factors(NULL), norm_factors_subg(NULL) {} Context::~Context() { - if (h_labels) delete h_labels; - if (h_labels_subg) delete h_labels_subg; - if (h_feats) delete h_feats; - if (h_feats_subg) delete h_feats_subg; - if (norm_factors) delete norm_factors; - if (norm_factors_subg) delete norm_factors_subg; + if (h_labels) delete[] h_labels; + if (h_labels_subg) delete[] h_labels_subg; + if (h_feats) delete[] h_feats; + if (h_feats_subg) delete[] h_feats_subg; + if (norm_factors) delete[] norm_factors; + if (norm_factors_subg) delete[] norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index ca34389127..10a4f8454a 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -13,7 +13,7 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, } sigmoid_loss_layer::~sigmoid_loss_layer() { - delete loss; + delete[] loss; } void sigmoid_loss_layer::malloc_and_init() { @@ -37,7 +37,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); // loss calculation loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete ground_truth; + delete[] ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); } @@ -55,8 +55,8 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); // derviative sigmoid to gradient used in the next layer math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); - delete norm_grad; - delete ground_truth; + 
delete[] norm_grad; + delete[] ground_truth; } }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); } diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 4ae9c6364b..54e461121f 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -13,7 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, } softmax_loss_layer::~softmax_loss_layer() { - delete loss; + delete[] loss; } void softmax_loss_layer::malloc_and_init() { diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index a2c4a9e4ca..684d9b89e8 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -38,11 +38,20 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { } void LearningGraph::allocateFrom(index_t nv, index_t ne) { - num_vertices_ = nv; - num_edges_ = ne; //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; + if (num_vertices_ != nv) { + if (rowptr_ != NULL) delete [] rowptr_; + if (degrees_ != NULL) delete [] degrees_; + if (vertex_data_ != NULL) delete [] vertex_data_; + num_vertices_ = nv; + } + if (num_edges_ != ne) { + if (colidx_ != NULL) delete [] colidx_; + if (edge_data_ != NULL) delete [] edge_data_; + num_edges_ = ne; + } + if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; + if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; rowptr_[0] = 0; } @@ -165,11 +174,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { assert (!is_device); - if (rowptr_ != NULL) delete rowptr_; - if (colidx_ != NULL) delete colidx_; - if (degrees_ != NULL) delete degrees_; - if (vertex_data_ != NULL) delete vertex_data_; - if (edge_data_ != NULL) delete edge_data_; + if (rowptr_ != NULL) delete [] rowptr_; + if (colidx_ != NULL) delete [] colidx_; + if (degrees_ != NULL) delete [] degrees_; + if (vertex_data_ != NULL) delete [] vertex_data_; + if (edge_data_ != NULL) delete [] edge_data_; } #endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 39909b4d34..c0127a54f2 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -162,23 +162,22 @@ void Net::train(optimizer* opt, bool need_validate) { Timer t_epoch; // run epochs for (unsigned ep = 0; ep < num_epochs; ep++) { - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); t_epoch.Start(); if (subgraph_sample_size) { if (num_subg_remain == 0) { - //galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); + galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... 
"); Timer t_subgen; t_subgen.Start(); // generate subgraphs +#ifdef CPU_ONLY //for (int sid = 0; sid < num_subgraphs; sid++) { galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; tid = galois::substrate::ThreadPool::getTID(); -#ifdef CPU_ONLY sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); -#endif }, galois::loopname("subgraph_gen")); +#endif num_subg_remain = num_subgraphs; t_subgen.Stop(); //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); @@ -210,6 +209,7 @@ void Net::train(optimizer* opt, bool need_validate) { } // training steps + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -313,7 +313,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; - if (subgraph_sample_size) { + if (type == "train" && subgraph_sample_size) { labels = context->get_labels_subg_ptr(); } else { labels = context->get_labels_ptr(); diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 5cc8593647..115ff6d81d 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -135,10 +135,10 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, float_free_device(d_fp); float_free_device(d_fn); float_free_device(d_tn); - delete h_tp; - delete h_fp; - delete h_fn; - delete h_tn; + delete[] h_tp; + delete[] h_fp; + delete[] h_fn; + delete[] h_tn; return f1_micro; } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 2157d97dc5..ba338f5012 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -114,7 +114,8 @@ void print_vertex_set(VertexSet vertex_set) { void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { //unsigned myseed = time(NULL); - unsigned myseed = tid; + unsigned myseed = tid + time(NULL); + //unsigned myseed = tid; //DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; DB0.reserve(subg_deg*m*ETA); diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index f6f9c57643..6815d1304f 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -177,6 +177,7 @@ struct CSRGraph { size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } + void degree_counting() {} index_type nnodes, nedges; index_type* row_start; // row_start[node] points into edge_dst, node starts at // 0, row_start[nnodes] = nedges From e8bda7a4890fe189260aead174daf15fb275fa0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:42:50 -0500 Subject: [PATCH 228/660] distgalois: enable def, move around cmakelist --- CMakeLists.txt | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index a56a1702e9..58b143766d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -337,17 +337,21 @@ add_custom_target(apps) # Core libraries (lib) add_subdirectory(libgalois) -if(USE_DEEPGALOIS) - add_subdirectory(libdeepgalois) - add_subdirectory(lonestargnn) -endif(USE_DEEPGALOIS) - if (ENABLE_DIST_GALOIS) + # currently making use of this in deepgalois to distinguish dist from no dist + # note this has to go before the libdeepgalois subdirectory is added below + 
add_definitions(-DGALOIS_USE_DIST) find_package(MPI REQUIRED) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) endif() + +if(USE_DEEPGALOIS) + add_subdirectory(libdeepgalois) + add_subdirectory(lonestargnn) +endif(USE_DEEPGALOIS) + if (ENABLE_HETERO_GALOIS) enable_language(CUDA) string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) From 15953d19f0b3e74a6e36692e9aa1b79e927f8c6a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:43:23 -0500 Subject: [PATCH 229/660] dist graph index types, exposure of csr arrasy --- .../include/galois/graphs/DistributedGraph.h | 20 +++++++++++++++---- 1 file changed, 16 insertions(+), 4 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 3912e41f15..d35bdbc91c 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -67,13 +67,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = galois::graphs::LC_CSR_Graph; protected: //! The internal graph used by DistGraph to represent the graph @@ -1029,11 +1032,20 @@ class DistGraph { galois::gDebug("Deallocating CSR in DistGraph"); graph.deallocate(); } + + +//////////////////////////////////////////////////////////////////////////////// +// what follows are GNN functions; some are not great (e.g. expose arrays) +// TODO figure out better way to do this +//////////////////////////////////////////////////////////////////////////////// + EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } + NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } }; -template +template constexpr const char* const - galois::graphs::DistGraph::GRNAME; + galois::graphs::DistGraph::GRNAME; } // end namespace graphs } // end namespace galois From 41d53578492c86069f19085eb990eeaac2297f6c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:44:28 -0500 Subject: [PATCH 230/660] sync vars for graphconvlayer defined in cpp only --- libdeepgalois/include/deepgalois/types.h | 7 +++++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 3a579a9c5c..87e7411689 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -44,11 +44,14 @@ enum class net_phase { train, test }; #ifdef GALOIS_USE_DIST namespace deepgalois { + // TODO only being used by graph conv layer at the moment so extern works, + // but this design is bad and needs to be revisited + //! Set this to let sync struct know where to get data from - static float_t* _dataToSync = nullptr; + extern float_t* _dataToSync; //! Set this to let sync struct know the size of the vector to use during //! sync - static long unsigned _syncVectorSize = 0; + extern long unsigned _syncVectorSize; } #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index e50d66f5ae..354db106e9 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -4,6 +4,13 @@ namespace deepgalois { +//! 
Set this to let sync struct know where to get data from +float_t* _dataToSync = nullptr; +//! Set this to let sync struct know the size of the vector to use during +//! sync +long unsigned _syncVectorSize = 0; + + graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, std::vector in_dims, From 0065fe91e9db76950fe0bc8c721f94172b41ac82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:45:21 -0500 Subject: [PATCH 231/660] gnn distgraphloader, include LLVM command line --- lonestargnn/CMakeLists.txt | 4 +--- lonestargnn/include/DistributedGraphLoader.h | 1 + 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 0c313d742c..62fe9b321f 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -19,9 +19,7 @@ endif() if(ENABLE_DIST_GALOIS) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) - target_include_directories(distgraphloader PUBLIC - include - ) + target_include_directories(distgraphloader PUBLIC include) target_link_libraries(distgraphloader galois_cusp) endif() diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestargnn/include/DistributedGraphLoader.h index 247ad0763c..f5a896b3de 100644 --- a/lonestargnn/include/DistributedGraphLoader.h +++ b/lonestargnn/include/DistributedGraphLoader.h @@ -32,6 +32,7 @@ #include "galois/graphs/CuSPPartitioner.h" #include "deepgalois/configs.h" +#include "llvm/Support/CommandLine.h" /******************************************************************************* * Supported partitioning schemes From d95fc7736e2892601053430a38f56da6881779e0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:46:49 -0500 Subject: [PATCH 232/660] gtypes edgeiterator change for distgraph --- libdeepgalois/include/deepgalois/gtypes.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index fe759803e2..697d386d9a 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -29,6 +29,8 @@ typedef galois::graphs::LC_CSR_Graph; #endif From 91ee8dc256a4c75d86223beedd955e958368ce24 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 15:47:14 -0500 Subject: [PATCH 233/660] WIP: disabling sampler for dist build --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/net.h | 4 +++- libdeepgalois/include/deepgalois/sampler.h | 4 ++++ libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/net.cpp | 10 +++++++++- 5 files changed, 18 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 157e0151ad..a22985b3fa 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -72,6 +72,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way +# also don't link sampler set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -84,7 +85,6 @@ set(sources src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp - src/sampler.cpp src/lgraph.cpp src/utils.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 6708e2ce63..ad0b1547f3 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -43,7 +43,7 @@ class Net { bool 
selfloop, bool single, bool l2norm, bool dense, unsigned neigh_sample_size = 0, unsigned subg_sample = 0); #ifdef GALOIS_USE_DIST - void dist_init(Graph* dGraph); + void dist_init(Graph* graph, std::string dataset_str); #endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -120,7 +120,9 @@ class Net { #endif #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST Sampler *sampler; +#endif #endif // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c92e8471d3..eb3b936d18 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -1,3 +1,5 @@ +#ifndef GALOIS_USE_DIST + #pragma once #include "deepgalois/gtypes.h" @@ -51,3 +53,5 @@ class Sampler { }; } + +#endif diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index fc841a6361..430106e16d 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -4,9 +4,9 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - size_t n = g.size(); //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST + size_t n = g.size(); galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { #else auto& rangeObj = g.allNodesRange(); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index c0127a54f2..74ffba528b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -87,6 +87,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); +#ifndef GALOIS_USE_DIST context->set_use_subgraph(subgraph_sample_size > 0); #ifdef CPU_ONLY if (subgraph_sample_size) sampler = new deepgalois::Sampler(); @@ -95,10 +96,11 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs copy_masks_device(num_samples, val_masks, d_val_masks); context->copy_data_to_device(); // copy labels and input features to the device #endif +#endif } #ifdef GALOIS_USE_DIST -void Net::dist_init(Graph* graph) { +void Net::dist_init(Graph* graph, std::string dataset_str) { dGraph = graph; context = new deepgalois::DistContext(); num_samples = dGraph->size(); @@ -151,12 +153,14 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST if (subgraph_sample_size) { context->createSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples*num_subgraphs]; galois::gPrint("\nConstruct training vertex set induced graph...\n"); sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); } +#endif #endif galois::gPrint("\nStart training...\n"); Timer t_epoch; @@ -171,22 +175,26 @@ void Net::train(optimizer* opt, bool need_validate) { t_subgen.Start(); // generate subgraphs #ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST //for (int sid = 0; sid < num_subgraphs; sid++) { galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, 
*(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); }, galois::loopname("subgraph_gen")); +#endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); } +#ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } +#endif GALOIS_USE_DIST num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); From 42ec7dac87a408730d8cb4d34a7eb8566c2666b6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 16:15:01 -0500 Subject: [PATCH 234/660] fix some build errors --- .../include/deepgalois/DistContext.h | 8 ++-- libdeepgalois/include/deepgalois/lgraph.h | 26 +++++++----- libdeepgalois/src/lgraph.cpp | 22 +++++++--- libdeepgalois/src/lgraph.cu | 40 +++++++++---------- libdeepgalois/src/net.cpp | 6 +-- lonestargnn/gcn/gcn.cpp | 2 +- 6 files changed, 60 insertions(+), 44 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 1304b631f8..953010f09a 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -17,7 +17,7 @@ class DistContext { galois::graphs::GluonSubstrate* syncSubstrate; Graph* graph_cpu; // the input graph, |V| = N - Graph* subgraph_cpu; + std::vector subgraphs_cpu; label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D @@ -27,6 +27,7 @@ class DistContext { float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -47,18 +48,19 @@ class DistContext { //! 
find norm factor by looking at degree // TODO this is a distributed operation - void norm_factor_computing(bool is_subgraph, int subg_id); + void norm_factor_computing(bool is_subgraph, int subg_id = 0); void createSubgraphs(int num_subgraphs) {} void gen_subgraph_labels(size_t m, const mask_t *masks) {} void gen_subgraph_feats(size_t m, const mask_t *masks) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer() { return subgraph_cpu; }; + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } + float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 315ec1145a..8d450a1a23 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -6,22 +6,30 @@ namespace deepgalois { class LearningGraph { + typedef std::vector IndexList; + //typedef index_t* IndexList; protected: bool is_device; index_t num_vertices_; index_t num_edges_; - index_t *rowptr_; - index_t *colidx_; - index_t *degrees_; + IndexList rowptr_; + IndexList colidx_; + IndexList degrees_; vdata_t *vertex_data_; edata_t *edge_data_; + + index_t *d_rowptr_; + index_t *d_colidx_; + index_t *d_degrees_; + vdata_t *d_vertex_data_; + edata_t *d_edge_data_; std::vector> mirrorNodes; public: typedef size_t iterator; //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - rowptr_(NULL), colidx_(NULL), degrees_(NULL), + //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -35,17 +43,17 @@ class LearningGraph { index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - index_t* row_start_ptr() { return rowptr_; } - index_t* edge_dst_ptr() { return colidx_; } - index_t* degrees_ptr() { return degrees_; } + index_t* row_start_ptr() { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + index_t* degrees_ptr() { return °rees_[0]; } edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } void progressPrint(unsigned maxii, unsigned ii); void allocOnDevice(bool no_edge_data_); - void copy_to_cpu(LearningGraph ©graph); - void copy_to_gpu(LearningGraph ©graph); + void copy_to_cpu(); + void copy_to_gpu(); void dealloc(); void degree_counting(); void allocateFrom(index_t nv, index_t ne); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 684d9b89e8..a99ce7df36 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -39,6 +39,7 @@ void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { void LearningGraph::allocateFrom(index_t nv, index_t ne) { //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); +/* if (num_vertices_ != nv) { if (rowptr_ != NULL) delete [] rowptr_; if 
(degrees_ != NULL) delete [] degrees_; @@ -52,6 +53,12 @@ void LearningGraph::allocateFrom(index_t nv, index_t ne) { } if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; +*/ + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_+1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); rowptr_[0] = 0; } @@ -69,8 +76,8 @@ void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { } void LearningGraph::degree_counting() { - if (degrees_ != NULL) return; - degrees_ = new index_t[num_vertices_]; + //if (degrees_ != NULL) return; + //degrees_ = new index_t[num_vertices_]; galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { degrees_[v] = rowptr_[v+1] - rowptr_[v]; }, galois::loopname("DegreeCounting")); @@ -125,10 +132,11 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { } printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - degrees_ = new index_t[num_vertices_]; - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - rowptr_[0] = 0; + allocateFrom(nv, ne); + //degrees_ = new index_t[num_vertices_]; + //rowptr_ = new index_t[num_vertices_+1]; + //colidx_ = new index_t[num_edges_]; + //rowptr_[0] = 0; for (unsigned ii = 0; ii < num_vertices_; ++ii) { rowptr_[ii+1] = le64toh(outIdx[ii]); degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; @@ -173,12 +181,14 @@ void LearningGraph::readGraphFromGRFile(const std::string& filename) { #ifdef CPU_ONLY void LearningGraph::dealloc() { +/* assert (!is_device); if (rowptr_ != NULL) delete [] rowptr_; if (colidx_ != NULL) delete [] colidx_; if (degrees_ != NULL) delete [] degrees_; if (vertex_data_ != NULL) delete [] vertex_data_; if (edge_data_ != NULL) delete [] edge_data_; +//*/ } #endif diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 0a925bbbdb..3a379a649e 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -6,40 +6,36 @@ namespace deepgalois { void LearningGraph::dealloc() { assert(is_device); - CUDA_CHECK(cudaFree(colidx_)); - CUDA_CHECK(cudaFree(rowptr_)); - CUDA_CHECK(cudaFree(degrees_)); - if (edge_data_ != NULL) CUDA_CHECK(cudaFree(edge_data_)); - if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(vertex_data_)); + CUDA_CHECK(cudaFree(d_colidx_)); + CUDA_CHECK(cudaFree(d_rowptr_)); + CUDA_CHECK(cudaFree(d_degrees_)); + if (edge_data_ != NULL) CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(d_vertex_data_)); } void LearningGraph::allocOnDevice(bool no_edge_data__) { - if (colidx_ != NULL) return; - CUDA_CHECK(cudaMalloc((void **) &colidx_, num_edges_ * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &rowptr_, (num_vertices_+1) * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) °rees_, num_vertices_ * sizeof(index_t))); + if (d_colidx_ != NULL) return; + CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); + CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; } -void LearningGraph::copy_to_gpu(LearningGraph ©graph) { - copygraph.init(num_vertices_, num_edges_); - copygraph.allocOnDevice(edge_data_ == NULL); - 
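Taken together, the lgraph.cpp and lgraph.cu changes in this patch converge on a common pattern: the host owns the CSR arrays as std::vector members, the device keeps its own raw buffers (the new d_rowptr_/d_colidx_/d_degrees_ members), and an explicit copy moves data between the two. A reduced sketch of that pattern follows; the struct, member and function names are illustrative only, not the library's API, and CUDA_CHECK is the error-checking macro this codebase already defines:

    #include <vector>
    #include <cuda_runtime.h>

    using index_t = unsigned;  // stand-in for the index type in deepgalois/types.h

    struct HostDeviceCSR {
      std::vector<index_t> h_rowptr, h_colidx;  // owned on the host
      index_t* d_rowptr = nullptr;              // device mirrors
      index_t* d_colidx = nullptr;

      void copy_to_gpu() {
        size_t nv = h_rowptr.size() - 1, ne = h_colidx.size();
        if (d_rowptr == nullptr) {
          CUDA_CHECK(cudaMalloc((void**)&d_rowptr, (nv + 1) * sizeof(index_t)));
          CUDA_CHECK(cudaMalloc((void**)&d_colidx, ne * sizeof(index_t)));
        }
        // destination first, host source second, then the direction flag
        CUDA_CHECK(cudaMemcpy(d_rowptr, h_rowptr.data(),
                              (nv + 1) * sizeof(index_t), cudaMemcpyHostToDevice));
        CUDA_CHECK(cudaMemcpy(d_colidx, h_colidx.data(),
                              ne * sizeof(index_t), cudaMemcpyHostToDevice));
      }
    };

One detail worth double-checking in the rewritten copy_to_gpu/copy_to_cpu below: cudaMemcpy takes the destination pointer first, so with cudaMemcpyHostToDevice the host array should be the second argument.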
CUDA_CHECK(cudaMemcpy(copygraph.colidx_, colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.rowptr_, rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_, degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); +void LearningGraph::copy_to_gpu() { + allocOnDevice(edge_data_ == NULL); + CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } -void LearningGraph::copy_to_cpu(LearningGraph ©graph) { - assert(is_device); - assert(copygraph.size() == num_vertices_); - assert(copygraph.sizeEdges() == num_edges_); - CUDA_CHECK(cudaMemcpy(copygraph.edge_dst_ptr(), colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.row_start_ptr(), rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(copygraph.degrees_ptr(), degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); +void LearningGraph::copy_to_cpu() { + CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 74ffba528b..9ade42ff9b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -194,7 +194,7 @@ void Net::train(optimizer* opt, bool need_validate) { sg_ptr->degree_counting(); //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif GALOIS_USE_DIST +#endif //GALOIS_USE_DIST num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); @@ -530,9 +530,9 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks uint32_t localID = dGraph->getLID(i); if (masks[localID] == 1) { // get prediction - auto preds = math::argmax(num_classes, preds+localID*num_classes); + auto pred = math::argmax(num_classes, &preds[localID*num_classes]); // check prediction - if ((label_t)preds == ground_truth[localID]) + if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; } } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 489553a689..9cbb0eb77f 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -21,7 +21,7 @@ int main(int argc, char** argv) { #ifdef GALOIS_USE_DIST std::vector dummyVec; deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph); + 
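The masked_accuracy change in net.cpp above is more than a cosmetic rename. In C++ a local name is in scope from its point of declaration onward, so in the removed line

    auto preds = math::argmax(num_classes, preds + localID * num_classes);

the preds inside the initializer already refers to the brand-new local (whose auto type has not been deduced yet) rather than to the float_t* parameter, and the statement cannot compile. Giving the local a distinct name, as the added lines do, restores the intended lookup:

    auto pred = math::argmax(num_classes, &preds[localID * num_classes]);
    if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0;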
network.dist_init(dGraph, dataset); #endif // read network, features, ground truth, initialize metadata From 0cb1cf05ce1c0468d0d45e5b059295256eeb17cb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 4 May 2020 16:26:22 -0500 Subject: [PATCH 235/660] linking llvm gnn graph loader, llvm partition cl --- lonestargnn/CMakeLists.txt | 2 +- lonestargnn/src/DistributedGraphLoader.cpp | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/lonestargnn/CMakeLists.txt b/lonestargnn/CMakeLists.txt index 62fe9b321f..0f7ef10320 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestargnn/CMakeLists.txt @@ -20,7 +20,7 @@ endif() if(ENABLE_DIST_GALOIS) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) target_include_directories(distgraphloader PUBLIC include) - target_link_libraries(distgraphloader galois_cusp) + target_link_libraries(distgraphloader galois_cusp LLVMSupport) endif() add_subdirectory(gcn) diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestargnn/src/DistributedGraphLoader.cpp index dbdf24ab90..7c309dedc2 100644 --- a/lonestargnn/src/DistributedGraphLoader.cpp +++ b/lonestargnn/src/DistributedGraphLoader.cpp @@ -44,6 +44,5 @@ cll::opt partitionScheme( clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), - clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP"), - clEnumValEnd), + clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); From 09402d1724c7438f0b1b95c3e3df5cd9a131c4ea Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 4 May 2020 20:46:16 -0500 Subject: [PATCH 236/660] fix a bug --- libdeepgalois/include/deepgalois/context.h | 15 ++++++---- libdeepgalois/src/context.cpp | 33 +++++++++++++--------- libdeepgalois/src/net.cpp | 2 +- 3 files changed, 29 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 4a0ce506b2..5683c26f12 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -32,7 +32,7 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } - float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } + float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } @@ -50,9 +50,9 @@ class Context { Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg; } + float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg; } + label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else CSRGraph graph_gpu; // the input graph, |V| = N std::vector subgraphs_gpu; @@ -76,15 +76,18 @@ class Context { bool is_selfloop_added; // whether selfloop is added to the input graph bool use_subgraph; // whether to use subgraph label_t *h_labels; // labels for classification. 
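The accessor changes above pair with the member declarations a few lines further down: the per-subgraph label, feature and normalization buffers become std::vector members, and the pointer-returning getters keep the layer code unchanged by handing out the vector's storage. A reduced sketch of the idiom, with names shortened from context.h and context.cpp:

    std::vector<label_t> h_labels_subg;            // sized per subgraph
    label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; }

    void gen_subgraph_labels(size_t m) {
      h_labels_subg.resize(m);  // reuses the old allocation when m is not larger
      // ... then copy in the masked labels, as Context::gen_subgraph_labels does
    }

Because the vectors own their storage, the manual new/delete pairs and the if (ptr == NULL) guards that this patch comments out are no longer needed.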
Single-class label: Nx1, multi-class label: NxE - label_t *h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph + //label_t *h_labels_subg; // labels for subgraph + //float_t* h_feats_subg; // input features for subgraph label_t* d_labels; // labels on device label_t *d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + //float_t* norm_factors_subg; // normalization constant for subgraph void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 9f2b306371..a4b0c27be2 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -22,19 +22,20 @@ Context& Context::Get() { Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_labels_subg(NULL), - h_feats(NULL), h_feats_subg(NULL), + h_labels(NULL), h_feats(NULL), + //h_labels_subg(NULL), h_feats_subg(NULL), d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL), norm_factors_subg(NULL) {} + norm_factors(NULL) {} + //norm_factors_subg(NULL) {} Context::~Context() { if (h_labels) delete[] h_labels; - if (h_labels_subg) delete[] h_labels_subg; + //if (h_labels_subg) delete[] h_labels_subg; if (h_feats) delete[] h_feats; - if (h_feats_subg) delete[] h_feats_subg; + //if (h_feats_subg) delete[] h_feats_subg; if (norm_factors) delete[] norm_factors; - if (norm_factors_subg) delete[] norm_factors_subg; + //if (norm_factors_subg) delete[] norm_factors_subg; } size_t Context::read_graph(std::string dataset_str, bool selfloop) { @@ -50,14 +51,15 @@ void Context::createSubgraphs(int num_subgraphs) { // generate labels for the subgraph, m is subgraph size void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { - if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + h_labels_subg.resize(m); size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { if (is_single_class) { h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, h_labels_subg+count*num_classes); + std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, &h_labels_subg[count*num_classes]); } count ++; } @@ -67,10 +69,11 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { // generate input features for the subgraph, m is subgraph size void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { size_t count = 0; - if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + //if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + h_feats_subg.resize(m*feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, h_feats_subg+count*feat_len); + std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, &h_feats_subg[count*feat_len]); count ++; } } @@ -154,11 +157,13 @@ void Context::alloc_norm_factor() 
{ void Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - if (norm_factors_subg == NULL) + //if (norm_factors_subg == NULL) #ifdef USE_MKL - norm_factors_subg = new float_t[g->sizeEdges()]; + //norm_factors_subg = new float_t[g->sizeEdges()]; + norm_factors_subg.resize(g->sizeEdges()); #else - norm_factors_subg = new float_t[g->size()]; + norm_factors_subg.resize(g->size()); + //norm_factors_subg = new float_t[g->size()]; #endif } @@ -172,7 +177,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { } else { g = getSubgraphPointer(subg_id); alloc_subgraph_norm_factor(subg_id); - constants = norm_factors_subg; + constants = get_norm_factors_subg_ptr(); } auto g_size = g->size(); g->degree_counting(); diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9ade42ff9b..9ee41b3302 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -170,7 +170,7 @@ void Net::train(optimizer* opt, bool need_validate) { if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint("Generating ", num_subgraphs, " subgraphs (mini-batches) ... "); + galois::gPrint("Generating ", num_subgraphs, " subgraphs "); Timer t_subgen; t_subgen.Start(); // generate subgraphs From b1b77c025c0e0ca02d49849789297015e6abd802 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 09:56:04 -0500 Subject: [PATCH 237/660] update sampler --- libdeepgalois/include/deepgalois/net.h | 10 ++++--- libdeepgalois/src/context.cpp | 6 +++- libdeepgalois/src/net.cpp | 41 +++++++++++++------------- libdeepgalois/src/sampler.cpp | 31 +++++++++++++++---- lonestargnn/gcn/gcn.cpp | 2 +- lonestargnn/include/lonestargnn.h | 4 +-- 6 files changed, 59 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index ad0b1547f3..9c794a9063 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -37,11 +37,13 @@ class Net { train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), val_count(0), test_begin(0), test_end(0), test_count(0), - train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, + val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + train_masks(NULL), val_masks(NULL), + test_masks(NULL), context(NULL) {} + void init(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_size = 0, unsigned subg_sample = 0); + unsigned neigh_sample_sz, unsigned subg_sample_sz, int val_itv); #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif @@ -92,7 +94,7 @@ class Net { size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) - unsigned num_epochs; // number of epochs + int num_epochs; // number of epochs float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index a4b0c27be2..caec001182 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -52,7 +52,11 @@ void Context::createSubgraphs(int num_subgraphs) { // generate labels for the subgraph, m is subgraph size void 
Context::gen_subgraph_labels(size_t m, const mask_t *masks) { //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - h_labels_subg.resize(m); + if (is_single_class) { + h_labels_subg.resize(m); + } else { + h_labels_subg.resize(m*num_classes); + } size_t count = 0; for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 9ee41b3302..d62ac752b1 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -9,10 +9,10 @@ namespace deepgalois { -void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs, +void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz) { + unsigned neigh_sz, unsigned subg_sz, int val_itv) { assert(n_conv > 0); num_threads = nt; num_conv_layers = n_conv; @@ -25,8 +25,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, unsigned epochs has_dense = dense; neighbor_sample_size = neigh_sz; subgraph_sample_size = subg_sz; - val_interval = 1; - num_subgraphs = num_threads; + val_interval = val_itv; + //num_subgraphs = 1;//num_threads; galois::gPrint("Configuration: num_threads ", num_threads, ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, @@ -165,7 +165,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint("\nStart training...\n"); Timer t_epoch; // run epochs - for (unsigned ep = 0; ep < num_epochs; ep++) { + for (int ep = 0; ep < num_epochs; ep++) { t_epoch.Start(); if (subgraph_sample_size) { @@ -176,12 +176,12 @@ void Net::train(optimizer* opt, bool need_validate) { // generate subgraphs #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST - //for (int sid = 0; sid < num_subgraphs; sid++) { - galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; - tid = galois::substrate::ThreadPool::getTID(); + //tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }, galois::loopname("subgraph_gen")); + }//, galois::loopname("subgraph_gen")); #endif #endif num_subg_remain = num_subgraphs; @@ -247,17 +247,6 @@ void Net::train(optimizer* opt, bool need_validate) { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; if (need_validate && ep % val_interval == 0) { - if (subgraph_sample_size) { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); -#ifdef CPU_ONLY - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data -#endif - } // Validation acc_t val_loss = 0.0, val_acc = 0.0; Tval.start(); @@ -308,7 +297,17 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { count = test_count; masks = test_masks; } -#ifndef CPU_ONLY +#ifdef CPU_ONLY + if (subgraph_sample_size && type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + for (size_t i 
= 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + } +#else if (type == "train") { masks = d_train_masks; } else if (type == "val") { diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index ba338f5012..47317bdd3d 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -2,6 +2,7 @@ #include "deepgalois/sampler.h" #include #include +#define PARALLEL_GEN namespace deepgalois { inline unsigned getDegree(Graph *g, index_t v) { @@ -54,15 +55,21 @@ void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *m void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#else for (size_t src = 0; src < n; src++) { - //galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#endif if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) degrees[src] ++; } } - }//, galois::loopname("update_degrees")); + } +#ifdef PARALLEL_GEN + , galois::loopname("update_degrees")); +#endif } void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { @@ -75,8 +82,11 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { +#else for (size_t src = 0; src < n; src++) { - //galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { +#endif sub.fixEndEdge(src, offsets[src+1]); if (masks[src] == 1) { auto idx = offsets[src]; @@ -85,7 +95,10 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& su if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); } } - }//, galois::loopname("gen_subgraph")); + } +#ifdef PARALLEL_GEN + , galois::loopname("gen_subgraph")); +#endif #endif } @@ -319,8 +332,11 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { +#else for (size_t i = 0; i < nv; i++) { - //galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { +#endif sub.fixEndEdge(i, offsets[i+1]); unsigned j = 0; auto old_id = old_ids[i]; @@ -330,7 +346,10 @@ void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { sub.constructEdge(offsets[i]+j, dst, 0); j ++; } - }//, galois::loopname("construct_graph")); + } +#ifdef PARALLEL_GEN + , galois::loopname("construct_graph")); +#endif #endif } diff --git a/lonestargnn/gcn/gcn.cpp b/lonestargnn/gcn/gcn.cpp index 9cbb0eb77f..de3f2a76ee 100644 --- a/lonestargnn/gcn/gcn.cpp +++ b/lonestargnn/gcn/gcn.cpp @@ -28,7 +28,7 @@ int main(int argc, char** argv) { network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz); + neighbor_sample_sz, subgraph_sample_sz, val_interval); 
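A note on the PARALLEL_GEN switch introduced in sampler.cpp above: the same loop body is shared between a galois::do_all and a plain serial for loop, with the preprocessor choosing the loop header and the trailing loopname argument. Reduced to a self-contained sketch (process_vertex stands in for the real per-vertex work):

    void process_vertex(size_t v);     // stand-in for the loop body's work

    void for_each_vertex(size_t n) {
    #ifdef PARALLEL_GEN
      galois::do_all(galois::iterate(size_t(0), n), [&](const auto v) {
    #else
      for (size_t v = 0; v < n; v++) {
    #endif
        process_vertex(v);             // shared body
      }
    #ifdef PARALLEL_GEN
      , galois::loopname("for_each_vertex"));
    #endif
    }

One closing brace does double duty: it ends the lambda in the parallel build and the for statement in the serial one, which is why the two #ifdef blocks must stay exactly balanced.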
// default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); diff --git a/lonestargnn/include/lonestargnn.h b/lonestargnn/include/lonestargnn.h index 77a2777d5f..a72668daab 100644 --- a/lonestargnn/include/lonestargnn.h +++ b/lonestargnn/include/lonestargnn.h @@ -20,7 +20,7 @@ static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); // 'cora', 'citeseer', 'pubmed' //static cll::opt model("m", // cll::desc("Model string"), cll::init("gcn")); // 'gcn', 'gcn_cheby', 'dense' -static cll::opt epochs("k", +static cll::opt epochs("k", cll::desc("number of epoch, i.e. iterations (default value 1)"), cll::init(1)); static cll::opt num_conv_layers("nc", cll::desc("number of convolutional layers, (default value 2)"), cll::init(2)); @@ -41,7 +41,7 @@ static cll::opt do_test("dt", cll::desc("enable test"), cll::init(1)); static cll::opt add_selfloop("sl", cll::desc("add selfloop"), cll::init(0)); static cll::opt add_l2norm("l2", cll::desc("add an l2_norm layer"), cll::init(0)); static cll::opt add_dense("d", cll::desc("add an dense layer"), cll::init(0)); -static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); +static cll::opt val_interval("vi", cll::desc("validation interval (default value 1)"), cll::init(1)); static cll::opt neighbor_sample_sz("ns", cll::desc("neighbor sampling size (default value 0)"), cll::init(0)); static cll::opt subgraph_sample_sz("ss", cll::desc("subgraph sampling size (default value 0)"), cll::init(0)); From 766b5ee78614ffe0a9cd85ea7c6289a286d1a7db Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 12:53:41 -0500 Subject: [PATCH 238/660] use mkl19 --- CMakeLists.txt | 2 +- .../include/deepgalois/layers/aggregator.h | 4 +- .../include/deepgalois/math_functions.hh | 3 +- libdeepgalois/src/context.cu | 7 ++-- libdeepgalois/src/layers/aggregator.cpp | 8 ++-- .../src/layers/sigmoid_loss_layer.cpp | 6 +-- libdeepgalois/src/math_functions.cpp | 41 ++++++++++++++----- libdeepgalois/src/utils.cpp | 2 +- 8 files changed, 44 insertions(+), 29 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58b143766d..d0fa1a80c1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,7 @@ if(USE_VTUNE) endif() if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/17.0) + SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) find_package(MKL) diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index ffdd3935a8..1b2d4b5104 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -6,9 +6,9 @@ #include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor); + bool norm, float_t* norm_factor); void update_all_csrmm(size_t len, Graph& g, const float_t* in, - float_t* out, bool norm, const float_t* norm_factor); + float_t* out, bool norm, float_t* norm_factor); } #else #include "graph_gpu.h" diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index fc9e798633..6e7ac10fe2 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -29,8 +29,7 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, // 
single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, + const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nonzero_idx, const float* B, const float beta, float* C); // matrix-vector multiply diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 6f42196428..99f14ce11a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -64,10 +64,9 @@ curandGenerator_t Context::curand_generator_ = 0; Context::Context() : n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_labels_subg(NULL), - h_feats(NULL), h_feats_subg(NULL), - d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), + h_labels(NULL), h_feats(NULL), + d_labels(NULL), d_feats(NULL), + d_labels_subg(NULL), d_feats_subg(NULL), norm_factors(NULL) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 430106e16d..b298107f4e 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -3,7 +3,7 @@ #ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + bool norm, float_t* norm_factor) { //std::cout << "[update_all] graph size: " << n << "\n"; #ifndef GALOIS_USE_DIST size_t n = g.size(); @@ -42,14 +42,12 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou } void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + bool norm, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); - //galois::gPrint("csrmm mkl\n"); Tcsrmm.start(); unsigned n = g.size(); math::clear_cpu(n*len, out); - math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, - (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); Tcsrmm.stop(); } #endif diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 10a4f8454a..2288a8da26 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -28,7 +28,7 @@ inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked size_t idx = len * i; // output is normalized input for this layer math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid @@ -46,7 +46,7 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* float_t* out_grad, float_t* in_grad) { size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i] == 1) { // masked + if (!use_mask || masks_[i] == 1) { // masked size_t idx = len * i; float_t *norm_grad = new float_t[len]; float_t *ground_truth = new float_t[len]; @@ -68,7 +68,7 @@ 
acc_t sigmoid_loss_layer::get_prediction_loss() { total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (masks_[i]) { + if (!use_mask || masks_[i]) { total_loss += loss[i]; valid_sample_count += 1; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index ec43be8656..4dcfb941ac 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -57,23 +57,42 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, - beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); Tmatmul.stop(); } void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, + const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { #ifdef USE_MKL - mkl_set_num_threads(56); - const char *matdescra = "GXXCX";//6 bytes - const char transa = 'N'; - //printf("Calling Intel MKL\n"); exit(1); - mkl_scsrmm(&transa, &M , &N, &K, &alpha , matdescra, - A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, - B, &N, &beta , C, &N); + //mkl_set_num_threads(56); + //const char *matdescra = "GXXCX";//6 bytes + //const char transa = 'N'; + //mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + sparse_status_t status; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; + sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; + sparse_layout_t layout = (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + sparse_operation_t transa = (need_trans ? 
SPARSE_OPERATION_TRANSPOSE : SPARSE_OPERATION_NON_TRANSPOSE); + struct matrix_descr descrA; + descrA.type = SPARSE_MATRIX_TYPE_GENERAL; + //descrA.mode = SPARSE_FILL_MODE_UPPER; + //descrA.diag = SPARSE_DIAG_NON_UNIT; + //mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + //mkl_sparse_optimize(csrA); + status = mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + if (status != SPARSE_STATUS_SUCCESS) { + std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; + exit(1); + } + mkl_sparse_destroy(csrA); #else NOT_IMPLEMENTED; #endif diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index dedb9c225a..882154f5c0 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -59,7 +59,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); for (size_t row = begin; row < end; row ++) { //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { - if (masks[row] == 1) { + if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { //__sync_fetch_and_add(&tp_cls, 1); From d61963d836b22b5bb96143d1e813337870dfa203 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 16:03:20 -0500 Subject: [PATCH 239/660] lonestargnn now subdirectory in lonestar --- CMakeLists.txt | 1 - lonestar/CMakeLists.txt | 4 ++++ {lonestargnn => lonestar/gnn}/CMakeLists.txt | 2 +- {lonestargnn => lonestar/gnn}/gcn/CMakeLists.txt | 0 {lonestargnn => lonestar/gnn/gcn}/README.md | 0 {lonestargnn => lonestar/gnn}/gcn/gcn.cpp | 0 {lonestargnn => lonestar/gnn}/gin/CMakeLists.txt | 0 {lonestargnn => lonestar/gnn}/gin/gin.cpp | 0 {lonestargnn => lonestar/gnn}/graphsage/gs-mean.cpp | 0 .../gnn}/include/DistributedGraphLoader.h | 0 {lonestargnn => lonestar/gnn}/include/lonestargnn.h | 0 {lonestargnn => lonestar/gnn}/run-citeseer.sh | 0 {lonestargnn => lonestar/gnn}/src/DistributedGraphLoader.cpp | 0 13 files changed, 5 insertions(+), 2 deletions(-) rename {lonestargnn => lonestar/gnn}/CMakeLists.txt (92%) rename {lonestargnn => lonestar/gnn}/gcn/CMakeLists.txt (100%) rename {lonestargnn => lonestar/gnn/gcn}/README.md (100%) rename {lonestargnn => lonestar/gnn}/gcn/gcn.cpp (100%) rename {lonestargnn => lonestar/gnn}/gin/CMakeLists.txt (100%) rename {lonestargnn => lonestar/gnn}/gin/gin.cpp (100%) rename {lonestargnn => lonestar/gnn}/graphsage/gs-mean.cpp (100%) rename {lonestargnn => lonestar/gnn}/include/DistributedGraphLoader.h (100%) rename {lonestargnn => lonestar/gnn}/include/lonestargnn.h (100%) rename {lonestargnn => lonestar/gnn}/run-citeseer.sh (100%) rename {lonestargnn => lonestar/gnn}/src/DistributedGraphLoader.cpp (100%) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8f2a43aa02..b8c5e98cf6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -297,7 +297,6 @@ endif() if(USE_DEEPGALOIS) add_subdirectory(libdeepgalois) - add_subdirectory(lonestargnn) endif(USE_DEEPGALOIS) if (ENABLE_HETERO_GALOIS) diff --git a/lonestar/CMakeLists.txt b/lonestar/CMakeLists.txt index fbb645e7cf..58e911aa0d 100644 --- a/lonestar/CMakeLists.txt +++ b/lonestar/CMakeLists.txt @@ -193,3 +193,7 @@ add_subdirectory(analytics) add_subdirectory(eda) add_subdirectory(mining) add_subdirectory(scientific) + +if(USE_DEEPGALOIS) + add_subdirectory(gnn) +endif(USE_DEEPGALOIS) diff --git a/lonestargnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt similarity index 92% rename from lonestargnn/CMakeLists.txt rename to 
lonestar/gnn/CMakeLists.txt index 0f7ef10320..d0551bdadc 100644 --- a/lonestargnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -1,4 +1,4 @@ -include_directories(${CMAKE_SOURCE_DIR}/lonestargnn/include) +include_directories(${CMAKE_SOURCE_DIR}/lonestar/gnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) link_directories(${CUDA_HOME}/lib64) diff --git a/lonestargnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt similarity index 100% rename from lonestargnn/gcn/CMakeLists.txt rename to lonestar/gnn/gcn/CMakeLists.txt diff --git a/lonestargnn/README.md b/lonestar/gnn/gcn/README.md similarity index 100% rename from lonestargnn/README.md rename to lonestar/gnn/gcn/README.md diff --git a/lonestargnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp similarity index 100% rename from lonestargnn/gcn/gcn.cpp rename to lonestar/gnn/gcn/gcn.cpp diff --git a/lonestargnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt similarity index 100% rename from lonestargnn/gin/CMakeLists.txt rename to lonestar/gnn/gin/CMakeLists.txt diff --git a/lonestargnn/gin/gin.cpp b/lonestar/gnn/gin/gin.cpp similarity index 100% rename from lonestargnn/gin/gin.cpp rename to lonestar/gnn/gin/gin.cpp diff --git a/lonestargnn/graphsage/gs-mean.cpp b/lonestar/gnn/graphsage/gs-mean.cpp similarity index 100% rename from lonestargnn/graphsage/gs-mean.cpp rename to lonestar/gnn/graphsage/gs-mean.cpp diff --git a/lonestargnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h similarity index 100% rename from lonestargnn/include/DistributedGraphLoader.h rename to lonestar/gnn/include/DistributedGraphLoader.h diff --git a/lonestargnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h similarity index 100% rename from lonestargnn/include/lonestargnn.h rename to lonestar/gnn/include/lonestargnn.h diff --git a/lonestargnn/run-citeseer.sh b/lonestar/gnn/run-citeseer.sh similarity index 100% rename from lonestargnn/run-citeseer.sh rename to lonestar/gnn/run-citeseer.sh diff --git a/lonestargnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp similarity index 100% rename from lonestargnn/src/DistributedGraphLoader.cpp rename to lonestar/gnn/src/DistributedGraphLoader.cpp From 8a851dc85fb693f1969ad43d418fa745a836fd63 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 16:14:56 -0500 Subject: [PATCH 240/660] readdded libpangolin that was accidentally deleted in merge --- CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index b8c5e98cf6..3fb831effd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -309,6 +309,7 @@ if (ENABLE_HETERO_GALOIS) add_subdirectory(libgpu) endif() +add_subdirectory(libpangolin) # Applications (apps) add_subdirectory(lonestar) From 3fa21e02077117ffd16e1b6a27eb578efbfc56d1 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 17:55:16 -0500 Subject: [PATCH 241/660] merge fix --- libdeepgalois/CMakeLists.txt | 9 ++ .../deepgalois/layers/graph_conv_layer.h | 12 ++- .../include/deepgalois/layers/layer.h | 9 +- .../include/deepgalois/layers/node.h | 7 +- libdeepgalois/include/deepgalois/lgraph.h | 33 +++++-- libdeepgalois/include/deepgalois/optimizer.h | 30 +++---- libdeepgalois/include/deepgalois/types.h | 1 + libdeepgalois/src/layers/aggregator.cpp | 1 + libdeepgalois/src/layers/graph_conv_layer.cpp | 16 +--- libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + libdeepgalois/src/layers/leaky_relu_layer.cpp | 1 + 
libdeepgalois/src/layers/relu_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cpp | 1 + .../src/layers/softmax_loss_layer.cpp | 10 +-- libdeepgalois/src/lgraph.cpp | 39 ++++++++- libdeepgalois/src/math_functions.cpp | 3 + libdeepgalois/src/optimizer.cpp | 85 +++++++++---------- libdeepgalois/src/utils.cpp | 1 + 18 files changed, 161 insertions(+), 99 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index a22985b3fa..3f5bc11a95 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -91,6 +91,14 @@ set(sources src/net.cpp ) else() +if(ENABLE_DIST_GALOIS) +set(sources + src/context.cpp + src/lgraph.cpp + src/node.cpp + src/net.cpp +) +else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -110,6 +118,7 @@ set(sources src/net.cpp ) endif() +endif() #set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 6cc40c266d..92bc999653 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -28,14 +28,21 @@ class graph_conv_layer : public layer { public: graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, float_t dropout_rate, - std::vector in_dims, std::vector out_dims); + std::vector in_dims, std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = true; + name_ = layer_type() + "_" + std::to_string(level); + assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); + scale_ = 1. / (1. - dropout_rate_); + } graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} ~graph_conv_layer() {} void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } - void set_netphase(net_phase ctx) override { phase_ = ctx; } virtual acc_t get_weight_decay_loss(); //! Uses weights contained in this layer to update in_data (results from previous) //! 
and save result to out_data @@ -62,7 +69,6 @@ class graph_conv_layer : public layer { bool dropout_; // whether to use dropout at first const float_t dropout_rate_; float_t scale_; - net_phase phase_; float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 0ffab6de41..206e5e7da3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -48,7 +48,7 @@ class layer : public deepgalois::node { layer(unsigned level, std::vector in_dims, std::vector out_dims) - : node(in_dims.size(), out_dims.size()), level_(level), begin_(0), + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), input_dims(in_dims), output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; @@ -68,7 +68,7 @@ class layer : public deepgalois::node { float_t* get_grads_device_ptr() { return d_weight_grad; } // set methods - virtual void set_netphase(net_phase phase) {} + void set_netphase(net_phase ctx) { phase_ = ctx; } void set_context(ContextType* ctx) { context = ctx; } void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? void set_labels_ptr(label_t *ptr) { labels = ptr; } @@ -137,8 +137,8 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. - bool parallel = (W.size() >= 512); - opt->update(layer::weight_grad, layer::W, parallel); // W += grad + //bool parallel = (W.size() >= 512); + opt->update(layer::weight_grad, layer::W); // W += grad #else opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad #endif @@ -152,6 +152,7 @@ class layer : public deepgalois::node { size_t end_; // sample end index size_t count_; // number of samples size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test std::vector input_dims; // input dimensions std::vector output_dims; // output dimentions std::string name_; // name of this layer diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index 9b43167656..ec7c319d87 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -26,14 +26,15 @@ typedef std::shared_ptr edgeptr_t; // edge class node : public std::enable_shared_from_this { public: - node(size_t in_size, size_t out_size) { - } //: prev_(in_size), next_(out_size) {} + node() { prev_= NULL; next_ = NULL; } + //node(size_t in_size, size_t out_size) { + //} //: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } const edgeptr_t next() const { return next_; } protected: - node() = delete; + //node() = delete; friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 8d450a1a23..f8e5ce8315 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -43,11 +43,7 @@ class LearningGraph { index_t get_degree(index_t vid) { return degrees_[vid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - index_t* row_start_ptr() { return &rowptr_[0]; } - index_t* edge_dst_ptr() { return &colidx_[0]; } - index_t* 
degrees_ptr() { return &degrees_[0]; } - edata_t* edge_data_ptr() { return edge_data_; } - vdata_t* vertex_data_ptr() { return vertex_data_; } + iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } void progressPrint(unsigned maxii, unsigned ii); @@ -60,6 +56,7 @@ class LearningGraph { void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); + void add_selfloop(); bool isLocal(index_t vid); index_t getLID(index_t vid); @@ -67,6 +64,32 @@ class LearningGraph { std::vector>& getMirrorNodes(); uint64_t numMasters(); uint64_t globalSize(); + +#ifdef CPU_ONLY + index_t* row_start_ptr() { return &rowptr_[0]; } + const index_t* row_start_ptr() const { return &rowptr_[0]; } + index_t* edge_dst_ptr() { return &colidx_[0]; } + const index_t* edge_dst_ptr() const { return &colidx_[0]; } + index_t* degrees_ptr() { return &degrees_[0]; } + edata_t* edge_data_ptr() { return edge_data_; } + vdata_t* vertex_data_ptr() { return vertex_data_; } +#else + __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } + __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } + __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } + __device__ vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } + index_t *row_start_ptr() { return d_rowptr_; } + const index_t *row_start_ptr() const { return d_rowptr_; } + index_t *edge_dst_ptr() { return d_colidx_; } + const index_t *edge_dst_ptr() const { return d_colidx_; } + index_t* degrees_ptr() { return d_degrees_; } + edata_t *edge_data_ptr() { return d_edge_data_; } + vdata_t *vertex_data_ptr() { return d_vertex_data_; } + //const vdata_t *vertex_data_ptr() const { return vertex_data_; } + //const edata_t *edge_data_ptr() const { return edge_data; } +#endif + }; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index b745f12cb6..c9db614814 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -29,7 +29,7 @@ struct optimizer { optimizer& operator=(const optimizer&) = default; optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; - virtual void update(const vec_t& dW, vec_t& W, bool parallelize) = 0; + virtual void update(const vec_t& dW, vec_t& W) = 0; virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; virtual void reset() {} // override to implement pre-learning action }; @@ -65,8 +65,8 @@ struct stateful_optimizer : public optimizer { **/ struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate private: float_t eps; @@ -80,8 +80,8 @@ struct adagrad : public stateful_optimizer<1> { **/ struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate
float_t mu; // decay term private: @@ -94,9 +94,9 @@ struct adam : public stateful_optimizer<2> { adam() : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); + void update(const vec_t& dW, vec_t& W); #ifdef CPU_ONLY - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update_gpu(const size_t n, const float_t* dW, float_t* W); #else void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -121,8 +121,8 @@ struct adamax : public stateful_optimizer<2> { adamax() : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t b1; // decay term @@ -137,8 +137,8 @@ struct adamax : public stateful_optimizer<2> { // slightly faster than tiny_dnn::momentum struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -153,8 +153,8 @@ struct gradient_descent : public optimizer { struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay @@ -172,8 +172,8 @@ struct nesterov_momentum : public stateful_optimizer<1> { public: nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} - void update(const vec_t& dW, vec_t& W, bool parallelize); - void update_gpu(const size_t n, const float_t* dW, float_t* W) {} + void update(const vec_t& dW, vec_t& W); + void update_gpu(const size_t n, const float_t* dW, float_t* W); float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 87e7411689..e1c405d653 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -41,6 +41,7 @@ enum class net_phase { train, test }; #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE +#define UNUSED(expr) do { (void)(expr); } while (0) #ifdef GALOIS_USE_DIST namespace deepgalois { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index b298107f4e..4468e72ea7 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -43,6 +43,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { + UNUSED(norm); galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); diff --git 
a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 354db106e9..e46a2477a6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,20 +10,7 @@ float_t* _dataToSync = nullptr; //! sync long unsigned _syncVectorSize = 0; - -graph_conv_layer::graph_conv_layer(unsigned level, bool act, bool norm, - bool bias, bool dropout, float_t dropout_rate, - std::vector in_dims, - std::vector out_dims) - : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { - assert(input_dims[0] == output_dims[0]); // num_vertices - trainable_ = true; - name_ = layer_type() + "_" + std::to_string(level); - assert(dropout_rate_ >= 0. && dropout_rate_ < 1.); - scale_ = 1. / (1. - dropout_rate_); -} - +#ifdef CPU_ONLY inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); std::default_random_engine rng(seed); @@ -43,7 +30,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } } -#ifdef CPU_ONLY // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { // normalization constant based on graph structure diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 3e12a1d603..0e3ea946f0 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -32,6 +32,7 @@ void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_dat void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(out_data); size_t x = input_dims[0]; size_t y = input_dims[1]; galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index f7cfe375cc..e4ebfaad1e 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -22,6 +22,7 @@ void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_ // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(in_data); math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index aee6e29a07..e351d11d4f 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -14,6 +14,7 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + UNUSED(in_data); size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 2288a8da26..4cddbaa854 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -44,6 +44,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data,
float_t* ou void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + if (out_grad) delete[] out_grad; size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 54e461121f..a53c81488b 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -27,8 +27,7 @@ inline label_t softmax_loss_layer::get_label(size_t i) { // TODO: need kernel fusion optimization // 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘] -void softmax_loss_layer::forward_propagation(const float_t* in_data, - float_t* out_data) { +void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t len = input_dims[1]; galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked @@ -46,9 +45,9 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, // in this forward pass; only a post-process pretty much } -void softmax_loss_layer::back_propagation(const float_t* in_data, - const float_t* out_data, +void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + if (out_grad) delete[] out_grad; // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { @@ -59,8 +58,7 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, // use ground truth to determine derivative of cross entropy math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + math::d_softmax(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); } }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index a99ce7df36..b9da782599 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,9 +12,9 @@ namespace deepgalois { -bool LearningGraph::isLocal(index_t vid) { return true; } +bool LearningGraph::isLocal(index_t vid) { UNUSED(vid); return true; } -index_t LearningGraph::getLID(index_t vid) { return 0; } +index_t LearningGraph::getLID(index_t vid) { UNUSED(vid); return 0; } bool LearningGraph::is_vertex_cut() {return true; } @@ -73,6 +73,7 @@ void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { assert(dst < num_vertices_); assert(eid < num_edges_); colidx_[eid] = dst; + if (edge_data_) edge_data_[eid] = edata; } void LearningGraph::degree_counting() { @@ -83,6 +84,40 @@ void LearningGraph::degree_counting() { }, galois::loopname("DegreeCounting")); } +void LearningGraph::add_selfloop() { + //print_neighbors(nnodes-1); + //print_neighbors(0); + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i+1]; + bool selfloop_inserted = false; + if (start == end) { + colidx_[start+i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst =
old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e+i] = i; + colidx_[e+i+1] = dst; + } else if (e+1 == end) { + selfloop_inserted = true; + colidx_[e+i+1] = i; + colidx_[e+i] = dst; + } else colidx_[e+i] = dst; + } else colidx_[e+i+1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + num_edges_ += num_vertices_; + //print_neighbors(nnodes-1); + //print_neighbors(0); +} + void LearningGraph::readGraph(std::string path, std::string dataset) { std::string filename = path + dataset + ".csgr"; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 4dcfb941ac..27de4e144f 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -64,6 +64,7 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, void csrmm_cpu(const int M, const int N, const int K, const int nnz, const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { + UNUSED(nnz); #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -331,6 +332,7 @@ void softmax(size_t n, const float_t* input, float_t* output) { void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + UNUSED(y); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { @@ -374,6 +376,7 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { + UNUSED(y); for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index 0f00b4da33..a73b5cd6d2 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -4,23 +4,21 @@ namespace deepgalois { -void adagrad::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adagrad::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - if (parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); - } else { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, galois::loopname("adagrad_update")); +/* for (size_t i = 0; i < W.size(); i++) { g[i] += dW[i] * dW[i]; W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); } - } +*/ } -void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { +void RMSprop::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { @@ -29,59 +27,54 @@ void RMSprop::update(const vec_t& dW, vec_t& W, bool parallelize) { }, galois::loopname("rms_update")); } -void adam::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adam::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), - galois::loopname("adam_update")); + 
galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } -void adamax::update(const vec_t& dW, vec_t& W, bool parallelize) { +void adamax::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, galois::loopname("adamax_update")); b1_t *= b1; } -void gradient_descent::update(const vec_t& dW, vec_t& W, bool parallelize) { - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, - galois::loopname("gradient_descent_update")); +void gradient_descent::update(const vec_t& dW, vec_t& W) { + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); + }, galois::loopname("gradient_descent_update")); } -void momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { +void momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, galois::loopname("momentum_update")); } -void nesterov_momentum::update(const vec_t& dW, vec_t& W, bool parallelize) { +void nesterov_momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); + galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, galois::loopname("nesterov_momentum_update")); } } // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 882154f5c0..58a68d1d7d 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -53,6 +53,7 @@ template uint32_t* parallel_prefix_sum(const std::vector Date: Tue, 5 May 2020 18:09:36 -0500 Subject: [PATCH 242/660] fix update_gpu --- libdeepgalois/include/deepgalois/optimizer.h | 18 ++++++++-- .../src/layers/sigmoid_loss_layer.cpp | 2 +- .../src/layers/softmax_loss_layer.cpp | 2 +- libdeepgalois/src/optimizer.cu | 36 +++++++++++++++++++ 4 files changed, 53 
insertions(+), 5 deletions(-) diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index c9db614814..4fd7caa800 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,9 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; +#ifndef CPU_ONLY virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; +#endif virtual void reset() {} // override to implement pre-learning action }; @@ -66,7 +68,9 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate private: float_t eps; @@ -81,7 +85,9 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t mu; // decay term private: @@ -95,9 +101,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef CPU_ONLY - void update_gpu(const size_t n, const float_t* dW, float_t* W); -#else +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -122,7 +126,9 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t b1; // decay term @@ -138,7 +144,9 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay }; @@ -154,7 +162,9 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay @@ -173,7 +183,9 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); +#ifndef CPU_ONLY void update_gpu(const size_t n, const float_t* dW, float_t* W); +#endif float_t alpha; // learning rate float_t lambda; // weight decay diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 4cddbaa854..60b4227ac6 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -44,7 +44,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou void 
sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (out_grad) delete[] out_grad; + UNUSED(out_grad); size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index a53c81488b..6360db26be 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -47,7 +47,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* ou void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - if (out_grad) delete[] out_grad; + UNUSED(out_grad); // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 355d959254..3a4365da6e 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -39,4 +39,40 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { b2_t *= b2; } +void adagrad::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void RMSprop::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void adamax::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void gradient_descent::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + +void nesterov_momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { + UNUSED(n); + UNUSED(dW); + UNUSED(W); +} + } From f7c41e8a431235f1645bd68d9ca204797fa592be Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:02:49 -0500 Subject: [PATCH 243/660] remove use of unused; unused params leave unnamed instead --- libdeepgalois/include/deepgalois/types.h | 1 - libdeepgalois/src/layers/aggregator.cpp | 3 +-- libdeepgalois/src/layers/l2_norm_layer.cpp | 3 +-- libdeepgalois/src/layers/leaky_relu_layer.cpp | 3 +-- libdeepgalois/src/layers/relu_layer.cpp | 3 +-- libdeepgalois/src/lgraph.cpp | 4 ++-- libdeepgalois/src/math_functions.cpp | 9 +++------ libdeepgalois/src/utils.cpp | 3 +-- 8 files changed, 10 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e1c405d653..87e7411689 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -41,7 +41,6 @@ enum class net_phase { train, test }; #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE -#define UNUSED(expr) do { (void)(expr); } while (0) #ifdef GALOIS_USE_DIST namespace deepgalois { diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4468e72ea7..d17cf79a72 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -42,8 +42,7 @@ void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* ou } void 
deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, float_t* norm_factor) { - UNUSED(norm); + bool, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 0e3ea946f0..a5a77eb82e 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -30,9 +30,8 @@ void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_dat }, galois::loopname("l2_norm")); } -void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, float_t* out_grad, float_t* in_grad) { - UNUSED(out_data); size_t x = input_dims[0]; size_t y = input_dims[1]; galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index e4ebfaad1e..f0ea5f591e 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -20,9 +20,8 @@ void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_ // 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 𝜕𝐿 / 𝜕𝑦𝑙 * ε, 𝑖𝑓 (𝑦[𝑙] ≤ 0) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑖𝑓 (𝑦[𝑙] > 0) -void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - UNUSED(in_data); math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index e351d11d4f..9e54d64975 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -12,9 +12,8 @@ void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) // 𝜕𝐿 / 𝜕𝑦[𝑙−1] = 0, 𝑖𝑓 (𝑦[𝑙] < 0) // = 𝜕𝐿 / 𝜕𝑦𝑙, 𝑜𝑡ℎ𝑒𝑟𝑤𝑖𝑠𝑒 -void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, +void relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - UNUSED(in_data); size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index b9da782599..6531034794 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -12,9 +12,9 @@ namespace deepgalois { -bool LearningGraph::isLocal(index_t vid) { UNUSED(vid); return true; } +bool LearningGraph::isLocal(index_t) { return true; } -index_t LearningGraph::getLID(index_t vid) { UNUSED(vid); return 0; } +index_t LearningGraph::getLID(index_t) { return 0; } bool LearningGraph::is_vertex_cut() {return true; } diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 27de4e144f..9e1b997f47 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -61,10 +61,9 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } -void csrmm_cpu(const int M, const int N, const int K, const int nnz, +void csrmm_cpu(const int M, const int N, const int K, const int, const float alpha, float* A_nonzeros, int*
A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { - UNUSED(nnz); #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -330,9 +329,8 @@ void softmax(size_t n, const float_t* input, float_t* output) { output[i] /= denominator; } -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, +void d_softmax(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { - UNUSED(y); vec_t df(n, 0); for (size_t i = 0; i < n; i++) { for (size_t j = 0; j < n; j++) { @@ -375,8 +373,7 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } } -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp) { - UNUSED(y); +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 58a68d1d7d..00a7d5696a 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -51,9 +51,8 @@ template uint32_t* parallel_prefix_sum(const std::vector Date: Tue, 5 May 2020 18:09:58 -0500 Subject: [PATCH 244/660] mkl: don't name params if not used --- libdeepgalois/src/math_functions.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 9e1b997f47..0923411ff2 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -61,9 +61,15 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.stop(); } -void csrmm_cpu(const int M, const int N, const int K, const int, +#ifdef USE_MKL +void csrmm_cpu(const int M, const int N, const int K, const int, const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, const float* B, const float beta, float* C) { +#else +void csrmm_cpu(const int, const int, const int, const int, + const float, float*, int*, int*, + const float*, const float, float*) { +#endif #ifdef USE_MKL //mkl_set_num_threads(56); //const char *matdescra = "GXXCX";//6 bytes @@ -99,14 +105,14 @@ void csrmm_cpu(const int M, const int N, const int K, const int, } // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, const float* A, const float* x, const float beta, float* y) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } inline void rng_uniform_cpu(size_t n, float_t* r) { #ifdef USE_MKL - VSLStreamStatePtr stream; + VSLStreamStatePtr stream; // Initializing the streams vslNewStream(&stream, VSL_BRNG_SOBOL, 1); // Generating @@ -238,7 +244,7 @@ void clear_cpu(size_t n, float_t* in) { // memset(in, 0, n*sizeof(float_t)); } -void dropout(size_t m, float scale, float dropout_rate, +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) masks[i] = bernoulli(dropout_rate); @@ -310,7 +316,7 @@ void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); } -void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out) { // TODO: vectorize 
galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { From 2af0f556f866bb6bf5e4f3cf94d57873d33de6e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:13:56 -0500 Subject: [PATCH 245/660] got rid of more unused calls in layers/opt --- .../src/layers/sigmoid_loss_layer.cpp | 3 +- .../src/layers/softmax_loss_layer.cpp | 3 +- libdeepgalois/src/optimizer.cu | 36 ++++--------------- 3 files changed, 8 insertions(+), 34 deletions(-) diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 60b4227ac6..5a511d2308 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -43,8 +43,7 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* ou } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - UNUSED(out_grad); + float_t*, float_t* in_grad) { size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { if (!use_mask || masks_[i] == 1) { // masked diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 6360db26be..2fc7ac80dc 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -46,8 +46,7 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* ou } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { - UNUSED(out_grad); + float_t*, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 3a4365da6e..6953a804c1 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -39,40 +39,16 @@ void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { b2_t *= b2; } -void adagrad::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void adagrad::update_gpu(const size_t, const float_t*, float_t*) {} -void RMSprop::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void RMSprop::update_gpu(const size_t, const float_t*, float_t*) {} -void adamax::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void adamax::update_gpu(const size_t, const float_t*, float_t*) {} -void gradient_descent::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void gradient_descent::update_gpu(const size_t, const float_t*, float_t*) {} -void momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void momentum::update_gpu(const size_t, const float_t*, float_t*) {} -void nesterov_momentum::update_gpu(const size_t n, const float_t* dW, float_t* W) { - UNUSED(n); - UNUSED(dW); - UNUSED(W); -} +void nesterov_momentum::update_gpu(const size_t, const float_t*, float_t*) {} } From 089ff4d1b076155074768001f14054e252dd44ea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 18:28:37 -0500 Subject: [PATCH 246/660] Unused vars fix for distributed gcn --- 
libdeepgalois/include/deepgalois/DistContext.h | 10 +++++++--- .../include/deepgalois/layers/GluonGradients.h | 8 ++++---- .../include/deepgalois/layers/GradientSyncStructs.h | 8 ++++---- .../deepgalois/layers/GraphConvSyncStructures.h | 8 ++++---- libdeepgalois/src/DistContext.cpp | 3 ++- libdeepgalois/src/net.cpp | 3 +++ libgalois/include/galois/graphs/LC_CSR_Graph.h | 3 +-- lonestar/gnn/include/DistributedGraphLoader.h | 3 +-- 8 files changed, 26 insertions(+), 20 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 953010f09a..7fce4a12d9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -49,9 +49,13 @@ class DistContext { //! find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void createSubgraphs(int num_subgraphs) {} - void gen_subgraph_labels(size_t m, const mask_t *masks) {} - void gen_subgraph_feats(size_t m, const mask_t *masks) {} + //void createSubgraphs(int num_subgraphs) {} + //void gen_subgraph_labels(size_t m, const mask_t *masks) {} + //void gen_subgraph_feats(size_t m, const mask_t *masks) {} + // TODO define these + void createSubgraphs(int) {} + void gen_subgraph_labels(size_t, const mask_t *) {} + void gen_subgraph_feats(size_t, const mask_t *) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index 1643a62027..a7aa66d576 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -164,22 +164,22 @@ class GluonGradients { } //! no edges, return 0 - unsigned edge_begin(uint32_t dummy) { + unsigned edge_begin(uint32_t) { return 0; } //! no edges, return 0 - unsigned edge_end(uint32_t dummy) { + unsigned edge_end(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeDst(uint32_t dummy) { + unsigned getEdgeDst(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeData(uint32_t dummy) { + unsigned getEdgeData(uint32_t) { return 0; } }; diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index d0074d11ed..1d26b87007 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -6,11 +6,11 @@ struct GradientSync { using ValTy = float_t; - static ValTy extract(uint32_t node_id, float_t& weight) { + static ValTy extract(uint32_t, float_t& weight) { return weight; } - static bool reduce(uint32_t node_id, float_t& weight, ValTy y) { + static bool reduce(uint32_t, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough //if (std::abs(weight - y) > 0.00001) { @@ -21,12 +21,12 @@ struct GradientSync { } //! reset weight to 0 - static void reset(uint32_t node_id, float_t &weight) { + static void reset(uint32_t, float_t &weight) { weight = 0; } //! 
save weight - static void setVal(uint32_t node_id, float_t &weight, ValTy y) { + static void setVal(uint32_t, float_t &weight, ValTy y) { weight = y; } diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 3b95d55f82..e4874e468f 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -5,7 +5,7 @@ struct GraphConvSync { using ValTy = std::vector; //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char& filler) { + static ValTy extract(uint32_t node_id, char&) { // TODO figure out how to avoid copy from C array to vector; best // way is if original data is in a vector probably, but that has the // issue of not being able to directly call BLAS @@ -23,7 +23,7 @@ struct GraphConvSync { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char& filler, ValTy y) { + static bool reduce(uint32_t node_id, char&, ValTy y) { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { @@ -33,11 +33,11 @@ struct GraphConvSync { } //! do nothing (waste of a write) - static void reset(uint32_t node_id, char& filler) { + static void reset(uint32_t, char&) { } //! element wise set - static void setVal(uint32_t node_id, char& filler, ValTy y) { + static void setVal(uint32_t node_id, char&, ValTy y) { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3f915ec062..66a1a0885e 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,7 +151,8 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +//void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +void DistContext::norm_factor_computing(bool, int) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index d62ac752b1..052fab6a40 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -38,6 +38,9 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, context = new deepgalois::Context(); num_samples = context->read_graph(dataset_str, selfloop); context->set_label_class(is_single_class); +#else + // only done here to avoid unused var complain TODO find better way + (void)selfloop; #endif // read graph, get num nodes diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 19aefefb27..6e8c65012b 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -748,8 +748,7 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, - EdgeIndexTy e, + void edgeDataCopy(EdgeData&, EdgeData&, EdgeIndexTy, EdgeIndexTy, typename std::enable_if::type* = 0) { // does nothing } diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index f5a896b3de..7827c1a39f 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ 
b/lonestar/gnn/include/DistributedGraphLoader.h @@ -117,13 +117,12 @@ namespace graphs { * * @tparam NodeData node data to store in graph * @tparam EdgeData edge data to store in graph - * @param scaleFactor How to split nodes among hosts * @returns a pointer to a newly allocated DistGraph based on the command line * loaded based on command line arguments */ template DistGraph* -constructSymmetricGraph(std::vector& scaleFactor) { +constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); From 7cb394b37117a08a73d5224ffff9ec7c9ddb57f3 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 19:59:32 -0500 Subject: [PATCH 247/660] add reader --- libdeepgalois/CMakeLists.txt | 16 +- libdeepgalois/include/deepgalois/context.h | 24 ++- libdeepgalois/include/deepgalois/reader.h | 18 +++ libdeepgalois/src/context.cpp | 179 ++------------------- libdeepgalois/src/context.cu | 19 +-- libdeepgalois/src/net.cpp | 19 +-- libdeepgalois/src/reader.cpp | 144 +++++++++++++++++ libgpu/include/checker.h | 15 -- libgpu/include/gg.h | 1 - libgpu/include/graph_gpu.h | 13 -- 10 files changed, 212 insertions(+), 236 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/reader.h create mode 100644 libdeepgalois/src/reader.cpp delete mode 100644 libgpu/include/checker.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 3f5bc11a95..b46750b060 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,6 +70,12 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") +if(ENABLE_HETERO_GALOIS) +set(sources + src/node.cpp + src/net.cpp +) +else() if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way # also don't link sampler @@ -85,20 +91,13 @@ set(sources src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp + src/reader.cpp src/lgraph.cpp src/utils.cpp src/node.cpp src/net.cpp ) else() -if(ENABLE_DIST_GALOIS) -set(sources - src/context.cpp - src/lgraph.cpp - src/node.cpp - src/net.cpp -) -else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -112,6 +111,7 @@ set(sources src/optimizer.cpp src/context.cpp src/sampler.cpp + src/reader.cpp src/lgraph.cpp src/utils.cpp src/node.cpp diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 5683c26f12..f9ca056421 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -6,7 +6,7 @@ #include #include #include "deepgalois/types.h" -//#include +#include "deepgalois/reader.h" #ifdef CPU_ONLY #include "deepgalois/gtypes.h" #else @@ -19,21 +19,25 @@ namespace deepgalois { class Context { public: Context(); + Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), + is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_feats(NULL), + d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + ~Context(); - size_t read_graph(std::string dataset_str, bool selfloop); - size_t read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop); - size_t read_graph_gpu(std::string dataset_str, bool selfloop); - size_t read_labels(std::string dataset_str); - size_t read_features(std::string dataset_str, std::string filetype = "bin"); - size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& 
end, mask_t* masks); + size_t read_graph(bool selfloop); + size_t read_labels() { num_classes = reader.read_labels(is_single_class, h_labels); return num_classes; } + size_t read_features() { feat_len = reader.read_features(h_feats); return feat_len; } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } + void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features @@ -69,6 +73,8 @@ class Context { #endif protected: + std::string dataset; + bool is_device; // is this on device or host size_t n; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D @@ -88,6 +94,8 @@ class Context { std::vector h_feats_subg; // input features for subgraph std::vector norm_factors_subg; // normalization constant for subgraph //float_t* norm_factors_subg; // normalization constant for subgraph + Reader reader; + void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h new file mode 100644 index 0000000000..090ec817f8 --- /dev/null +++ b/libdeepgalois/include/deepgalois/reader.h @@ -0,0 +1,18 @@ +#pragma once +#include "deepgalois/types.h" + +namespace deepgalois { + +class Reader { +private: + std::string dataset_str; +public: + Reader() : dataset_str("") {} + Reader(std::string dataset) : dataset_str(dataset) {} + void init(std::string dataset) { dataset_str = dataset; } + size_t read_labels(bool is_single_class, label_t*& labels); + size_t read_features(float_t*& feats, std::string filetype = "bin"); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); +}; + +} diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index caec001182..71410eee13 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -7,42 +7,20 @@ //#include namespace deepgalois { -/* -// Make sure each thread can have different values. 
-static boost::thread_specific_ptr thread_instance_; -Context& Context::Get() { - if (!thread_instance_.get()) { - thread_instance_.reset(new Context()); - } - return *(thread_instance_.get()); -} -*/ #ifdef CPU_ONLY -Context::Context() : n(0), num_classes(0), - feat_len(0), is_single_class(true), - is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), - //h_labels_subg(NULL), h_feats_subg(NULL), - d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), - norm_factors(NULL) {} - //norm_factors_subg(NULL) {} + +Context::Context() : Context(false) {} Context::~Context() { if (h_labels) delete[] h_labels; - //if (h_labels_subg) delete[] h_labels_subg; if (h_feats) delete[] h_feats; - //if (h_feats_subg) delete[] h_feats_subg; if (norm_factors) delete[] norm_factors; + //if (h_feats_subg) delete[] h_feats_subg; + //if (h_labels_subg) delete[] h_labels_subg; //if (norm_factors_subg) delete[] norm_factors_subg; } -size_t Context::read_graph(std::string dataset_str, bool selfloop) { - n = read_graph_cpu(dataset_str, "gr", selfloop); - return n; -} - void Context::createSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) @@ -83,19 +61,20 @@ void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } } -size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bool selfloop) { - std::string filename = path + dataset_str + ".csgr"; +size_t Context::read_graph(bool selfloop) { + std::string filename = path + dataset + ".csgr"; + std::string filetype = "gr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); if (filetype == "el") { - filename = path + dataset_str + ".el"; + filename = path + dataset + ".el"; printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { graph_cpu->readGraphFromGRFile(filename); } else if (filetype == "gr") { graph_cpu = new Graph(); - std::string filename = path + dataset_str + ".csgr"; + std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; @@ -114,7 +93,8 @@ size_t Context::read_graph_cpu(std::string dataset_str, std::string filetype, bo auto g = getGraphPointer(); std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() << "\n"; - return g->size(); + n = g->size(); + return n; } void Context::add_selfloop(Graph &og, Graph &g) { @@ -269,144 +249,7 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self #endif -// labels contain the ground truth (e.g. vertex classes) for each example -// (num_examples x 1). Note that labels is not one-hot encoded vector and it can -// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if -// required. -size_t Context::read_labels(std::string dataset_str) { - std::cout << "Reading labels ... 
"; - Timer t_read; - t_read.Start(); - std::string filename = path + dataset_str + "-labels.txt"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m; // m: number of samples - in >> m >> num_classes >> std::ws; - assert(m == n); - if (is_single_class) { - std::cout << "Using single-class (one-hot) labels\n"; - h_labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 - } else { - std::cout << "Using multi-class labels\n"; - h_labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E - } - unsigned v = 0; - while (std::getline(in, line)) { - std::istringstream label_stream(line); - unsigned x; - for (size_t idx = 0; idx < num_classes; ++idx) { - label_stream >> x; - if (is_single_class) { - if (x != 0) { - h_labels[v] = idx; - break; - } - } else { - h_labels[v*num_classes+idx] = x; - } - } - v++; - } - in.close(); - t_read.Stop(); - // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; - return num_classes; -} - -//! Read features, return the length of a feature vector -//! Features are stored in the Context class -size_t Context::read_features(std::string dataset_str, std::string filetype) { - //filetype = "txt"; - std::cout << "Reading features ... "; - Timer t_read; - t_read.Start(); - size_t m; // m = number of vertices - std::string filename = path + dataset_str + ".ft"; - std::ifstream in; - - if (filetype == "bin") { - std::string file_dims = path + dataset_str + "-dims.txt"; - std::ifstream ifs; - ifs.open(file_dims, std::ios::in); - ifs >> m >> feat_len >> std::ws; - ifs.close(); - } else { - in.open(filename, std::ios::in); - in >> m >> feat_len >> std::ws; - } - std::cout << "N x D: " << m << " x " << feat_len << "\n"; - h_feats = new float_t[m * feat_len]; - if (filetype == "bin") { - filename = path + dataset_str + "-feats.bin"; - in.open(filename, std::ios::binary|std::ios::in); - in.read((char*)h_feats, sizeof(float_t) * m * feat_len); - } else { - std::string line; - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - edge_stream >> u; - edge_stream >> v; - edge_stream >> w; - h_feats[u * feat_len + v] = w; - } - } - in.close(); - t_read.Stop(); - std::cout << "Done, feature length: " << feat_len - << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 6; i ++) - //for (auto j = 0; j < 6; j ++) - //std::cout << "feats[" << i << "][" << j << "] = " << h_feats[i*feat_len+j] << "\n"; - return feat_len; -} -//! Get masks from datafile where first line tells range of -//! 
set to create mask from -size_t Context::read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks) { - bool dataset_found = false; - for (int i = 0; i < NUM_DATASETS; i++) { - if (dataset_str == dataset_names[i]) { - dataset_found = true; - break; - } - } - if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); - } - size_t i = 0; - size_t sample_count = 0; - std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; - // std::cout << "Reading " << filename << "\n"; - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - in >> begin >> end >> std::ws; - while (std::getline(in, line)) { - std::istringstream mask_stream(line); - if (i >= begin && i < end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - masks[i] = 1; - sample_count++; - } - } - i++; - } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; - in.close(); - return sample_count; -} /* inline void init_features(size_t dim, vec_t &x) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 99f14ce11a..7f435e8ca8 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -62,12 +62,7 @@ cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; curandGenerator_t Context::curand_generator_ = 0; -Context::Context() : n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), - h_labels(NULL), h_feats(NULL), - d_labels(NULL), d_feats(NULL), - d_labels_subg(NULL), d_feats_subg(NULL), - norm_factors(NULL) { +Context::Context() : Context(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -100,11 +95,6 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { } -size_t Context::read_graph(std::string dataset_str, bool selfloop) { - n = read_graph_gpu(dataset_str, selfloop); - return n; -} - void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { @@ -136,8 +126,8 @@ void Context::SetDevice(const int device_id) { CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } */ -size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { - std::string filename = path + dataset_str + ".csgr"; +size_t Context::read_graph(bool selfloop) { + std::string filename = path + dataset + ".csgr"; CSRGraph g; g.read(filename.c_str(), false); if (selfloop) { @@ -145,7 +135,8 @@ size_t Context::read_graph_gpu(std::string dataset_str, bool selfloop) { is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); - return graph_gpu.nnodes; + n = graph_gpu.nnodes; + return n; } void Context::copy_data_to_device() { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 052fab6a40..719b5267ee 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -36,7 +36,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, ", weight_decay ", weight_decay, "\n"); #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); - num_samples = context->read_graph(dataset_str, selfloop); + context->set_dataset(dataset_str); + num_samples = context->read_graph(selfloop); context->set_label_class(is_single_class); #else // only done here to avoid unused var complain TODO find better way @@ -44,7 +45,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, #endif // read graph, get num nodes - num_classes = context->read_labels(dataset_str); + num_classes = context->read_labels(); #ifndef GALOIS_USE_DIST //std::cout << "Reading label masks ... "; @@ -62,8 +63,8 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; } else { - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); } #endif @@ -79,7 +80,7 @@ void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(dataset_str); // input feature dimension: D + feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -133,8 +134,8 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { } } } else { - train_count = context->read_masks(dataset_str, "train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks(dataset_str, "val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); } } #endif @@ -486,9 +487,9 @@ void Net::read_test_masks(std::string dataset) { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks); + test_count = 
context->read_masks("test", num_samples, test_begin, test_end, test_masks); #else - test_count = context->read_masks(dataset, "test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp new file mode 100644 index 0000000000..2ea8134254 --- /dev/null +++ b/libdeepgalois/src/reader.cpp @@ -0,0 +1,144 @@ +#include "deepgalois/reader.h" +#include "deepgalois/utils.h" +#include "deepgalois/configs.h" + +namespace deepgalois { + +// labels contain the ground truth (e.g. vertex classes) for each example +// (num_examples x 1). Note that labels is not one-hot encoded vector and it can +// be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if +// required. +size_t Reader::read_labels(bool is_single_class, label_t*& labels) { + std::cout << "Reading labels ... "; + Timer t_read; + t_read.Start(); + std::string filename = path + dataset_str + "-labels.txt"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + size_t m, num_classes; // m: number of samples + in >> m >> num_classes >> std::ws; + if (is_single_class) { + std::cout << "Using single-class (one-hot) labels\n"; + labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + } else { + std::cout << "Using multi-class labels\n"; + labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + } + unsigned v = 0; + while (std::getline(in, line)) { + std::istringstream label_stream(line); + unsigned x; + for (size_t idx = 0; idx < num_classes; ++idx) { + label_stream >> x; + if (is_single_class) { + if (x != 0) { + labels[v] = idx; + break; + } + } else { + labels[v*num_classes+idx] = x; + } + } + v++; + } + in.close(); + t_read.Stop(); + // print the number of vertex classes + std::cout << "Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; + return num_classes; +} + +//! Read features, return the length of a feature vector +//! Features are stored in the Context class +size_t Reader::read_features(float_t*& feats, std::string filetype) { + //filetype = "txt"; + std::cout << "Reading features ... 
"; + Timer t_read; + t_read.Start(); + size_t m, feat_len; // m = number of vertices + std::string filename = path + dataset_str + ".ft"; + std::ifstream in; + + if (filetype == "bin") { + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> feat_len >> std::ws; + ifs.close(); + } else { + in.open(filename, std::ios::in); + in >> m >> feat_len >> std::ws; + } + std::cout << "N x D: " << m << " x " << feat_len << "\n"; + feats = new float_t[m * feat_len]; + if (filetype == "bin") { + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary|std::ios::in); + in.read((char*)feats, sizeof(float_t) * m * feat_len); + } else { + std::string line; + while (std::getline(in, line)) { + std::istringstream edge_stream(line); + unsigned u, v; + float_t w; + edge_stream >> u; + edge_stream >> v; + edge_stream >> w; + feats[u * feat_len + v] = w; + } + } + in.close(); + t_read.Stop(); + std::cout << "Done, feature length: " << feat_len + << ", time: " << t_read.Millisecs() << " ms\n"; + //for (auto i = 0; i < 6; i ++) + //for (auto j = 0; j < 6; j ++) + //std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << "\n"; + return feat_len; +} + +//! Get masks from datafile where first line tells range of +//! set to create mask from +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + bool dataset_found = false; + for (int i = 0; i < NUM_DATASETS; i++) { + if (dataset_str == dataset_names[i]) { + dataset_found = true; + break; + } + } + if (!dataset_found) { + std::cout << "Dataset currently not supported\n"; + exit(1); + } + size_t i = 0; + size_t sample_count = 0; + std::string filename = path + dataset_str + "-" + mask_type + "_mask.txt"; + // std::cout << "Reading " << filename << "\n"; + std::ifstream in; + std::string line; + in.open(filename, std::ios::in); + in >> begin >> end >> std::ws; + while (std::getline(in, line)) { + std::istringstream mask_stream(line); + if (i >= begin && i < end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[i] = 1; + sample_count++; + } + } + i++; + } + std::cout << mask_type + "_mask range: [" << begin << ", " << end + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count/(float)n*(float)100 << "\%)\n"; + in.close(); + return sample_count; +} + +} diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h deleted file mode 100644 index 7f2cf4e36e..0000000000 --- a/libgpu/include/checker.h +++ /dev/null @@ -1,15 +0,0 @@ -#ifndef CHECKER_H -#define CHECKER_H -#include -#include - -static void check_cuda_error(const cudaError_t e, const char* file, - const int line) { - if (e != cudaSuccess) { - fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); - exit(1); - } -} -#define check_cuda(x) check_cuda_error(x, __FILE__, __LINE__) - -#endif diff --git a/libgpu/include/gg.h b/libgpu/include/gg.h index 7f4a130c23..69239fd46c 100644 --- a/libgpu/include/gg.h +++ b/libgpu/include/gg.h @@ -34,7 +34,6 @@ unsigned const debug = GGDEBUG; #include "Timer.h" -#include "checker.h" template static void check_retval(const T retval, const T expected, const char* file, diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 6815d1304f..f456c367dc 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -16,7 +16,6 @@ #include #include -#include "checker.h" // Adapted from LSG CSRGraph.h @@ -93,18 
+92,6 @@ struct CSRGraph { return edge_data[abs_edge]; }; - void init_from_mgraph(int m, int nnz, index_type *h_row_offsets, index_type *h_column_indices, node_data_type *h_labels) { - nnodes = m; - nedges = nnz; - check_cuda(cudaMalloc((void **)&row_start, (m + 1) * sizeof(index_type))); - check_cuda(cudaMalloc((void **)&edge_dst, nnz * sizeof(index_type))); - check_cuda(cudaMemcpy(row_start, h_row_offsets, (m + 1) * sizeof(index_type), cudaMemcpyHostToDevice)); - check_cuda(cudaMemcpy(edge_dst, h_column_indices, nnz * sizeof(index_type), cudaMemcpyHostToDevice)); - #ifdef ENABLE_LABEL - check_cuda(cudaMalloc((void **)&node_data, m * sizeof(node_data_type))); - check_cuda(cudaMemcpy(node_data, h_labels, m * sizeof(node_data_type), cudaMemcpyHostToDevice)); - #endif - } void print_neighbors(index_type vid) { printf("Vertex %d neighbors: [ ", vid); index_type start = row_start[vid]; From a5dacc491c1dcc1446637091db2ea9d476cbde92 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 5 May 2020 20:13:22 -0500 Subject: [PATCH 248/660] update node --- libdeepgalois/CMakeLists.txt | 5 +--- .../include/deepgalois/layers/node.h | 6 +--- libdeepgalois/src/node.cpp | 29 ++++--------------- libdeepgalois/src/node.cu | 23 ++++++++------- 4 files changed, 21 insertions(+), 42 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index b46750b060..43d7fb5fac 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -71,10 +71,7 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) -set(sources - src/node.cpp - src/net.cpp -) +set(sources src/net.cpp) else() if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index ec7c319d87..e8699d2498 100644 --- a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -47,12 +47,8 @@ class edge { : num_samples_(n), ft_dim_(len), data_(NULL), grad_(NULL), prev_(prev) {} void alloc(); - void alloc_gpu(); - void merge_grads(vec_t* dst); - void merge_grads_gpu(float_t* dst); void clear_grads(); - void clear_grads_gpu(); - + void merge_grads(float_t* dst); void set_data(float_t* ptr) { data_ = ptr; } float_t* get_data() { return data_; } const float_t* get_data() const { return data_; } diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index b1ee96a58b..fbd8d2bc6a 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -4,41 +4,24 @@ namespace deepgalois { void edge::alloc() { - // std::cout << "Allocating memory for tensors (intermediate features and - // gradients) ...\n"; -#ifdef CPU_ONLY data_ = new float_t[num_samples_ * ft_dim_]; grad_ = new float_t[num_samples_ * ft_dim_]; -#else - alloc_gpu(); -#endif } -void edge::merge_grads(vec_t* dst) { +void edge::merge_grads(float_t* dst) { assert(grad_ != NULL); - dst->resize(ft_dim_); - float_t* pdst = &(*dst)[0]; -#ifdef CPU_ONLY - std::copy(grad_, grad_ + ft_dim_, pdst); + if(dst) delete[] dst; + dst = new float_t[ft_dim_]; + std::copy(grad_, grad_ + ft_dim_, dst); // @todo consider adding parallelism and vectorization for (size_t sample = 1; sample < num_samples_; ++sample) { for (size_t i = 0; i < ft_dim_; i++) - pdst[i] += grad_[sample * ft_dim_ + i]; - // vectorize::reduce(&grad_[sample][0], ft_dim_, pdst); + dst[i] += grad_[sample * ft_dim_ + i]; } -#else - merge_grads_gpu(pdst); -#endif } void 
edge::clear_grads() { -#ifdef CPU_ONLY - std::fill(grad_, grad_ + ft_dim_ * num_samples_, - float_t(0)); // TODO: need vectorize - // vectorize::fill(&grad_[0], grad_.size(), float_t(0)); -#else - clear_grads_gpu(); -#endif + std::fill(grad_, grad_ + ft_dim_ * num_samples_, float_t(0)); } } // namespace deepgalois diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index 88d486f369..b5a17af1fd 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -1,18 +1,21 @@ #include "deepgalois/layers/node.h" #include "deepgalois/cutils.h" +#include "deepgalois/math_functions.hh" -void deepgalois::edge::alloc_gpu() { - CUDA_CHECK( - cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK( - cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +namespace deepgalois { + +void edge::alloc() { + CUDA_CHECK(cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); +} + +void edge::merge_grads_gpu(float_t* dst) { + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } -void deepgalois::edge::merge_grads_gpu(float_t* dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), - cudaMemcpyDeviceToHost)); +void edge::clear_grads() { + //CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); } -void deepgalois::edge::clear_grads_gpu() { - CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); } From 6e356d24877f7cd38210c6f2a6a1bac5486f6fa5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 09:24:23 -0500 Subject: [PATCH 249/660] remove cpp for gpu --- libdeepgalois/CMakeLists.txt | 7 +- libdeepgalois/include/deepgalois/net.h | 421 +++++++++++++++++++++-- libdeepgalois/src/net.cpp | 455 +------------------------ libdeepgalois/src/net.cu | 19 ++ libgpu/include/checker.h | 15 + libgpu/include/csr_graph.h | 4 + lonestar/gnn/gcn/CMakeLists.txt | 9 +- lonestar/gnn/gcn/gcn.cpp | 10 +- 8 files changed, 465 insertions(+), 475 deletions(-) create mode 100644 libgpu/include/checker.h diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 43d7fb5fac..23a0b44ed7 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,9 +70,7 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(ENABLE_HETERO_GALOIS) -set(sources src/net.cpp) -else() +if(NOT ENABLE_HETERO_GALOIS) if(ENABLE_DIST_GALOIS) # do not link regular context.cpp; TODO do this conditional in cleaner way # also don't link sampler @@ -115,9 +113,7 @@ set(sources src/net.cpp ) endif() -endif() -#set(BOOST_LIBRARIES "-lboost_system -lboost_thread") add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) target_link_libraries(dg_cpu ${MPI_CXX_LIBRARIES}) @@ -150,3 +146,4 @@ set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) +endif() diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 9c794a9063..5bab5f12d2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -1,9 +1,7 @@ /** * Based on the net.hpp file from Caffe deep learning framework. 
*/ -#ifndef _MODEL_H_ -#define _MODEL_H_ - +#pragma once #include #include "deepgalois/types.h" #include "deepgalois/layers/l2_norm_layer.h" @@ -11,6 +9,7 @@ #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" +#include "deepgalois/utils.h" #ifdef CPU_ONLY #include "deepgalois/sampler.h" #endif @@ -29,6 +28,75 @@ namespace deepgalois { // layer 2: features N x 16, weights 16 x E, out N x E class Net { public: + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv) : + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), is_selfloop(selfloop) { + assert(n_conv > 0); + std::cout << "Configuration: num_threads " << num_threads + << ", num_conv_layers " << num_conv_layers + << ", num_epochs " << num_epochs + << ", hidden1 " << hidden1 + << ", learning_rate " << learning_rate + << ", dropout_rate " << dropout_rate + << ", weight_decay " << weight_decay << "\n"; + num_layers = num_conv_layers + 1; + if (has_l2norm) num_layers ++; + if (has_dense) num_layers ++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); +#ifndef GALOIS_USE_DIST + context = new deepgalois::Context(); + context->set_dataset(dataset_str); + num_samples = context->read_graph(selfloop); + context->set_label_class(is_single_class); + // read graph, get num nodes + num_classes = context->read_labels(); + + //std::cout << "Reading label masks ... 
"; + train_masks = new mask_t[num_samples]; + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks+num_samples, 0); + std::fill(val_masks, val_masks+num_samples, 0); + + // get training and validation sets + if (dataset_str == "reddit") { + train_begin = 0, train_count = 153431, + train_end = train_begin + train_count; + val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + // TODO do all can be used below + for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; + } else { + train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); + } + + if (subgraph_sample_size > train_count) { + std::cout << "FATAL: subgraph size can not be larger than the size of training set\n"; + exit(1); + } + + feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E + layers.resize(num_layers); + context->set_use_subgraph(subgraph_sample_size > 0); + init(); +#endif + } + Net() : is_single_class(true), has_l2norm(false), has_dense(false), neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), num_samples(0), num_classes(0), @@ -40,10 +108,8 @@ class Net { val_interval(1), num_subgraphs(1), num_vertices_sg(9000), train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - void init(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sample_sz, unsigned subg_sample_sz, int val_itv); + + void init(); #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif @@ -51,21 +117,334 @@ class Net { size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void construct_layers(); - void append_out_layer(size_t layer_id); - void append_l2norm_layer(size_t layer_id); - void append_dense_layer(size_t layer_id); - void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true); //! 
Add a convolution layer to the network - - void train(optimizer* opt, bool need_validate); // training - double evaluate(std::string type, acc_t& loss, acc_t& acc); // inference - void read_test_masks(std::string dataset); - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks); // forward propagation - void bprop(); // back propogation void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay - void update_weights(optimizer* opt); // update trainable weights after back-propagation + + void train(optimizer* opt, bool need_validate) { + std::string header = ""; + std::string seperator = " "; +#ifdef GALOIS_USE_DIST + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; +#endif + + double total_train_time = 0.0; + int num_subg_remain = 0; +#ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST + if (subgraph_sample_size) { + context->createSubgraphs(num_subgraphs); + subgraphs_masks = new mask_t[num_samples*num_subgraphs]; + std::cout << "\nConstruct training vertex set induced graph...\n"; + sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); + } +#endif +#endif + std::cout << "\nStart training...\n"; + Timer t_epoch; + // run epochs + for (int ep = 0; ep < num_epochs; ep++) { + t_epoch.Start(); + + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + std::cout << "Generating " << num_subgraphs << " subgraphs "; + Timer t_subgen; + t_subgen.Start(); + // generate subgraphs +#ifdef CPU_ONLY +#ifndef GALOIS_USE_DIST + for (int sid = 0; sid < num_subgraphs; sid++) { + //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + unsigned tid = 0; + //tid = galois::substrate::ThreadPool::getTID(); + sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); + }//, galois::loopname("subgraph_gen")); +#endif +#endif + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + //std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + } +#ifndef GALOIS_USE_DIST + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = context->getSubgraphPointer(i); + sg_ptr->degree_counting(); + //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + } +#endif //GALOIS_USE_DIST + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraph_ptr = context->getSubgraphPointer(sg_id); + num_vertices_sg = subgraph_ptr->size(); + //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_vertices_sg); + context->norm_factor_computing(1, sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraph_ptr); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); + } + // update labels for subgraph + context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); + layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + + // update features for subgraph + context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); + layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + } + + // training steps + std::cout << header << "Epoch " << std::setw(3) << ep << seperator; + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + 
+ // forward: after this phase, layer edges will contain intermediate features + // for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight gradients + Net::bprop(); + + // gradient update: use gradients stored on each layer to update model for + // next epoch + Net::update_weights(opt); // update parameters + + // validation / testing + set_netphases(net_phase::test); + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss + << " train_acc " << train_acc << seperator; + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + if (need_validate && ep % val_interval == 0) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss + << " val_acc " << val_acc << seperator; + std::cout << header << "time " << std::setprecision(3) << std::fixed << epoch_time + val_time + << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; + } else { + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + } + } + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; + std::cout << "\nAverage training time: " << avg_train_time + << " ms. Throughput: " << throughput << " epoch/s\n"; + } + + // evaluate, i.e. inference or predict + double evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO may need to do something for the dist case + Timer t_eval; + t_eval.Start(); + size_t begin = 0, end = 0, count = 0; + mask_t* masks = NULL; + if (type == "train") { + begin = train_begin; + end = train_end; + count = train_count; + masks = train_masks; + if (subgraph_sample_size) { + // update masks for subgraph + masks = NULL; + begin = 0; + end = num_vertices_sg; + count = num_vertices_sg; + } + } else if (type == "val") { + begin = val_begin; + end = val_end; + count = val_count; + masks = val_masks; + } else { + begin = test_begin; + end = test_end; + count = test_count; + masks = test_masks; + } +#ifdef CPU_ONLY + if (subgraph_sample_size && type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(context->getGraphPointer()); + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + } + layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + } +#else + if (type == "train") { + masks = d_train_masks; + } else if (type == "val") { + masks = d_val_masks; + } else { + masks = d_test_masks; + } +#endif + loss = fprop(begin, end, count, masks); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + label_t* labels; + if (type == "train" && subgraph_sample_size) { + labels = context->get_labels_subg_ptr(); + } else { + labels = context->get_labels_ptr(); + } + if (is_single_class) { + acc = masked_accuracy(begin, end, count, masks, predictions, labels); + } else { + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); + } + t_eval.Stop(); + return t_eval.Millisecs(); + } 
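With this change the Net constructor, train() and evaluate() live entirely in the header, so an application only instantiates Net and calls into it. The sketch below shows roughly how such a driver looks, mirroring the lonestar/gnn/gcn/gcn.cpp change later in this patch; it assumes the lonestargnn command-line globals (dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval) are in scope and that opt points to some concrete deepgalois::optimizer; it is an illustrative sketch, not code added by this patch.

    // Minimal driver sketch (assumes lonestargnn CLI globals and a concrete optimizer).
    void run_gnn(deepgalois::optimizer* opt) {
      // construct the network from the CLI configuration (same call as in gcn.cpp)
      deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs,
                              hidden1, learning_rate, dropout_rate, weight_decay,
                              add_selfloop, is_single_class, add_l2norm, add_dense,
                              neighbor_sample_sz, subgraph_sample_sz, val_interval);
      network.construct_layers();            // conv layers, optional l2norm/dense, output layer
      network.train(opt, true);              // per epoch: fprop, bprop, weight update, validation
      network.read_test_masks(dataset);      // load the "test" split masks
      acc_t test_loss = 0.0, test_acc = 0.0; // acc_t comes from deepgalois/types.h
      network.evaluate("test", test_loss, test_acc);
    }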
+ + // read masks of test set + void read_test_masks(std::string dataset) { + test_masks = new mask_t[num_samples]; + if (dataset == "reddit") { + test_begin = 177262; + test_count = 55703; + test_end = test_begin + test_count; +#ifndef GALOIS_USE_DIST + for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; +#else + for (size_t i = test_begin; i < test_end; i++) { + if (dGraph->isLocal(i)) { + test_masks[dGraph->getLID(i)] = 1; + } + } +#endif + } else { +#ifndef GALOIS_USE_DIST + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); +#else + test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); +#endif + } +#ifndef CPU_ONLY + copy_test_masks_to_device(); +#endif + } + void copy_test_masks_to_device(); + + void construct_layers() { + // append conv layers + std::cout << "\nConstructing layers...\n"; + for (size_t i = 0; i < num_conv_layers-1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + if (has_l2norm) + append_l2norm_layer(num_conv_layers); // l2_norm layer + if (has_dense) + append_dense_layer(num_layers-2); // dense layer + append_out_layer(num_layers-1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) + connect(layers[i - 1], layers[i]); + for (size_t i = 0; i < num_layers; i++) + layers[i]->malloc_and_init(); + layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + // precompute the normalization constant based on graph structure + context->norm_factor_computing(0); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + set_contexts(); + } + + //! Add an l2_norm layer to the network + void append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); + } + + //! Add an dense layer to the network + void append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); + } + + //! Add an output layer to the network + void append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); + } + + //! 
Add a convolution layer to the network + void append_conv_layer(size_t layer_id, bool act=false, bool norm=true, bool bias=false, bool dropout=true) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); + layers[layer_id]->set_graph_ptr(context->getGraphPointer()); + } + + // update trainable weights after back-propagation + void update_weights(optimizer* opt) { + normalize(); + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } + } + + //! forward propagation: [begin, end) is the range of samples used. + //! calls "forward" on each layer and returns the loss of the final layer + acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + // set mask for the last layer + layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + // layer0: from N x D to N x 16 + // layer1: from N x 16 to N x E + // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { + layers[i]->forward(); + // TODO need to sync model between layers here + } + // prediction error + auto loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; + } + + void bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } + } //! Save the context object to all layers of the network void set_contexts() { @@ -104,6 +483,7 @@ class Net { int val_interval; int num_subgraphs; int num_vertices_sg; + bool is_selfloop; mask_t* train_masks; // masks for training mask_t* d_train_masks; // masks for training on device @@ -133,4 +513,3 @@ class Net { } // namespace deepgalois -#endif diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 719b5267ee..1f63eacc60 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -4,105 +4,10 @@ #include "galois/Timer.h" #include "deepgalois/net.h" -#include "deepgalois/utils.h" #include "deepgalois/math_functions.hh" namespace deepgalois { -void Net::init(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) { - assert(n_conv > 0); - num_threads = nt; - num_conv_layers = n_conv; - num_epochs = epochs; - learning_rate = lr; - dropout_rate = dropout; - weight_decay = wd; - is_single_class = single; - has_l2norm = l2norm; - has_dense = dense; - neighbor_sample_size = neigh_sz; - subgraph_sample_size = subg_sz; - val_interval = val_itv; - //num_subgraphs = 1;//num_threads; - galois::gPrint("Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, - ", num_epochs ", num_epochs, - ", hidden1 ", hidden1, - ", learning_rate ", learning_rate, - ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); -#ifndef GALOIS_USE_DIST - context = new deepgalois::Context(); - context->set_dataset(dataset_str); - num_samples = context->read_graph(selfloop); - context->set_label_class(is_single_class); -#else - // only done here to avoid unused var complain TODO find better way - (void)selfloop; -#endif - - // 
read graph, get num nodes - num_classes = context->read_labels(); - -#ifndef GALOIS_USE_DIST - //std::cout << "Reading label masks ... "; - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); - - // get training and validation sets - if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; - // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; - } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); - } -#endif - - if (subgraph_sample_size > train_count) { - galois::gPrint("FATAL: subgraph size can not be larger than the size of training set\n"); - exit(1); - } - // NOTE: train_begin/train_end are global IDs, train_masks is a local id - // train count and val count are LOCAL counts - - num_layers = num_conv_layers + 1; - if (has_l2norm) num_layers ++; - if (has_dense) num_layers ++; - // initialize feature metadata - feature_dims.resize(num_layers + 1); - feature_dims[0] = context->read_features(); // input feature dimension: D - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); - -#ifndef GALOIS_USE_DIST - context->set_use_subgraph(subgraph_sample_size > 0); -#ifdef CPU_ONLY - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); -#else - copy_masks_device(num_samples, train_masks, d_train_masks); - copy_masks_device(num_samples, val_masks, d_val_masks); - context->copy_data_to_device(); // copy labels and input features to the device -#endif -#endif -} - #ifdef GALOIS_USE_DIST void Net::dist_init(Graph* graph, std::string dataset_str) { dGraph = graph; @@ -111,6 +16,7 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { context->saveGraph(dGraph); // TODO self loop setup? context->initializeSyncSubstrate(); + num_classes = context->read_labels(); //std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; @@ -137,367 +43,38 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); } -} -#endif - -void Net::train(optimizer* opt, bool need_validate) { - std::string header = ""; - std::string seperator = " "; -#ifdef GALOIS_USE_DIST - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; -#endif - - galois::StatTimer Tupdate("Train-WeightUpdate"); - galois::StatTimer Tfw("Train-Forward"); - galois::StatTimer Tbw("Train-Backward"); - galois::StatTimer Tval("Validation"); - double total_train_time = 0.0; - int num_subg_remain = 0; -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - if (subgraph_sample_size) { - context->createSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples*num_subgraphs]; - galois::gPrint("\nConstruct training vertex set induced graph...\n"); - sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); - } -#endif -#endif - galois::gPrint("\nStart training...\n"); - Timer t_epoch; - // run epochs - for (int ep = 0; ep < num_epochs; ep++) { - t_epoch.Start(); - - if (subgraph_sample_size) { - if (num_subg_remain == 0) { - galois::gPrint("Generating ", num_subgraphs, " subgraphs "); - Timer t_subgen; - t_subgen.Start(); - // generate subgraphs -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { - unsigned tid = 0; - //tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }//, galois::loopname("subgraph_gen")); -#endif -#endif - num_subg_remain = num_subgraphs; - t_subgen.Stop(); - //galois::gPrint("Done, time: ", t_subgen.Millisecs(), "\n"); - } -#ifndef GALOIS_USE_DIST - for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = context->getSubgraphPointer(i); - sg_ptr->degree_counting(); - //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); - } -#endif //GALOIS_USE_DIST - num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraph_ptr = context->getSubgraphPointer(sg_id); - num_vertices_sg = subgraph_ptr->size(); - //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_vertices_sg); - context->norm_factor_computing(1, sg_id); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraph_ptr); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); - } - // update labels for subgraph - context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); - - // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data - } - - // training steps - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - - // forward: after this phase, layer 
edges will contain intermediate features - // for use during backprop - Tfw.start(); - double fw_time = evaluate("train", train_loss, train_acc); - Tfw.stop(); - - // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight gradients - Tbw.start(); - Net::bprop(); - Tbw.stop(); - - // gradient update: use gradients stored on each layer to update model for - // next epoch - Tupdate.start(); - Net::update_weights(opt); // update parameters - Tupdate.stop(); - - // validation / testing - set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, train_loss, - " train_acc ", train_acc, seperator); - t_epoch.Stop(); - double epoch_time = t_epoch.Millisecs(); - total_train_time += epoch_time; - if (need_validate && ep % val_interval == 0) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - Tval.start(); - double val_time = evaluate("val", val_loss, val_acc); - Tval.stop(); - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, - " val_acc ", val_acc, seperator); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, - " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); - } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); - } - } - double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint("\nAverage training time: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers-1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E + layers.resize(num_layers); } - -// evaluate, i.e. 
inference or predict -double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case - Timer t_eval; - t_eval.Start(); - size_t begin = 0, end = 0, count = 0; - mask_t* masks = NULL; - if (type == "train") { - begin = train_begin; - end = train_end; - count = train_count; - masks = train_masks; - if (subgraph_sample_size) { - // update masks for subgraph - masks = NULL; - begin = 0; - end = num_vertices_sg; - count = num_vertices_sg; - } - } else if (type == "val") { - begin = val_begin; - end = val_end; - count = val_count; - masks = val_masks; - } else { - begin = test_begin; - end = test_end; - count = test_count; - masks = test_masks; - } -#ifdef CPU_ONLY - if (subgraph_sample_size && type != "train") { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data - } -#else - if (type == "train") { - masks = d_train_masks; - } else if (type == "val") { - masks = d_val_masks; - } else { - masks = d_test_masks; - } #endif - loss = fprop(begin, end, count, masks); - float_t* predictions = layers[num_layers - 1]->next()->get_data(); - label_t* labels; - if (type == "train" && subgraph_sample_size) { - labels = context->get_labels_subg_ptr(); - } else { - labels = context->get_labels_ptr(); - } - if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, predictions, labels); - } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); - } - t_eval.Stop(); - return t_eval.Millisecs(); -} - -//! forward propagation: [begin, end) is the range of samples used. -//! calls "forward" on the layers of the network and returns the loss of the -//! 
final layer -acc_t Net::fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) - for (size_t i = 0; i < num_layers; i++) { - layers[i]->forward(); - // TODO need to sync model between layers here - } - // prediction error - auto loss = layers[num_layers - 1]->get_prediction_loss(); - // Squared Norm Regularization to mitigate overfitting - loss += weight_decay * layers[0]->get_weight_decay_loss(); - return loss; -} - -void Net::bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } -} - -// Scale gradient to counterbalance accumulation -void Net::normalize() { +#ifdef CPU_ONLY +void Net::init() { + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); } // add weight decay void Net::regularize() { size_t layer_id = 0; auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; -#ifdef CPU_ONLY // TODO: parallel math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), layers[layer_id]->get_grads_ptr()); -#else - axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), - layers[layer_id]->get_grads_device_ptr()); -#endif -} - -void Net::update_weights(optimizer* opt) { - normalize(); - regularize(); - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } -} - -void Net::construct_layers() { - // append conv layers - std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers-1; i++) - append_conv_layer(i, true); // conv layers, act=true - append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false - if (has_l2norm) - append_l2norm_layer(num_conv_layers); // l2_norm layer - if (has_dense) - append_dense_layer(num_layers-2); // dense layer - append_out_layer(num_layers-1); // output layer - - // allocate memory for intermediate features and gradients - for (size_t i = 0; i < num_layers; i++) { - layers[i]->add_edge(); - } - for (size_t i = 1; i < num_layers; i++) - connect(layers[i - 1], layers[i]); - for (size_t i = 0; i < num_layers; i++) - layers[i]->malloc_and_init(); - layers[0]->set_in_data(context->get_feats_ptr()); // feed input data - // precompute the normalization constant based on graph structure - context->norm_factor_computing(0); - for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); - set_contexts(); -} - -//! Add an l2_norm layer to the network -void Net::append_l2norm_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); -} - -//! Add an dense layer to the network -void Net::append_dense_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); -} - -//! 
Add an output layer to the network -void Net::append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); } -//! Add a convolution layer to the network -void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, - bool dropout) { - assert(dropout_rate < 1.0); - assert(layer_id < num_conv_layers); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); - layers[layer_id]->set_graph_ptr(context->getGraphPointer()); -} - -void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[num_samples]; - if (dataset == "reddit") { - test_begin = 177262; - test_count = 55703; - test_end = test_begin + test_count; -#ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; -#else - for (size_t i = test_begin; i < test_end; i++) { - if (dGraph->isLocal(i)) { - test_masks[dGraph->getLID(i)] = 1; - } - } -#endif - } else { -#ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); -#else - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); -#endif - } -#ifndef CPU_ONLY - copy_masks_device(num_samples, test_masks, d_test_masks); -#endif +// Scale gradient to counterbalance accumulation +void Net::normalize() { } -#ifdef CPU_ONLY /** * * @param begin GLOBAL begin diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 115ff6d81d..1a50c0c551 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -143,6 +143,25 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, } namespace deepgalois { + +void Net::init() { + copy_masks_device(num_samples, train_masks, d_train_masks); + copy_masks_device(num_samples, val_masks, d_val_masks); + context->copy_data_to_device(); // copy labels and input features to the device +} + +void Net::copy_test_masks_to_device() { + copy_masks_device(num_samples, test_masks, d_test_masks); +} + +// add weight decay +void Net::regularize() { + size_t layer_id = 0; + auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); +} + acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); diff --git a/libgpu/include/checker.h b/libgpu/include/checker.h new file mode 100644 index 0000000000..7f2cf4e36e --- /dev/null +++ b/libgpu/include/checker.h @@ -0,0 +1,15 @@ +#ifndef CHECKER_H +#define CHECKER_H +#include +#include + +static void check_cuda_error(const cudaError_t e, const char* file, + const int line) { + if (e != cudaSuccess) { + fprintf(stderr, "%s:%d: %s (%d)\n", file, line, cudaGetErrorString(e), e); + exit(1); + } +} +#define check_cuda(x) check_cuda_error(x, 
__FILE__, __LINE__) + +#endif diff --git a/libgpu/include/csr_graph.h b/libgpu/include/csr_graph.h index c663edb8a3..7fff0750e4 100644 --- a/libgpu/include/csr_graph.h +++ b/libgpu/include/csr_graph.h @@ -18,7 +18,10 @@ //#include "graph_gpu.h" #include +#include "checker.h" +#include "graph_gpu.h" +/* // Adapted from LSG CSRGraph.h // TODO: make this template data @@ -143,6 +146,7 @@ struct CSRGraph { bool device_graph; }; >>>>>>> dist-dev +//*/ struct CSRGraphTex : CSRGraph { cudaTextureObject_t edge_dst_tx; diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index eff742aa69..589f60b881 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,13 +1,12 @@ -#app(gcn gcn.cpp) add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) +if(ENABLE_HETERO_GALOIS) + target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() target_link_libraries(gcn PRIVATE dg_cpu) if(ENABLE_DIST_GALOIS) target_link_libraries(gcn PRIVATE distgraphloader) endif() - -if(ENABLE_HETERO_GALOIS) - target_link_libraries(gcn PRIVATE dg_gpu) - target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) endif() diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index de3f2a76ee..a8ab651603 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -16,7 +16,11 @@ int main(int argc, char** argv) { galois::DistMemSys G; #endif LonestarGnnStart(argc, argv, name, desc, url); - deepgalois::Net network; // the neural network to train + // the neural network to train + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, + hidden1, learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); #ifdef GALOIS_USE_DIST std::vector dummyVec; @@ -25,10 +29,6 @@ int main(int argc, char** argv) { #endif // read network, features, ground truth, initialize metadata - network.init(dataset, numThreads, num_conv_layers, epochs, hidden1, - learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); // default setting for now; can be customized by the user network.construct_layers(); network.print_layers_info(); From d9aae5d50e8359861bd671b81db59900e468d090 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 10:04:37 -0500 Subject: [PATCH 250/660] fix gtypes --- libdeepgalois/include/deepgalois/context.h | 12 ++--- libdeepgalois/include/deepgalois/gtypes.h | 48 +++++++++++-------- .../include/deepgalois/layers/aggregator.h | 8 ++-- .../include/deepgalois/layers/layer.h | 8 ++-- libdeepgalois/include/deepgalois/lgraph.h | 11 +++-- libdeepgalois/include/deepgalois/sampler.h | 2 +- libdeepgalois/src/context.cpp | 9 +--- libdeepgalois/src/layers/aggregator.cpp | 4 +- libdeepgalois/src/layers/l2_norm_layer.cpp | 1 + .../src/layers/sigmoid_loss_layer.cpp | 5 +- .../src/layers/softmax_loss_layer.cpp | 5 +- libdeepgalois/src/net.cpp | 5 +- libdeepgalois/src/node.cu | 2 +- libdeepgalois/src/sampler.cpp | 1 + 14 files changed, 66 insertions(+), 55 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index f9ca056421..afabe49973 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -10,7 +10,7 @@ #ifdef CPU_ONLY #include 
"deepgalois/gtypes.h" #else -#include "graph_gpu.h" +//#include "graph_gpu.h" #include "deepgalois/cutils.h" #endif @@ -46,22 +46,22 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); -#ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; +#ifdef CPU_ONLY float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else - CSRGraph graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - CSRGraph* getGraphPointer() { return &graph_gpu; } - CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + //CSRGraph graph_gpu; // the input graph, |V| = N + //std::vector subgraphs_gpu; + //CSRGraph* getGraphPointer() { return &graph_gpu; } + //CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 697d386d9a..cc6fba8041 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,37 +1,47 @@ #pragma once -#include "galois/Galois.h" -#include "galois/graphs/LCGraph.h" #include "deepgalois/types.h" -#include "deepgalois/lgraph.h" #ifdef GALOIS_USE_DIST +#include "galois/Galois.h" #include "galois/graphs/NewGeneric.h" +#else +#ifdef CPU_ONLY +//#include "galois/Galois.h" +//#include "galois/graphs/LCGraph.h" +#include "deepgalois/lgraph.h" +#else +#include "graph_gpu.h" #endif - -namespace deepgalois { - -typedef galois::GAccumulator AccumF; -typedef galois::GAccumulator AccumU; -#ifdef GALOIS_USE_DIST -using AccuracyAccum = galois::DGAccumulator; #endif #ifndef GALOIS_USE_DIST -#ifdef EDGE_LABEL -typedef galois::graphs::LC_CSR_Graph::with_numa_alloc< - true>::type ::with_no_lockable::type LCGraph; -#else -typedef galois::graphs::LC_CSR_Graph:: - with_numa_alloc::type ::with_no_lockable::type LCGraph; -#endif + +namespace deepgalois { +#ifdef CPU_ONLY +//#ifdef EDGE_LABEL +//typedef galois::graphs::LC_CSR_Graph:: +// with_numa_alloc::type ::with_no_lockable::type LCGraph; +//#else +//typedef galois::graphs::LC_CSR_Graph:: +// with_numa_alloc::type ::with_no_lockable::type LCGraph; +//#endif //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; -typedef LearningGraph Graph; typedef index_t edge_iterator; +typedef LearningGraph Graph; +#else +//typedef CSRGraph GraphGPU; +typedef LearningGraph GraphGPU; +#endif +} + #else + +namespace deepgalois { // TODO check if this needs changing typedef index_t edge_iterator; using Graph = galois::graphs::DistGraph; +} + #endif -} diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 1b2d4b5104..67d4bedf3f 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,9 +1,9 @@ #pragma once -#include "deepgalois/types.h" +//#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" //! 
For each node in the graph, add the embeddings of all of its neighbors //! together (using norm_factor if specified) #ifdef CPU_ONLY -#include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -13,9 +13,9 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, #else #include "graph_gpu.h" namespace deepgalois { -void update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_csrmm(size_t len, CSRGraph& g, const float_t* in, +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); } #endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 206e5e7da3..5d4d1419d2 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -75,11 +75,11 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifdef CPU_ONLY +//#ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } -#else - void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } -#endif +//#else +// void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } +//#endif void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! set the data of the previous layer connected to this one diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index f8e5ce8315..a82a80c989 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -38,11 +38,7 @@ class LearningGraph { void readGraphFromGRFile(const std::string& filename); size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } - index_t getDegree(index_t vid) { return degrees_[vid]; } - index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t get_degree(index_t vid) { return degrees_[vid]; } - index_t edge_begin(index_t vid) { return rowptr_[vid]; } - index_t edge_end(index_t vid) { return rowptr_[vid+1]; } iterator begin() const { return iterator(0); } iterator end() const { return iterator(num_vertices_); } @@ -66,6 +62,11 @@ class LearningGraph { uint64_t globalSize(); #ifdef CPU_ONLY + index_t getEdgeDst(index_t eid) { return colidx_[eid]; } + index_t edge_begin(index_t vid) { return rowptr_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid+1]; } + vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } index_t* edge_dst_ptr() { return &colidx_[0]; } @@ -77,7 +78,7 @@ class LearningGraph { __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + __device__ vdata_t getData(unsigned vid) { return d_vertex_data_[vid]; } __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { 
return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index eb3b936d18..c5f8abd219 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -22,7 +22,7 @@ class Sampler { virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); - galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); + //galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 71410eee13..bfa006a1d7 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -4,12 +4,10 @@ #include "deepgalois/context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" -//#include +#include "galois/Galois.h" namespace deepgalois { -#ifdef CPU_ONLY - Context::Context() : Context(false) {} Context::~Context() { @@ -247,10 +245,6 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self } } -#endif - - - /* inline void init_features(size_t dim, vec_t &x) { std::default_random_engine rng; @@ -259,4 +253,5 @@ inline void init_features(size_t dim, vec_t &x) { x[i] = dist(rng); } */ + } // end deepgalois namespace diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index d17cf79a72..8b9e726e8e 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -1,7 +1,7 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" -#ifdef CPU_ONLY void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { //std::cout << "[update_all] graph size: " << n << "\n"; @@ -50,4 +50,4 @@ void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); Tcsrmm.stop(); } -#endif + diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index a5a77eb82e..f1cb6a4445 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/l2_norm_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 5a511d2308..d7ec46378e 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { @@ -63,8 +64,8 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* acc_t sigmoid_loss_layer::get_prediction_loss() { assert(count_ > 0); - AccumF total_loss; - AccumU valid_sample_count; + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { 
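
As a point of reference for the change above: the loss layers now use galois::GAccumulator directly instead of the old AccumF/AccumU typedefs from gtypes.h. A minimal, self-contained sketch of that masked-reduction pattern (the mask and per-sample loss arrays here are placeholders, not the real layer state) might look like:

// Hedged sketch: illustrates only the GAccumulator + do_all reduction pattern;
// `masks` and `losses` are made-up inputs, not fields of the actual layer.
#include "galois/Galois.h"
#include "galois/Reduction.h"
#include <cstddef>
#include <cstdint>

double average_masked_loss(size_t n, const uint8_t* masks, const float* losses) {
  galois::GAccumulator<double> total_loss;
  galois::GAccumulator<size_t> valid_sample_count;
  total_loss.reset();
  valid_sample_count.reset();
  galois::do_all(galois::iterate(size_t(0), n), [&](size_t i) {
    if (masks[i] == 1) {        // only count samples selected by the mask
      total_loss += losses[i];
      valid_sample_count += 1;
    }
  }, galois::loopname("masked_loss_reduce"));
  size_t cnt = valid_sample_count.reduce();
  return cnt ? total_loss.reduce() / static_cast<double>(cnt) : 0.0;
}

// Usage note: a galois::SharedMemSys (or DistMemSys) instance must be alive
// before galois::do_all is invoked, as in the gcn.cpp driver.
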
diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 2fc7ac80dc..d40ff6d411 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -1,5 +1,6 @@ #include "deepgalois/layers/softmax_loss_layer.h" #include "deepgalois/math_functions.hh" +#include "galois/Galois.h" namespace deepgalois { @@ -66,8 +67,8 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* acc_t softmax_loss_layer::get_prediction_loss() { assert(count_ > 0); - AccumF total_loss; - AccumU valid_sample_count; + galois::GAccumulator total_loss; + galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 1f63eacc60..381539df6b 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -3,6 +3,7 @@ */ #include "galois/Timer.h" +#include "galois/Galois.h" #include "deepgalois/net.h" #include "deepgalois/math_functions.hh" @@ -83,9 +84,9 @@ void Net::normalize() { */ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { #ifndef GALOIS_USE_DIST - AccumF accuracy_all; + galois::GAccumulator accuracy_all; #else - AccuracyAccum accuracy_all; + galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; sampleCount.reset(); #endif diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index b5a17af1fd..afaceaeaea 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -9,7 +9,7 @@ void edge::alloc() { CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } -void edge::merge_grads_gpu(float_t* dst) { +void edge::merge_grads(float_t* dst) { CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 47317bdd3d..f61f1bcaa4 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,5 +1,6 @@ #include "deepgalois/utils.h" #include "deepgalois/sampler.h" +#include "galois/Galois.h" #include #include #define PARALLEL_GEN From c3001cc20ddf33435611bd419042ebdc2372d0b4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 10:57:41 -0500 Subject: [PATCH 251/660] fix GraphGPU type --- libdeepgalois/include/deepgalois/context.h | 15 +++++------ libdeepgalois/include/deepgalois/gtypes.h | 5 ++-- .../include/deepgalois/layers/aggregator.h | 7 +++--- .../deepgalois/layers/graph_conv_layer.h | 4 +-- .../include/deepgalois/layers/layer.h | 12 ++++----- libdeepgalois/include/deepgalois/lgraph.h | 15 ++++++----- libdeepgalois/src/context.cu | 25 +++++++++++-------- libdeepgalois/src/layers/aggregator.cu | 18 +++++++------ libdeepgalois/src/layers/graph_conv_layer.cu | 4 +-- lonestar/gnn/gcn/CMakeLists.txt | 5 ++++ 10 files changed, 62 insertions(+), 48 deletions(-) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index afabe49973..f8b848f453 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,10 +7,11 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -#ifdef CPU_ONLY +//#ifdef CPU_ONLY #include "deepgalois/gtypes.h" -#else +//#else //#include "graph_gpu.h" +#ifndef CPU_ONLY #include 
"deepgalois/cutils.h" #endif @@ -46,22 +47,22 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); +#ifdef CPU_ONLY Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; -#ifdef CPU_ONLY float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else - //CSRGraph graph_gpu; // the input graph, |V| = N - //std::vector subgraphs_gpu; - //CSRGraph* getGraphPointer() { return &graph_gpu; } - //CSRGraph* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + GraphGPU graph_gpu; // the input graph, |V| = N + std::vector subgraphs_gpu; + GraphGPU* getGraphPointer() { return &graph_gpu; } + GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index cc6fba8041..d12ac8e0d1 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -10,13 +10,15 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else -#include "graph_gpu.h" +//#include "graph_gpu.h" +#include "deepgalois/lgraph.h" #endif #endif #ifndef GALOIS_USE_DIST namespace deepgalois { +typedef index_t edge_iterator; #ifdef CPU_ONLY //#ifdef EDGE_LABEL //typedef galois::graphs::LC_CSR_Graph:: @@ -27,7 +29,6 @@ namespace deepgalois { //#endif //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; -typedef index_t edge_iterator; typedef LearningGraph Graph; #else //typedef CSRGraph GraphGPU; diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 67d4bedf3f..90c5781189 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -1,9 +1,9 @@ #pragma once -//#include "deepgalois/types.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) #ifdef CPU_ONLY +#include "deepgalois/gtypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -11,7 +11,8 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); } #else -#include "graph_gpu.h" +#include "deepgalois/gtypes.h" +//#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 92bc999653..56c0de0be6 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -56,8 +56,8 @@ class graph_conv_layer : public layer { virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); - void d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out); + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); + void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); #endif // user-defined combine function virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5d4d1419d2..cebef58059 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,7 +9,7 @@ * Reused/revised under 3-BSD */ -#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else @@ -75,11 +75,11 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -//#ifdef CPU_ONLY +#ifdef CPU_ONLY void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } -//#else -// void set_graph_ptr(CSRGraph *ptr) { graph_gpu = ptr; } -//#endif +#else + void set_graph_ptr(GraphGPU *ptr) { graph_gpu = ptr; } +#endif void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } //! 
set the data of the previous layer connected to this one @@ -173,7 +173,7 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY Graph *graph_cpu; #else - CSRGraph *graph_gpu; + GraphGPU *graph_gpu; #endif #ifdef GALOIS_USE_DIST diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index a82a80c989..77d48d87a6 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,7 +1,6 @@ #pragma once #include "deepgalois/types.h" #include -//#include namespace deepgalois { @@ -27,7 +26,6 @@ class LearningGraph { public: typedef size_t iterator; - //using iterator = boost::counting_iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} @@ -65,7 +63,7 @@ class LearningGraph { index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - vdata_t getData(unsigned vid) { return vertex_data_[vid]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } @@ -75,11 +73,12 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - __device__ index_t getEdgeDst(unsigned edge) { return colidx_[edge]; } - __device__ index_t edge_begin(unsigned src) { return d_rowptr_[src]; } - __device__ index_t edge_end(unsigned src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(unsigned vid) { return d_vertex_data_[vid]; } - __device__ index_t getDegree(unsigned vid) { return d_degrees_[vid]; } + __device__ index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + __device__ index_t edge_begin(index_t src) { return d_rowptr_[src]; } + __device__ index_t edge_end(index_t src) { return d_rowptr_[src+1]; } + __device__ vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + __device__ index_t getDegree(index_t vid) { return d_degrees_[vid]; } + __device__ index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 7f435e8ca8..528c34b7e5 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -24,8 +24,10 @@ int64_t cluster_seedgen(void) { return seed; } +namespace deepgalois { + // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); if (temp == 0.0) norm_fac[i] = 0.0; @@ -35,16 +37,16 @@ __global__ void norm_factor_computing_node(int n, CSRGraph graph, float_t* norm_ // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); 
assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); - index_type start = graph.edge_begin(src); - index_type end = graph.edge_end(src); - for (index_type e = start; e != end; e++) { - index_type dst = graph.getEdgeDst(e); + auto start = graph.edge_begin(src); + index_t end = graph.edge_end(src); + for (index_t e = start; e != end; e++) { + index_t dst = graph.getEdgeDst(e); if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); @@ -55,8 +57,6 @@ __global__ void norm_factor_computing_edge(int n, CSRGraph graph, float_t* norm_ } } -namespace deepgalois { - cublasHandle_t Context::cublas_handle_ = 0; cusparseHandle_t Context::cusparse_handle_ = 0; cusparseMatDescr_t Context::cusparse_matdescr_ = 0; @@ -102,7 +102,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { exit(0); } #ifdef USE_CUSPARSE - int nnz = graph_gpu.nedges; + int nnz = graph_gpu.sizeEdges(); CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factors); norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); @@ -128,14 +128,17 @@ void Context::SetDevice(const int device_id) { */ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; - CSRGraph g; - g.read(filename.c_str(), false); + /*GraphGPU g; + graph.read(filename.c_str(), false); if (selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); n = graph_gpu.nnodes; + */ + graph_gpu.readGraphFromGRFile(filename); + graph_gpu.copy_to_gpu(); return n; } diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 1f739eef58..fe3aca0182 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -5,6 +5,8 @@ #include "deepgalois/layers/aggregator.h" #include "deepgalois/math_functions.hh" +namespace deepgalois { + // TODO: use warp __device__ void scale_add(const int n, const float_t alpha, const float_t* a, const float_t* b, float_t* y) { @@ -12,7 +14,7 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, y[i] = alpha * a[i] + b[i]; } -__global__ void update_all_naive(size_t n, size_t len, CSRGraph g, +__global__ void update_all_naive(size_t n, size_t len, GraphGPU g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { @@ -29,7 +31,7 @@ __global__ void update_all_naive(size_t n, size_t len, CSRGraph g, } } -__global__ void update_all_warp(size_t n, size_t len, CSRGraph g, +__global__ void update_all_warp(size_t n, size_t len, GraphGPU g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; @@ -59,23 +61,25 @@ __global__ void update_all_warp(size_t n, size_t len, CSRGraph g, } } -void deepgalois::update_all(size_t len, CSRGraph& g, const float_t* in, float_t* out, +void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; + unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } -void deepgalois::update_all_csrmm(size_t len, CSRGraph& g, const 
float_t* in, float_t* out, +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { - unsigned n = g.nnodes; + unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; //print_device_vector(10, norm_factor, "norm_factor"); float *temp; float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.nedges, 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); float_free_device(temp); } + +} diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index f4282ced42..e814631022 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -30,7 +30,7 @@ void graph_conv_layer::malloc_and_init() { init_const_gpu(y*z, 0.0, layer::d_weight_grad); } -void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -38,7 +38,7 @@ void graph_conv_layer::aggregate(size_t len, CSRGraph& g, const float_t* in, flo #endif } -void graph_conv_layer::d_aggregate(size_t len, CSRGraph& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 589f60b881..80daca4c78 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,7 +1,12 @@ +if(ENABLE_HETERO_GALOIS) + set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) +endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) if(ENABLE_HETERO_GALOIS) + set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) + set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gcn PRIVATE dg_gpu) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() From ca77dc4675706eac1ca156a8e3041ff72422115c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 5 May 2020 21:45:22 -0500 Subject: [PATCH 252/660] added a define to disable boost 1.69 warnings --- libgalois/include/galois/runtime/Statistics.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgalois/include/galois/runtime/Statistics.h b/libgalois/include/galois/runtime/Statistics.h index 253bfcad01..5c6df094cb 100644 --- a/libgalois/include/galois/runtime/Statistics.h +++ b/libgalois/include/galois/runtime/Statistics.h @@ -28,6 +28,8 @@ #include #include +// added her to get rid of annoying int log deprecation in boost 1.69 +#define BOOST_ALLOW_DEPRECATED_HEADERS #include // uuid class #include // generators #include // streaming operators etc. 
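
The warning fix above relies on BOOST_ALLOW_DEPRECATED_HEADERS being defined before the first Boost header is pulled in. A stripped-down, standalone illustration of that ordering (assuming the includes behind the "uuid class" / "generators" / "streaming operators" comments are the boost::uuid headers; this is not the actual contents of Statistics.h):

// Hedged sketch: shows only the define-before-include ordering requirement.
#define BOOST_ALLOW_DEPRECATED_HEADERS    // silence deprecated-header warnings (e.g. integer_log2.hpp) in Boost 1.69
#include <boost/uuid/uuid.hpp>            // uuid class
#include <boost/uuid/uuid_generators.hpp> // generators
#include <boost/uuid/uuid_io.hpp>         // streaming operators

int main() {
  boost::uuids::uuid id = boost::uuids::random_generator()();
  return id.is_nil() ? 1 : 0;
}
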
From 532d41fac386a851e7e62f50385a7eca8e9b19c9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 11:41:08 -0500 Subject: [PATCH 253/660] cmake changes (indentation fixes) --- libdeepgalois/CMakeLists.txt | 84 ++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 41 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 23a0b44ed7..d57962c185 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -70,48 +70,50 @@ else() endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") + if(NOT ENABLE_HETERO_GALOIS) -if(ENABLE_DIST_GALOIS) -# do not link regular context.cpp; TODO do this conditional in cleaner way -# also don't link sampler -set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/layers/layer.cpp - src/DistContext.cpp - src/optimizer.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp -) -else() -set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/layers/layer.cpp - src/optimizer.cpp - src/context.cpp - src/sampler.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp -) + if(ENABLE_DIST_GALOIS) + # do not link regular context.cpp; TODO do this conditional in cleaner way + # also don't link sampler + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/layers/layer.cpp + src/DistContext.cpp + src/optimizer.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/net.cpp + ) + else() + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/layers/layer.cpp + src/optimizer.cpp + src/context.cpp + src/sampler.cpp + src/reader.cpp + src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/net.cpp + ) + endif() endif() add_library(dg_cpu STATIC ${sources}) From 2c90c4934731b0240278f45a5b317c0bc07fe7c8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 11:50:26 -0500 Subject: [PATCH 254/660] deepgalois cmake if closure fix --- libdeepgalois/CMakeLists.txt | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d57962c185..5a732ccf93 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -113,8 +113,8 @@ if(NOT ENABLE_HETERO_GALOIS) src/node.cpp src/net.cpp ) - endif() -endif() + endif(ENABLE_DIST_GALOIS) +endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) @@ -148,4 +148,3 @@ set_target_properties(dg_cpu PROPERTIES INTERFACE_POSITION_INDEPENDENT_CODE On POSITION_INDEPENDENT_CODE On ) -endif() From 390ab47f90f003fb66853ff1027e63ca361b0181 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 
12:27:28 -0500 Subject: [PATCH 255/660] update gtype --- libdeepgalois/include/deepgalois/gtypes.h | 11 +++++++++-- libdeepgalois/src/context.cu | 10 ++++++---- lonestar/gnn/gcn/CMakeLists.txt | 6 +++--- lonestar/gnn/include/lonestargnn.h | 2 -- 4 files changed, 18 insertions(+), 11 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index d12ac8e0d1..c011ad4537 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,4 +1,5 @@ #pragma once +#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -10,10 +11,13 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else -//#include "graph_gpu.h" +#ifdef USE_CSRGRAPH +#include "graph_gpu.h" +#else #include "deepgalois/lgraph.h" #endif #endif +#endif #ifndef GALOIS_USE_DIST @@ -31,9 +35,12 @@ typedef index_t edge_iterator; //typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; #else -//typedef CSRGraph GraphGPU; +#ifdef USE_CSRGRAPH +typedef CSRGraph GraphGPU; +#else typedef LearningGraph GraphGPU; #endif +#endif } #else diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 528c34b7e5..bbaddf0e99 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -128,17 +128,19 @@ void Context::SetDevice(const int device_id) { */ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; - /*GraphGPU g; - graph.read(filename.c_str(), false); +#ifdef USE_CSRGRAPH + GraphGPU g; + g.read(filename.c_str(), false); if (selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } g.copy_to_gpu(graph_gpu); - n = graph_gpu.nnodes; - */ +#else graph_gpu.readGraphFromGRFile(filename); graph_gpu.copy_to_gpu(); +#endif + n = graph_gpu.size(); return n; } diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 80daca4c78..ca8dcaa83e 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,6 +1,6 @@ -if(ENABLE_HETERO_GALOIS) - set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) -endif() +#if(ENABLE_HETERO_GALOIS) +# set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) +#endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index a72668daab..d0255b9368 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -109,6 +109,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "deepgalois/types.h" -#include "deepgalois/utils.h" #include "deepgalois/net.h" From 4f07d348430c6d5961dd21be44e7863fb8c9d3c5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 May 2020 15:56:56 -0500 Subject: [PATCH 256/660] TODO something needs to be included as part of dg_cpu, dg_gpu doesn't build either --- libdeepgalois/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 5a732ccf93..6c694dc038 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -114,6 +114,12 @@ if(NOT ENABLE_HETERO_GALOIS) src/net.cpp ) endif(ENABLE_DIST_GALOIS) +else() + # dummy sources set for dg_cpu for HETERO build + # TODO fix this + set(sources + src/net.cpp + ) endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC 
${sources}) From 3ace80fce5bc0eb6b02ef90d2d2956dcf0b51c71 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 18:09:14 -0500 Subject: [PATCH 257/660] fix gpu compilation --- libdeepgalois/CMakeLists.txt | 6 +- libdeepgalois/include/deepgalois/gtypes.h | 5 +- .../include/deepgalois/layers/l2_norm_layer.h | 8 +- .../include/deepgalois/layers/layer.h | 9 +- libdeepgalois/include/deepgalois/lgraph.h | 32 ++-- libdeepgalois/include/deepgalois/reader.h | 4 +- libdeepgalois/src/context.cpp | 6 +- libdeepgalois/src/context.cu | 4 +- libdeepgalois/src/layers/l2_norm_layer.cpp | 10 -- libdeepgalois/src/layers/layer.cpp | 12 -- libdeepgalois/src/lgraph.cpp | 142 +----------------- libdeepgalois/src/lgraph.cu | 8 + libdeepgalois/src/net.cu | 2 + libdeepgalois/src/optimizer.cu | 1 + libdeepgalois/src/reader.cpp | 107 +++++++++++++ lonestar/gnn/gcn/CMakeLists.txt | 2 +- 16 files changed, 169 insertions(+), 189 deletions(-) delete mode 100644 libdeepgalois/src/layers/layer.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 6c694dc038..41e5130818 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -84,7 +84,6 @@ if(NOT ENABLE_HETERO_GALOIS) src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp - src/layers/layer.cpp src/DistContext.cpp src/optimizer.cpp src/reader.cpp @@ -103,7 +102,6 @@ if(NOT ENABLE_HETERO_GALOIS) src/layers/relu_layer.cpp src/layers/aggregator.cpp src/math_functions.cpp - src/layers/layer.cpp src/optimizer.cpp src/context.cpp src/sampler.cpp @@ -117,9 +115,7 @@ if(NOT ENABLE_HETERO_GALOIS) else() # dummy sources set for dg_cpu for HETERO build # TODO fix this - set(sources - src/net.cpp - ) + set(sources src/reader.cpp) endif(NOT ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index c011ad4537..9b85007d28 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,5 @@ #pragma once -#define USE_CSRGRAPH +//#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -23,7 +23,6 @@ namespace deepgalois { typedef index_t edge_iterator; -#ifdef CPU_ONLY //#ifdef EDGE_LABEL //typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; @@ -34,13 +33,11 @@ typedef index_t edge_iterator; //typedef LCGraph Graph; //typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; -#else #ifdef USE_CSRGRAPH typedef CSRGraph GraphGPU; #else typedef LearningGraph GraphGPU; #endif -#endif } #else diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h index b15c1ae671..29e29f3474 100644 --- a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -5,7 +5,12 @@ namespace deepgalois { // L2 Normalization Layer class l2_norm_layer : public layer { public: - l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims); + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + assert(input_dims[0] == output_dims[0]); // num_vertices + trainable_ = false; + name_ = layer_type() + "_" + std::to_string(level); + } l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : l2_norm_layer(level, 1e-12, 20, in_dims, 
out_dims) {} ~l2_norm_layer() {} @@ -17,4 +22,5 @@ class l2_norm_layer : public layer { float_t epsilon_; float_t scale_; }; + } // namespace diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index cebef58059..c0f03aafd3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -8,7 +8,7 @@ * All rights reserved. * Reused/revised under 3-BSD */ - +#include #include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" @@ -53,9 +53,12 @@ class layer : public deepgalois::node { output_dims(out_dims), labels(NULL) { } virtual ~layer() = default; virtual std::string layer_type() const = 0; - void print_layer_info(); //! debug print function virtual void malloc_and_init() {} - + void print_layer_info() { //! debug print function + std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" + << input_dims[0] << "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; + } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } virtual acc_t get_weight_decay_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 77d48d87a6..eb53a4d930 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -2,6 +2,12 @@ #include "deepgalois/types.h" #include +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + namespace deepgalois { class LearningGraph { @@ -32,8 +38,6 @@ class LearningGraph { LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } - void readGraph(std::string path, std::string dataset); - void readGraphFromGRFile(const std::string& filename); size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } index_t get_degree(index_t vid) { return degrees_[vid]; } @@ -46,12 +50,20 @@ class LearningGraph { void copy_to_gpu(); void dealloc(); void degree_counting(); - void allocateFrom(index_t nv, index_t ne); void constructNodes(); void fixEndEdge(index_t vid, index_t row_end); void constructEdge(index_t eid, index_t dst, edata_t edata); void add_selfloop(); - + void readGraph(std::string dataset); + void allocateFrom(index_t nv, index_t ne) { + //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + num_vertices_ = nv; + num_edges_ = ne; + rowptr_.resize(num_vertices_+1); + colidx_.resize(num_edges_); + degrees_.resize(num_vertices_); + rowptr_[0] = 0; + } bool isLocal(index_t vid); index_t getLID(index_t vid); bool is_vertex_cut(); @@ -73,12 +85,12 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - __device__ index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } - __device__ index_t edge_begin(index_t src) { return d_rowptr_[src]; } - __device__ index_t edge_end(index_t src) { return d_rowptr_[src+1]; } - __device__ vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - __device__ index_t getDegree(index_t vid) { return d_degrees_[vid]; } - __device__ index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + 
CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 090ec817f8..e25124cbfd 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,11 +1,12 @@ #pragma once -#include "deepgalois/types.h" +#include "deepgalois/gtypes.h" namespace deepgalois { class Reader { private: std::string dataset_str; + void progressPrint(unsigned maxii, unsigned ii); public: Reader() : dataset_str("") {} Reader(std::string dataset) : dataset_str(dataset) {} @@ -13,6 +14,7 @@ class Reader { size_t read_labels(bool is_single_class, label_t*& labels); size_t read_features(float_t*& feats, std::string filetype = "bin"); size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); + void readGraphFromGRFile(Graph *g); }; } diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index bfa006a1d7..757279ceba 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -69,7 +69,7 @@ size_t Context::read_graph(bool selfloop) { printf("Reading .el file: %s\n", filename.c_str()); read_edgelist(filename.c_str(), true); // symmetrize } else if (filetype == "bin") { - graph_cpu->readGraphFromGRFile(filename); + graph_cpu->readGraph(dataset); } else if (filetype == "gr") { graph_cpu = new Graph(); std::string filename = path + dataset + ".csgr"; @@ -77,11 +77,11 @@ size_t Context::read_graph(bool selfloop) { if (selfloop) { Graph graph_temp; //galois::graphs::readGraph(graph_temp, filename); - graph_temp.readGraphFromGRFile(filename); + graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; //} else galois::graphs::readGraph(*graph_cpu, filename); - } else graph_cpu->readGraphFromGRFile(filename); + } else graph_cpu->readGraph(dataset); // TODO dist version of self loop } else { printf("Unkown file format\n"); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index bbaddf0e99..17356c4423 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -127,8 +127,8 @@ void Context::SetDevice(const int device_id) { } */ size_t Context::read_graph(bool selfloop) { - std::string filename = path + dataset + ".csgr"; #ifdef USE_CSRGRAPH + std::string filename = path + dataset + ".csgr"; GraphGPU g; g.read(filename.c_str(), false); if (selfloop) { @@ -137,7 +137,7 @@ size_t Context::read_graph(bool selfloop) { } g.copy_to_gpu(graph_gpu); #else - graph_gpu.readGraphFromGRFile(filename); + graph_gpu.readGraph(dataset); graph_gpu.copy_to_gpu(); #endif n = graph_gpu.size(); diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index f1cb6a4445..864eaeb321 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -4,15 +4,6 @@ namespace deepgalois { -l2_norm_layer::l2_norm_layer(unsigned level, float_t eps, float_t scale, - dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { - assert(input_dims[0] 
== output_dims[0]); // num_vertices - trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); -} - -#ifdef CPU_ONLY void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; @@ -51,6 +42,5 @@ void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, } }, galois::loopname("d_l2_norm")); } -#endif } // namespace diff --git a/libdeepgalois/src/layers/layer.cpp b/libdeepgalois/src/layers/layer.cpp deleted file mode 100644 index 6abb1ffb6a..0000000000 --- a/libdeepgalois/src/layers/layer.cpp +++ /dev/null @@ -1,12 +0,0 @@ -#include "deepgalois/layers/layer.h" -#include "galois/Galois.h" - -namespace deepgalois { - -void layer::print_layer_info() { - galois::gPrint("Layer", level_, " type: ", layer_type(), " input[", - input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); -} - -} diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 6531034794..76187f302f 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -1,12 +1,7 @@ #include "deepgalois/lgraph.h" #include "deepgalois/utils.h" +#include "deepgalois/reader.h" #include "galois/Galois.h" -#include -#include -#include -#include /* For O_RDWR */ -#include /* For open(), creat() */ -#include #include #include @@ -26,43 +21,12 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::progressPrint(unsigned maxii, unsigned ii) { - const unsigned nsteps = 10; - unsigned ineachstep = (maxii / nsteps); - if(ineachstep == 0) ineachstep = 1; - if (ii % ineachstep == 0) { - int progress = ((size_t) ii * 100) / maxii + 1; - printf("\t%3d%%\r", progress); - fflush(stdout); - } +void LearningGraph::constructNodes() { } -void LearningGraph::allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); -/* - if (num_vertices_ != nv) { - if (rowptr_ != NULL) delete [] rowptr_; - if (degrees_ != NULL) delete [] degrees_; - if (vertex_data_ != NULL) delete [] vertex_data_; - num_vertices_ = nv; - } - if (num_edges_ != ne) { - if (colidx_ != NULL) delete [] colidx_; - if (edge_data_ != NULL) delete [] edge_data_; - num_edges_ = ne; - } - if (rowptr_ == NULL) rowptr_ = new index_t[num_vertices_+1]; - if (colidx_ == NULL) colidx_ = new index_t[num_edges_]; -*/ - num_vertices_ = nv; - num_edges_ = ne; - rowptr_.resize(num_vertices_+1); - colidx_.resize(num_edges_); - degrees_.resize(num_vertices_); - rowptr_[0] = 0; -} - -void LearningGraph::constructNodes() { +void LearningGraph::readGraph(std::string dataset) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); } void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { @@ -118,102 +82,6 @@ void LearningGraph::add_selfloop() { //print_neighbors(0); } -void LearningGraph::readGraph(std::string path, std::string dataset) { - std::string filename = path + dataset + ".csgr"; -} - -void LearningGraph::readGraphFromGRFile(const std::string& filename) { - std::ifstream ifs; - ifs.open(filename); - int masterFD = open(filename.c_str(), O_RDONLY); - if (masterFD == -1) { - std::cout << "LearningGraph: unable to open" << filename << "\n"; - exit(1); - } - struct stat buf; - int f = fstat(masterFD, &buf); - if (f == -1) { - std::cout << "LearningGraph: unable to stat" << filename << "\n"; - exit(1); - } - size_t masterLength = buf.st_size; - int _MAP_BASE = 
MAP_PRIVATE; - void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); - if (m == MAP_FAILED) { - m = 0; - std::cout << "LearningGraph: mmap failed.\n"; - exit(1); - } - Timer t; - t.Start(); - - uint64_t* fptr = (uint64_t*)m; - __attribute__((unused)) uint64_t version = le64toh(*fptr++); - assert(version == 1); - uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t nv = le64toh(*fptr++); - uint64_t ne = le64toh(*fptr++); - uint64_t *outIdx = fptr; - fptr += nv; - uint32_t *fptr32 = (uint32_t*)fptr; - uint32_t *outs = fptr32; - fptr32 += ne; - if (ne % 2) fptr32 += 1; - num_vertices_ = nv; - num_edges_ = ne; - if (sizeEdgeTy != 0) { - std::cout << "LearningGraph: currently edge data not supported.\n"; - exit(1); - } - - printf("num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); - allocateFrom(nv, ne); - //degrees_ = new index_t[num_vertices_]; - //rowptr_ = new index_t[num_vertices_+1]; - //colidx_ = new index_t[num_edges_]; - //rowptr_[0] = 0; - for (unsigned ii = 0; ii < num_vertices_; ++ii) { - rowptr_[ii+1] = le64toh(outIdx[ii]); - degrees_[ii] = rowptr_[ii+1] - rowptr_[ii]; - for (unsigned jj = 0; jj < degrees_[ii]; ++jj) { - unsigned eid = rowptr_[ii] + jj; - unsigned dst = le32toh(outs[eid]); - if (dst >= num_vertices_) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); - exit(0); - } - colidx_[eid] = dst; - } - progressPrint(num_vertices_, ii); - } - ifs.close(); - -/* - std::string file_dims = path + dataset + "-dims.bin"; - std::string file_rowptr = path + dataset + "-rowptr.bin"; - std::string file_colidx = path + dataset + "-colidx.bin"; - index_t dims[2]; - ifs.open(file_dims, std::ios::binary|std::ios::in); - ifs.read((char*)dims, sizeof(index_t) * 2); - ifs.close(); - num_vertices_ = dims[0]; - num_edges_ = dims[1]; - degrees_ = new index_t[num_vertices_]; - rowptr_ = new index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - ifs.open(file_rowptr, std::ios::binary|std::ios::in); - ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); - ifs.close(); - ifs.open(file_colidx, std::ios::binary|std::ios::in); - ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); - ifs.close(); -*/ - t.Stop(); - double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength/1000.0/runtime << " MB/s)\n\n"; -} - #ifdef CPU_ONLY void LearningGraph::dealloc() { /* diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 3a379a649e..42280956d4 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -1,9 +1,15 @@ #include "deepgalois/lgraph.h" #include "deepgalois/cutils.h" +#include "deepgalois/reader.h" #include namespace deepgalois { +void LearningGraph::readGraph(std::string dataset) { + deepgalois::Reader reader(dataset); + reader.readGraphFromGRFile(this); +} + void LearningGraph::dealloc() { assert(is_device); CUDA_CHECK(cudaFree(d_colidx_)); @@ -40,4 +46,6 @@ void LearningGraph::copy_to_cpu() { //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } +void LearningGraph::degree_counting() {} + } diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 1a50c0c551..7d3e61bac8 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -162,6 +162,8 @@ void Net::regularize() { layers[layer_id]->get_grads_device_ptr()); } +void Net::normalize() {} + acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* 
masks, float_t* preds, label_t* ground_truth) { return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 6953a804c1..0fd16803fd 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -27,6 +27,7 @@ float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { return dE_[Index][key]; } +void adam::update(const vec_t& dW, vec_t& W) {} void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { //std::cout << "updating weights on GPU, n = " << n << "\n"; //print_device_vector(10, dW, "dW"); diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 2ea8134254..7497cb2887 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -1,6 +1,13 @@ #include "deepgalois/reader.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include +#include +#include +#include /* For O_RDWR */ +#include /* For open(), creat() */ +#include +#include namespace deepgalois { @@ -141,4 +148,104 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t return sample_count; } +void Reader::progressPrint(unsigned maxii, unsigned ii) { + const unsigned nsteps = 10; + unsigned ineachstep = (maxii / nsteps); + if(ineachstep == 0) ineachstep = 1; + if (ii % ineachstep == 0) { + int progress = ((size_t) ii * 100) / maxii + 1; + printf("\t%3d%%\r", progress); + fflush(stdout); + } +} + +void Reader::readGraphFromGRFile(Graph *g) { + std::string filename = path + dataset_str + ".csgr"; + std::ifstream ifs; + ifs.open(filename); + int masterFD = open(filename.c_str(), O_RDONLY); + if (masterFD == -1) { + std::cout << "LearningGraph: unable to open" << filename << "\n"; + exit(1); + } + struct stat buf; + int f = fstat(masterFD, &buf); + if (f == -1) { + std::cout << "LearningGraph: unable to stat" << filename << "\n"; + exit(1); + } + size_t masterLength = buf.st_size; + int _MAP_BASE = MAP_PRIVATE; + void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); + if (m == MAP_FAILED) { + m = 0; + std::cout << "LearningGraph: mmap failed.\n"; + exit(1); + } + Timer t; + t.Start(); + + uint64_t* fptr = (uint64_t*)m; + __attribute__((unused)) uint64_t version = le64toh(*fptr++); + assert(version == 1); + uint64_t sizeEdgeTy = le64toh(*fptr++); + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t *outIdx = fptr; + fptr += nv; + uint32_t *fptr32 = (uint32_t*)fptr; + uint32_t *outs = fptr32; + fptr32 += ne; + if (ne % 2) fptr32 += 1; + if (sizeEdgeTy != 0) { + std::cout << "LearningGraph: currently edge data not supported.\n"; + exit(1); + } + printf("num_vertices %lu, num_edges %lu.\n", nv, ne); + g->allocateFrom(nv, ne); + auto rowptr = g->row_start_ptr(); + auto colidx = g->edge_dst_ptr(); + auto degrees = g->degrees_ptr(); + for (unsigned ii = 0; ii < nv; ++ii) { + rowptr[ii+1] = le64toh(outIdx[ii]); + degrees[ii] = rowptr[ii+1] - rowptr[ii]; + for (unsigned jj = 0; jj < degrees[ii]; ++jj) { + unsigned eid = rowptr[ii] + jj; + unsigned dst = le32toh(outs[eid]); + if (dst >= nv) { + printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + exit(0); + } + colidx[eid] = dst; + } + progressPrint(nv, ii); + } + ifs.close(); + +/* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + 
ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); +*/ + t.Stop(); + double runtime = t.Millisecs(); + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength/1000.0/runtime << " MB/s)\n\n"; +} + } diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index ca8dcaa83e..fc5f134d76 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -7,7 +7,7 @@ target_link_libraries(gcn PRIVATE Galois::shmem lonestar) if(ENABLE_HETERO_GALOIS) set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) - target_link_libraries(gcn PRIVATE dg_gpu) + target_link_libraries(gcn PRIVATE dg_gpu dg_cpu) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gcn PRIVATE dg_cpu) From f043c193bb6f4b18a884f32a9744f6848dffef97 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 18:37:43 -0500 Subject: [PATCH 258/660] fix bug in reader --- libdeepgalois/include/deepgalois/lgraph.h | 51 +++++++++++++++++-- libdeepgalois/src/context.cu | 4 ++ libdeepgalois/src/lgraph.cpp | 62 +---------------------- libdeepgalois/src/reader.cpp | 28 +++++----- 4 files changed, 65 insertions(+), 80 deletions(-) diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index eb53a4d930..e9dd995f93 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -1,6 +1,7 @@ #pragma once #include "deepgalois/types.h" #include +#include #ifdef __CUDACC__ #define CUDA_HOSTDEV __host__ __device__ @@ -50,11 +51,12 @@ class LearningGraph { void copy_to_gpu(); void dealloc(); void degree_counting(); - void constructNodes(); - void fixEndEdge(index_t vid, index_t row_end); - void constructEdge(index_t eid, index_t dst, edata_t edata); - void add_selfloop(); + void constructNodes() {} + void readGraph(std::string dataset); + void fixEndEdge(index_t vid, index_t row_end) { + rowptr_[vid+1] = row_end; + } void allocateFrom(index_t nv, index_t ne) { //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); num_vertices_ = nv; @@ -64,6 +66,46 @@ class LearningGraph { degrees_.resize(num_vertices_); rowptr_[0] = 0; } + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { + assert(dst < num_vertices_); + assert(eid < num_edges_); + colidx_[eid] = dst; + if (edge_data_) edge_data_[eid] = edata; + } + void add_selfloop() { + //print_neighbors(nnodes-1); + //print_neighbors(0); + auto old_colidx_ = colidx_; + colidx_.resize(num_vertices_ + num_edges_); + for (index_t i = 0; i < num_vertices_; i++) { + auto start = rowptr_[i]; + auto end = rowptr_[i+1]; + bool selfloop_inserted = false; + if (start == end) { + colidx_[start+i] = i; + continue; + } + for (auto e = start; e != end; e++) { + auto dst = old_colidx_[e]; + if (!selfloop_inserted) { + if (i < dst) { + selfloop_inserted = true; + colidx_[e+i] = i; + colidx_[e+i+1] = dst; + } else if (e+1 
== end) { + selfloop_inserted = true; + colidx_[e+i+1] = i; + colidx_[e+i] = dst; + } else colidx_[e+i] = dst; + } else colidx_[e+i+1] = dst; + } + } + for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + num_edges_ += num_vertices_; + //print_neighbors(nnodes-1); + //print_neighbors(0); + } + bool isLocal(index_t vid); index_t getLID(index_t vid); bool is_vertex_cut(); @@ -71,6 +113,7 @@ class LearningGraph { uint64_t numMasters(); uint64_t globalSize(); + index_t* row_start_host_ptr() { return &rowptr_[0]; } #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 17356c4423..65b0e6304a 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -138,6 +138,10 @@ size_t Context::read_graph(bool selfloop) { g.copy_to_gpu(graph_gpu); #else graph_gpu.readGraph(dataset); + if (selfloop) { + graph_gpu.add_selfloop(); + is_selfloop_added = selfloop; + } graph_gpu.copy_to_gpu(); #endif n = graph_gpu.size(); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 76187f302f..26811280a1 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -3,7 +3,6 @@ #include "deepgalois/reader.h" #include "galois/Galois.h" #include -#include namespace deepgalois { @@ -21,25 +20,11 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::constructNodes() { -} - void LearningGraph::readGraph(std::string dataset) { deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } -void LearningGraph::fixEndEdge(index_t vid, index_t row_end) { - rowptr_[vid+1] = row_end; -} - -void LearningGraph::constructEdge(index_t eid, index_t dst, edata_t edata) { - assert(dst < num_vertices_); - assert(eid < num_edges_); - colidx_[eid] = dst; - if (edge_data_) edge_data_[eid] = edata; -} - void LearningGraph::degree_counting() { //if (degrees_ != NULL) return; //degrees_ = new index_t[num_vertices_]; @@ -48,51 +33,6 @@ void LearningGraph::degree_counting() { }, galois::loopname("DegreeCounting")); } -void LearningGraph::add_selfloop() { - //print_neighbors(nnodes-1); - //print_neighbors(0); - auto old_colidx_ = colidx_; - colidx_.resize(num_vertices_ + num_edges_); - for (index_t i = 0; i < num_vertices_; i++) { - auto start = rowptr_[i]; - auto end = rowptr_[i+1]; - bool selfloop_inserted = false; - if (start == end) { - colidx_[start+i] = i; - continue; - } - for (auto e = start; e != end; e++) { - auto dst = old_colidx_[e]; - if (!selfloop_inserted) { - if (i < dst) { - selfloop_inserted = true; - colidx_[e+i] = i; - colidx_[e+i+1] = dst; - } else if (e+1 == end) { - selfloop_inserted = true; - colidx_[e+i+1] = i; - colidx_[e+i] = dst; - } else colidx_[e+i] = dst; - } else colidx_[e+i+1] = dst; - } - } - for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; - num_edges_ += num_vertices_; - //print_neighbors(nnodes-1); - //print_neighbors(0); -} - -#ifdef CPU_ONLY -void LearningGraph::dealloc() { -/* - assert (!is_device); - if (rowptr_ != NULL) delete [] rowptr_; - if (colidx_ != NULL) delete [] colidx_; - if (degrees_ != NULL) delete [] degrees_; - if (vertex_data_ != NULL) delete [] vertex_data_; - if (edge_data_ != NULL) delete [] edge_data_; -//*/ -} -#endif +void LearningGraph::dealloc() {} } // end namespace diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 
7497cb2887..519e27496a 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -148,12 +148,12 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t return sample_count; } -void Reader::progressPrint(unsigned maxii, unsigned ii) { +void Reader::progressPrint(unsigned max, unsigned i) { const unsigned nsteps = 10; - unsigned ineachstep = (maxii / nsteps); + unsigned ineachstep = (max / nsteps); if(ineachstep == 0) ineachstep = 1; - if (ii % ineachstep == 0) { - int progress = ((size_t) ii * 100) / maxii + 1; + if (i % ineachstep == 0) { + int progress = ((size_t) i * 100) / max + 1; printf("\t%3d%%\r", progress); fflush(stdout); } @@ -203,22 +203,20 @@ void Reader::readGraphFromGRFile(Graph *g) { } printf("num_vertices %lu, num_edges %lu.\n", nv, ne); g->allocateFrom(nv, ne); - auto rowptr = g->row_start_ptr(); - auto colidx = g->edge_dst_ptr(); - auto degrees = g->degrees_ptr(); - for (unsigned ii = 0; ii < nv; ++ii) { - rowptr[ii+1] = le64toh(outIdx[ii]); - degrees[ii] = rowptr[ii+1] - rowptr[ii]; - for (unsigned jj = 0; jj < degrees[ii]; ++jj) { - unsigned eid = rowptr[ii] + jj; + auto rowptr = g->row_start_host_ptr(); + for (unsigned vid = 0; vid < nv; ++vid) { + g->fixEndEdge(vid, le64toh(outIdx[vid])); + auto degree = rowptr[vid+1] - rowptr[vid]; + for (unsigned jj = 0; jj < degree; ++jj) { + unsigned eid = rowptr[vid] + jj; unsigned dst = le32toh(outs[eid]); if (dst >= nv) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", ii, dst, jj, eid); + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, eid); exit(0); } - colidx[eid] = dst; + g->constructEdge(eid, dst); } - progressPrint(nv, ii); + progressPrint(nv, vid); } ifs.close(); From f7887794c70bcf842a719471251f2b61d50bbcf7 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 20:06:32 -0500 Subject: [PATCH 259/660] fix subgraph bug --- libdeepgalois/include/deepgalois/cutils.h | 7 +++++++ libdeepgalois/include/deepgalois/lgraph.h | 15 +++++++-------- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/layers/aggregator.cu | 12 +++++++++--- libdeepgalois/src/layers/graph_conv_layer.cu | 10 ++-------- libdeepgalois/src/lgraph.cu | 18 +++++++++++------- libdeepgalois/src/math_functions.cu | 4 ++-- libdeepgalois/src/net.cu | 5 ++--- lonestar/gnn/gcn/README.md | 10 +++++++--- 9 files changed, 48 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 383c9d6325..9466f55c53 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -180,3 +180,10 @@ inline void print_device_vector(size_t n, const float_t *d_x, std::string name = delete[] h_x; } +inline void print_device_int_vector(size_t n, const int *d_x, std::string name = "x") { + int *h_x = new int[n]; + CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); + for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + delete[] h_x; +} + diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e9dd995f93..21caec947d 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -34,7 +34,6 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - //rowptr_(NULL), colidx_(NULL), degrees_(NULL), vertex_data_(NULL), edge_data_(NULL) {} 
LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -58,7 +57,7 @@ class LearningGraph { rowptr_[vid+1] = row_end; } void allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices_=%d, num_edges_=%d.\n", num_vertices_, num_edges_); + //printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, num_edges_); num_vertices_ = nv; num_edges_ = ne; rowptr_.resize(num_vertices_+1); @@ -73,8 +72,6 @@ class LearningGraph { if (edge_data_) edge_data_[eid] = edata; } void add_selfloop() { - //print_neighbors(nnodes-1); - //print_neighbors(0); auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); for (index_t i = 0; i < num_vertices_; i++) { @@ -102,8 +99,7 @@ class LearningGraph { } for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; num_edges_ += num_vertices_; - //print_neighbors(nnodes-1); - //print_neighbors(0); + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, num_edges_); } bool isLocal(index_t vid); @@ -114,6 +110,7 @@ class LearningGraph { uint64_t globalSize(); index_t* row_start_host_ptr() { return &rowptr_[0]; } + index_t* edge_dst_host_ptr() { return &colidx_[0]; } #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } @@ -132,8 +129,10 @@ class LearningGraph { CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } - CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + //CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + //CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } index_t *row_start_ptr() { return d_rowptr_; } const index_t *row_start_ptr() const { return d_rowptr_; } index_t *edge_dst_ptr() { return d_colidx_; } diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 5bab5f12d2..9ad0ac7d15 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -36,7 +36,7 @@ class Net { neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), is_selfloop(selfloop) { + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); std::cout << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index fe3aca0182..e1bee86c47 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -74,11 +74,17 @@ void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); + std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << 
"\n"; + print_device_vector(10, norm_factor, "norm_factor"); float *temp; + const int *row_start = (const int*)g.row_start_ptr(); + const int *edge_dst = (const int*)g.edge_dst_ptr(); + printf("row_start_ptr: 0x%x\n", row_start); + printf("edge_dst_ptr: 0x%x\n", edge_dst); + print_device_int_vector(10, row_start, "row_start"); + print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (const int*)g.row_start_ptr(), (const int*)g.edge_dst_ptr(), in, 0.0, temp, out); + csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index e814631022..a1682847ad 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -9,24 +9,19 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); - //CUDA_CHECK(cudaMalloc((void**)&in_temp, x * y * sizeof(float_t))); float_malloc_device(x*y, in_temp); init_const_gpu(x*y, 0.0, in_temp); if (y <= z) { float_malloc_device(x*y, in_temp1); init_const_gpu(x*y, 0.0, in_temp1); } - //CUDA_CHECK(cudaMalloc((void**)&out_temp, x * z * sizeof(float_t))); float_malloc_device(x*z, out_temp); init_const_gpu(x*z, 0.0, out_temp); - //CUDA_CHECK(cudaMalloc((void**)&d_W, y * z * sizeof(float_t))); float_malloc_device(y*z, d_W); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) rng_uniform_gpu(y * z, -init_range, init_range, d_W); - //CUDA_CHECK(cudaMalloc((void**)&layer::d_weight_grad, y * z * sizeof(float_t))); float_malloc_device(y*z, layer::d_weight_grad); - //CUDA_CHECK(cudaMemset(layer::d_weight_grad, 0, y * z * sizeof(float_t))); init_const_gpu(y*z, 0.0, layer::d_weight_grad); } @@ -56,9 +51,9 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ size_t y = input_dims[1]; size_t z = output_dims[1]; + // currently only support feature length <= 128 if (z > MAX_NUM_CLASSES) { std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; - // currently only support feature length <= 128 exit(0); } init_const_gpu(x*z, 0.0, out_temp); @@ -76,8 +71,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_ } // GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, - const float_t* out_data, +void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 42280956d4..54e696c5ca 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -23,7 +23,7 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { if (d_colidx_ != NULL) return; CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); + //CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, 
num_edges_ * sizeof(edge_data___t))); //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); is_device = true; @@ -31,17 +31,21 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + printf("row_start_ptr: 0x%x\n", d_rowptr_); + printf("edge_dst_ptr: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); + //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu() { - CUDA_CHECK(cudaMemcpy(edge_dst_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(row_start_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); + //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 1f9c020676..2ef1e9a803 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -208,9 +208,9 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, // workspace memory (transpose_C) for this. 
void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, + const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { - //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 7d3e61bac8..98a5e82010 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -23,8 +23,7 @@ __global__ void masked_accuracy_kernel(int num_classes, int begin, float_t* preds, label_t* labels, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage - local_accuracy; + __shared__ cub::BlockReduce::TempStorage local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); @@ -72,7 +71,7 @@ __global__ void masked_f1_score_kernel(int num_classes, int begin, atomicAdd(&true_negtive[j], 1.0); } } - } + } } } diff --git a/lonestar/gnn/gcn/README.md b/lonestar/gnn/gcn/README.md index bae49e36a0..ba680b1f5e 100644 --- a/lonestar/gnn/gcn/README.md +++ b/lonestar/gnn/gcn/README.md @@ -26,12 +26,16 @@ BUILD RUN =========== +Datasets: +(1) single-class: cora citeseer pubmed flickr reddit +(2) multi-class: ppi yelp amazon + The following are a few example command lines. 
$ export OPENBLAS_NUM_THREADS=28 -$ ./gnn cora -t=1 -k=3 -$ ./gnn citeseer -t=3 -k=30 -$ ./gnn reddit -t=56 -k=3 +$ ./gnn cora -t=1 -k=30 +$ ./gnn reddit -t=56 -k=200 +$ ./gcn reddit -k=200 -t=56 -ss=9000 -dr=0.1 -h=128 -vi=20 PERFORMANCE =========== From 2272e8a7685564f4607d14f3021da404cd686ed2 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 6 May 2020 21:26:10 -0500 Subject: [PATCH 260/660] use CSRGraph --- libdeepgalois/include/deepgalois/gtypes.h | 3 +- libdeepgalois/include/deepgalois/lgraph.h | 1 + libdeepgalois/src/context.cu | 4 +- libdeepgalois/src/layers/aggregator.cu | 16 ++++---- libdeepgalois/src/lgraph.cu | 12 ++++-- libdeepgalois/src/math_functions.cu | 2 +- libgpu/include/graph_gpu.h | 46 +++++++++++++---------- 7 files changed, 49 insertions(+), 35 deletions(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index 9b85007d28..e06a9c3fe0 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,5 @@ #pragma once -//#define USE_CSRGRAPH +#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -12,6 +12,7 @@ #include "deepgalois/lgraph.h" #else #ifdef USE_CSRGRAPH +#include "deepgalois/lgraph.h" #include "graph_gpu.h" #else #include "deepgalois/lgraph.h" diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 21caec947d..d9e6e60d1d 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -142,6 +142,7 @@ class LearningGraph { vdata_t *vertex_data_ptr() { return d_vertex_data_; } //const vdata_t *vertex_data_ptr() const { return vertex_data_; } //const edata_t *edge_data_ptr() const { return edge_data; } + void print_test(); #endif }; diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 65b0e6304a..f7a76d2db0 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -45,9 +45,9 @@ __global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_ d_src = 1.0 / sqrt(d_src); auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); - for (index_t e = start; e != end; e++) { + for (index_t e = start; e != end; e++) { index_t dst = graph.getEdgeDst(e); - if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + //if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index e1bee86c47..158b1c2b4d 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -72,19 +72,21 @@ void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { + //g.print_test(); unsigned n = g.size(); + auto nnz = g.sizeEdges(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - std::cout << "[debug]: update_all on GPU, n=" << n << ", len=" << len << "\n"; - print_device_vector(10, norm_factor, "norm_factor"); + //std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " nnz " << nnz << "\n"; + //print_device_vector(10, norm_factor, "norm_factor"); float *temp; const int *row_start = (const int*)g.row_start_ptr(); const int *edge_dst = (const int*)g.edge_dst_ptr(); 
- printf("row_start_ptr: 0x%x\n", row_start); - printf("edge_dst_ptr: 0x%x\n", edge_dst); - print_device_int_vector(10, row_start, "row_start"); - print_device_int_vector(10, edge_dst, "edge_dst"); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); + //print_device_int_vector(10, row_start, "row_start"); + //print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 54e696c5ca..2c630ca7ae 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -29,14 +29,18 @@ void LearningGraph::allocOnDevice(bool no_edge_data__) { is_device = true; } +void LearningGraph::print_test() { + printf("d_rowptr_: 0x%x\n", d_rowptr_); + printf("d_colidx_: 0x%x\n", d_colidx_); + print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); + print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); +} + void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); - printf("row_start_ptr: 0x%x\n", d_rowptr_); - printf("edge_dst_ptr: 0x%x\n", d_colidx_); - print_device_int_vector(10, (const int*)d_rowptr_, "row_start"); - print_device_int_vector(10, (const int*)d_colidx_, "edge_dst"); + print_test(); //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 2ef1e9a803..449b597621 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -210,7 +210,7 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { - std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index f456c367dc..b47ed326b1 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -17,6 +17,12 @@ #include #include +#ifdef __CUDACC__ +#define CUDA_HOSTDEV __host__ __device__ +#else +#define CUDA_HOSTDEV +#endif + // Adapted from LSG CSRGraph.h // TODO: make this template data @@ -42,20 +48,20 @@ struct CSRGraph { unsigned deallocOnDevice(); void dealloc(); - __device__ __host__ bool valid_node(index_type node) { + CUDA_HOSTDEV bool valid_node(index_type node) 
{ return (node < nnodes); } - __device__ __host__ bool valid_edge(index_type edge) { + CUDA_HOSTDEV bool valid_edge(index_type edge) { return (edge < nedges); } - __device__ __host__ index_type getOutDegree(unsigned src) { + CUDA_HOSTDEV index_type getOutDegree(unsigned src) { assert(src < nnodes); return row_start[src + 1] - row_start[src]; }; - __device__ __host__ index_type getDestination(unsigned src, unsigned edge) { + CUDA_HOSTDEV index_type getDestination(unsigned src, unsigned edge) { assert(src < nnodes); assert(edge < getOutDegree(src)); @@ -65,18 +71,18 @@ struct CSRGraph { return edge_dst[abs_edge]; }; - __device__ __host__ index_type getAbsDestination(unsigned abs_edge) { + CUDA_HOSTDEV index_type getAbsDestination(unsigned abs_edge) { assert(abs_edge < nedges); return edge_dst[abs_edge]; }; - __device__ __host__ index_type getFirstEdge(unsigned src) { + CUDA_HOSTDEV index_type getFirstEdge(unsigned src) { assert(src <= nnodes); // <= is okay return row_start[src]; }; - __device__ __host__ edge_data_type getWeight(unsigned src, unsigned edge) { + CUDA_HOSTDEV edge_data_type getWeight(unsigned src, unsigned edge) { assert(src < nnodes); assert(edge < getOutDegree(src)); @@ -86,7 +92,7 @@ struct CSRGraph { return edge_data[abs_edge]; }; - __device__ __host__ edge_data_type getAbsWeight(unsigned abs_edge) { + CUDA_HOSTDEV edge_data_type getAbsWeight(unsigned abs_edge) { assert(abs_edge < nedges); return edge_data[abs_edge]; @@ -138,29 +144,29 @@ struct CSRGraph { //print_neighbors(0); } - __device__ __host__ index_type getEdgeDst(unsigned edge) { + CUDA_HOSTDEV index_type getEdgeDst(unsigned edge) { assert(edge < nedges); return edge_dst[edge]; }; - __device__ __host__ node_data_type getData(unsigned vid) { + CUDA_HOSTDEV node_data_type getData(unsigned vid) { return node_data[vid]; } - __device__ __host__ index_type edge_begin(unsigned src) { + CUDA_HOSTDEV index_type edge_begin(unsigned src) { assert(src <= nnodes); return row_start[src]; }; - __device__ __host__ index_type edge_end(unsigned src) { + CUDA_HOSTDEV index_type edge_end(unsigned src) { assert(src <= nnodes); return row_start[src+1]; }; - __device__ __host__ index_type *row_start_ptr() { return row_start; } - __device__ __host__ const index_type *row_start_ptr() const { return row_start; } - __device__ __host__ index_type *edge_dst_ptr() { return edge_dst; } - __device__ __host__ const index_type *edge_dst_ptr() const { return edge_dst; } - __device__ __host__ node_data_type *node_data_ptr() { return node_data; } - __device__ __host__ const node_data_type *node_data_ptr() const { return node_data; } - __device__ __host__ edge_data_type *edge_data_ptr() { return edge_data; } - __device__ __host__ const edge_data_type *edge_data_ptr() const { return edge_data; } + CUDA_HOSTDEV index_type *row_start_ptr() { return row_start; } + CUDA_HOSTDEV const index_type *row_start_ptr() const { return row_start; } + CUDA_HOSTDEV index_type *edge_dst_ptr() { return edge_dst; } + CUDA_HOSTDEV const index_type *edge_dst_ptr() const { return edge_dst; } + CUDA_HOSTDEV node_data_type *node_data_ptr() { return node_data; } + CUDA_HOSTDEV const node_data_type *node_data_ptr() const { return node_data; } + CUDA_HOSTDEV edge_data_type *edge_data_ptr() { return edge_data; } + CUDA_HOSTDEV const edge_data_type *edge_data_ptr() const { return edge_data; } size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } From 17caee78f48fe3d3a40fee0ee40a249b88635257 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 
2020 13:02:54 -0500 Subject: [PATCH 261/660] wip: HETERO not USE CPU, removing USE_DIST --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/context.h | 19 +++++++++---------- libdeepgalois/include/deepgalois/net.h | 4 ++++ lonestar/gnn/CMakeLists.txt | 4 ++-- lonestar/gnn/gcn/gcn.cpp | 12 ++++-------- 5 files changed, 20 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 7fce4a12d9..b110f0df89 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -34,7 +34,7 @@ class DistContext { ~DistContext(); //! save graph pointer to context object - void saveGraph(Graph* dGraph); + void saveDistGraph(Graph* dGraph); //! read labels of local nodes only size_t read_labels(std::string dataset_str); diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index f8b848f453..feacca3a4a 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,11 +7,9 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -//#ifdef CPU_ONLY #include "deepgalois/gtypes.h" -//#else -//#include "graph_gpu.h" -#ifndef CPU_ONLY + +#ifdef __GALOIS_HET_CUDA__ #include "deepgalois/cutils.h" #endif @@ -20,10 +18,11 @@ namespace deepgalois { class Context { public: Context(); - Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), - is_selfloop_added(false), use_subgraph(false), h_labels(NULL), h_feats(NULL), - d_labels(NULL), d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} - + Context(bool use_gpu) : + is_device(use_gpu), n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), use_subgraph(false), + h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} ~Context(); size_t read_graph(bool selfloop); @@ -47,7 +46,7 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t *masks); void createSubgraphs(int num_subgraphs); -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; void add_selfloop(Graph &og, Graph &g); @@ -100,7 +99,7 @@ class Context { void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); #else static cublasHandle_t cublas_handle_; // used to call cuBLAS diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 9ad0ac7d15..aa62339a2a 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -38,6 +38,7 @@ class Net { learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); + // TODO use galois print std::cout << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers << ", num_epochs " << num_epochs @@ -50,6 +51,9 @@ class Net { if (has_dense) num_layers ++; // initialize feature metadata feature_dims.resize(num_layers + 1); + + + #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_dataset(dataset_str); diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index d0551bdadc..1f5d35b5f1 
100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -13,8 +13,8 @@ if(USE_MKL_BLAS) endif() link_directories(${BLAS_LIB_DIR}) -if(NOT ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") +if(ENABLE_HETERO_GALOIS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") endif() if(ENABLE_DIST_GALOIS) diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index a8ab651603..97e1d71447 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -10,23 +10,19 @@ const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; int main(int argc, char** argv) { -#ifndef GALOIS_USE_DIST - galois::SharedMemSys G; -#else galois::DistMemSys G; -#endif LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train + + // the neural network to train: loads the entire graph on CPU deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); -#ifdef GALOIS_USE_DIST std::vector dummyVec; - deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); network.dist_init(dGraph, dataset); -#endif // read network, features, ground truth, initialize metadata // default setting for now; can be customized by the user From df2c358b72775023345475d2146143362c6dbfbb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 13:06:22 -0500 Subject: [PATCH 262/660] ran clang-format on all of deepgalois --- .../include/deepgalois/DistContext.h | 45 +- libdeepgalois/include/deepgalois/configs.h | 5 +- libdeepgalois/include/deepgalois/context.h | 104 ++-- libdeepgalois/include/deepgalois/cutils.h | 21 +- libdeepgalois/include/deepgalois/gtypes.h | 14 +- .../deepgalois/layers/GluonGradients.h | 79 +-- .../deepgalois/layers/GradientSyncStructs.h | 21 +- .../layers/GraphConvSyncStructures.h | 13 +- .../include/deepgalois/layers/aggregator.h | 12 +- .../deepgalois/layers/arithmetic_layer.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 36 +- .../include/deepgalois/layers/l2_norm_layer.h | 14 +- .../include/deepgalois/layers/layer.h | 49 +- .../deepgalois/layers/leaky_relu_layer.h | 10 +- .../include/deepgalois/layers/linear_layer.h | 2 +- .../include/deepgalois/layers/node.h | 9 +- .../include/deepgalois/layers/relu_layer.h | 6 +- .../deepgalois/layers/sigmoid_loss_layer.h | 6 +- .../deepgalois/layers/softmax_loss_layer.h | 6 +- libdeepgalois/include/deepgalois/lgraph.h | 115 ++-- .../include/deepgalois/math_functions.hh | 110 ++-- libdeepgalois/include/deepgalois/net.h | 287 +++++---- libdeepgalois/include/deepgalois/optimizer.h | 10 +- libdeepgalois/include/deepgalois/reader.h | 8 +- libdeepgalois/include/deepgalois/sampler.h | 41 +- libdeepgalois/include/deepgalois/types.h | 24 +- libdeepgalois/include/deepgalois/utils.h | 100 ++-- libdeepgalois/src/DistContext.cpp | 52 +- libdeepgalois/src/context.cpp | 140 +++-- libdeepgalois/src/context.cu | 97 +-- libdeepgalois/src/layers/aggregator.cpp | 87 +-- libdeepgalois/src/layers/aggregator.cu | 82 +-- libdeepgalois/src/layers/graph_conv_layer.cpp | 97 +-- libdeepgalois/src/layers/graph_conv_layer.cu | 89 +-- libdeepgalois/src/layers/l2_norm_layer.cpp | 69 ++- libdeepgalois/src/layers/l2_norm_layer.cu | 10 +- libdeepgalois/src/layers/leaky_relu_layer.cpp | 17 +- 
libdeepgalois/src/layers/leaky_relu_layer.cu | 12 +- libdeepgalois/src/layers/relu_layer.cpp | 5 +- libdeepgalois/src/layers/relu_layer.cu | 10 +- .../src/layers/sigmoid_loss_layer.cpp | 100 ++-- .../src/layers/sigmoid_loss_layer.cu | 14 +- .../src/layers/softmax_loss_layer.cpp | 91 +-- .../src/layers/softmax_loss_layer.cu | 14 +- libdeepgalois/src/lgraph.cpp | 15 +- libdeepgalois/src/lgraph.cu | 56 +- libdeepgalois/src/math_functions.cpp | 220 +++---- libdeepgalois/src/math_functions.cu | 563 ++++++++++-------- libdeepgalois/src/net.cpp | 107 ++-- libdeepgalois/src/net.cu | 107 ++-- libdeepgalois/src/node.cpp | 3 +- libdeepgalois/src/node.cu | 13 +- libdeepgalois/src/optimizer.cpp | 101 ++-- libdeepgalois/src/optimizer.cu | 20 +- libdeepgalois/src/reader.cpp | 111 ++-- libdeepgalois/src/sampler.cpp | 341 ++++++----- libdeepgalois/src/utils.cpp | 103 ++-- lonestar/gnn/gcn/gcn.cpp | 12 +- 58 files changed, 2172 insertions(+), 1735 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index b110f0df89..2f65360106 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -11,23 +11,24 @@ namespace deepgalois { class DistContext { protected: - size_t localVertices; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D + size_t localVertices; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - Graph* graph_cpu; // the input graph, |V| = N + Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; - label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - label_t *h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t *d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph + label_t* h_labels; // labels for classification. Single-class label: Nx1, + // multi-class label: NxE + label_t* h_labels_subg; // labels for subgraph + float_t* h_feats; // input features: N x D + float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -43,19 +44,19 @@ class DistContext { size_t read_features(std::string dataset_str); //! read masks of local nodes only - size_t read_masks(std::string dataset_str, std::string mask_type, - size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); //! 
find norm factor by looking at degree // TODO this is a distributed operation void norm_factor_computing(bool is_subgraph, int subg_id = 0); - //void createSubgraphs(int num_subgraphs) {} - //void gen_subgraph_labels(size_t m, const mask_t *masks) {} - //void gen_subgraph_feats(size_t m, const mask_t *masks) {} + // void createSubgraphs(int num_subgraphs) {} + // void gen_subgraph_labels(size_t m, const mask_t *masks) {} + // void gen_subgraph_feats(size_t m, const mask_t *masks) {} // TODO define these void createSubgraphs(int) {} - void gen_subgraph_labels(size_t, const mask_t *) {} - void gen_subgraph_feats(size_t, const mask_t *) {} + void gen_subgraph_labels(size_t, const mask_t*) {} + void gen_subgraph_feats(size_t, const mask_t*) {} float_t* get_norm_factors_ptr() { return norm_factors; } Graph* getGraphPointer() { return graph_cpu; } @@ -77,6 +78,6 @@ class DistContext { float_t* get_in_ptr(); }; -} // end deepgalois namespace +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h index 3de67ecb74..f21dff7fed 100644 --- a/libdeepgalois/include/deepgalois/configs.h +++ b/libdeepgalois/include/deepgalois/configs.h @@ -6,6 +6,7 @@ const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset #define NUM_DATASETS 8 -const std::string dataset_names[NUM_DATASETS] = {"cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; +const std::string dataset_names[NUM_DATASETS] = { + "cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index feacca3a4a..77c15ee890 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -18,38 +18,51 @@ namespace deepgalois { class Context { public: Context(); - Context(bool use_gpu) : - is_device(use_gpu), n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + Context(bool use_gpu) + : is_device(use_gpu), n(0), num_classes(0), feat_len(0), + is_single_class(true), is_selfloop_added(false), use_subgraph(false), + h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} ~Context(); size_t read_graph(bool selfloop); - size_t read_labels() { num_classes = reader.read_labels(is_single_class, h_labels); return num_classes; } - size_t read_features() { feat_len = reader.read_features(h_feats); return feat_len; } - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + size_t read_labels() { + num_classes = reader.read_labels(is_single_class, h_labels); + return num_classes; + } + size_t read_features() { + feat_len = reader.read_features(h_feats); + return feat_len; + } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } - label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label - //label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // multi-class label + label_t get_label(size_t i) { + return h_labels[i]; + } // single-class (one-hot) label + // label_t get_label(size_t i, size_t j) { return 
labels[i*num_classes+j]; } + // // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void gen_subgraph_labels(size_t m, const mask_t *masks); - void gen_subgraph_feats(size_t m, const mask_t *masks); + void gen_subgraph_labels(size_t m, const mask_t* masks); + void gen_subgraph_feats(size_t m, const mask_t* masks); void createSubgraphs(int num_subgraphs); #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; - void add_selfloop(Graph &og, Graph &g); + void add_selfloop(Graph& og, Graph& g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; @@ -68,45 +81,52 @@ class Context { label_t* get_labels_subg_ptr() { return d_labels_subg; } inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #endif protected: - std::string dataset; - bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label - bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t *h_labels; // labels for classification. Single-class label: Nx1, multi-class label: NxE - float_t* h_feats; // input features: N x D - //label_t *h_labels_subg; // labels for subgraph - //float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t *d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - //float_t* norm_factors_subg; // normalization constant for subgraph + std::string dataset; + bool is_device; // is this on device or host + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph + label_t* h_labels; // labels for classification. 
Single-class label: Nx1, + // multi-class label: NxE + float_t* h_feats; // input features: N x D + // label_t *h_labels_subg; // labels for subgraph + // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + // float_t* norm_factors_subg; // normalization constant for subgraph Reader reader; void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); #ifndef __GALOIS_HET_CUDA__ - void read_edgelist(const char* filename, bool symmetrize = false, bool add_self_loop = false); + void read_edgelist(const char* filename, bool symmetrize = false, + bool add_self_loop = false); #else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU #endif }; -} // end deepgalois namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/cutils.h b/libdeepgalois/include/deepgalois/cutils.h index 9466f55c53..4e4e9842b1 100644 --- a/libdeepgalois/include/deepgalois/cutils.h +++ b/libdeepgalois/include/deepgalois/cutils.h @@ -78,9 +78,9 @@ inline const char* cusparseGetErrorString(cusparseStatus_t error) { case CUSPARSE_STATUS_INTERNAL_ERROR: return "CUSPARSE_STATUS_INTERNAL_ERROR"; case CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED: - return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; + return "CUSPARSE_STATUS_MATRIX_TYPE_NOT_SUPPORTED"; case CUSPARSE_STATUS_ZERO_PIVOT: - return "CUSPARSE_STATUS_ZERO_PIVOT"; + return "CUSPARSE_STATUS_ZERO_PIVOT"; default: break; } @@ -173,17 +173,20 @@ inline const char* curandGetErrorString(curandStatus_t error) { // CUDA: check for error after kernel execution and exit loudly if there is one. 
#define CUDA_POST_KERNEL_CHECK CUDA_CHECK(cudaPeekAtLastError()) -inline void print_device_vector(size_t n, const float_t *d_x, std::string name = "x") { - float_t *h_x = new float_t[n]; +inline void print_device_vector(size_t n, const float_t* d_x, + std::string name = "x") { + float_t* h_x = new float_t[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(float_t), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; delete[] h_x; } -inline void print_device_int_vector(size_t n, const int *d_x, std::string name = "x") { - int *h_x = new int[n]; +inline void print_device_int_vector(size_t n, const int* d_x, + std::string name = "x") { + int* h_x = new int[n]; CUDA_CHECK(cudaMemcpy(h_x, d_x, n * sizeof(int), cudaMemcpyDeviceToHost)); - for (size_t i = 0; i < n; i ++) std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; + for (size_t i = 0; i < n; i++) + std::cout << name << "[" << i << "]=" << h_x[i] << "\n"; delete[] h_x; } - diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index e06a9c3fe0..ff4a6e4e46 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -25,21 +25,22 @@ namespace deepgalois { typedef index_t edge_iterator; //#ifdef EDGE_LABEL -//typedef galois::graphs::LC_CSR_Graph:: +// typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; //#else -//typedef galois::graphs::LC_CSR_Graph:: +// typedef galois::graphs::LC_CSR_Graph:: // with_numa_alloc::type ::with_no_lockable::type LCGraph; //#endif -//typedef LCGraph Graph; -//typedef Graph::edge_iterator edge_iterator; +// typedef LCGraph Graph; +// typedef Graph::edge_iterator edge_iterator; typedef LearningGraph Graph; #ifdef USE_CSRGRAPH typedef CSRGraph GraphGPU; #else typedef LearningGraph GraphGPU; #endif -} +} // namespace deepgalois #else @@ -47,7 +48,6 @@ namespace deepgalois { // TODO check if this needs changing typedef index_t edge_iterator; using Graph = galois::graphs::DistGraph; -} +} // namespace deepgalois #endif - diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index a7aa66d576..e14fe27bc8 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -38,6 +38,7 @@ class GluonGradients { std::vector> _mirrorNodes; //! nodes that are mirrors on this host std::vector> _mirrorRanges; + public: /** * Save weight gradients + number of them (i.e. size). 
@@ -45,7 +46,7 @@ class GluonGradients { */ GluonGradients(GradientVecType& gradients, size_t numWeights) : _gradients(gradients), _numWeights(numWeights) { - _myHost = galois::runtime::getSystemNetworkInterface().ID; + _myHost = galois::runtime::getSystemNetworkInterface().ID; _totalHosts = galois::runtime::getSystemNetworkInterface().Num; // allocate a vector for each host @@ -54,13 +55,13 @@ class GluonGradients { // loop through distribution of weights to hosts for (unsigned h = 0; h < _totalHosts; h++) { std::pair curRange = - galois::block_range((size_t)0, _numWeights, h, _totalHosts); + galois::block_range((size_t)0, _numWeights, h, _totalHosts); if (h != _myHost) { // setup mirrors for the host h which is just the list of IDs - size_t curW = curRange.first; + size_t curW = curRange.first; size_t lastW = curRange.second; - size_t numW = lastW - curW; + size_t numW = lastW - curW; // set mirrors for host h _mirrorNodes[h].reserve(numW); @@ -71,8 +72,8 @@ class GluonGradients { // these belong to this host; save, then mirror ranges can be // calculated from this _beginMaster = curRange.first; - _endMaster = curRange.second; - _numOwned = _endMaster - _beginMaster; + _endMaster = curRange.second; + _numOwned = _endMaster - _beginMaster; // first range is 0 to begin master if (_beginMaster > 0) { @@ -95,44 +96,28 @@ class GluonGradients { } //! Size is number of weights - size_t size() const { - return _numWeights; - } + size_t size() const { return _numWeights; } //! Global size is number of weights - size_t globalSize() const { - return _numWeights; - } + size_t globalSize() const { return _numWeights; } //! Return the weights owned by this host - size_t numMasters() const { - return _numOwned; - } + size_t numMasters() const { return _numOwned; } //! Return host ID - unsigned myHostID() const { - return _myHost; - } + unsigned myHostID() const { return _myHost; } //! Return num hosts in the system - unsigned numHosts() const { - return _totalHosts; - } + unsigned numHosts() const { return _totalHosts; } //! GID is same as LID since all hosts have all weights - uint32_t getGID(const uint32_t nodeID) const { - return nodeID; - } + uint32_t getGID(const uint32_t nodeID) const { return nodeID; } //! LID is same as GID since all hosts have all weights - uint32_t getLID(const uint32_t nodeID) const { - return nodeID; - } + uint32_t getLID(const uint32_t nodeID) const { return nodeID; } //! Return local weight w - GradientType& getData(uint32_t w) const { - return _gradients[w]; - } + GradientType& getData(uint32_t w) const { return _gradients[w]; } //! Return ranges for mirrors (unowned nodes) const std::vector>& getMirrorRanges() const { @@ -140,50 +125,34 @@ class GluonGradients { } //! Return mirror nodes for each host from this host's point of view - std::vector>& getMirrorNodes() { - return _mirrorNodes; - } + std::vector>& getMirrorNodes() { return _mirrorNodes; } //! clears the vector // TODO return to this when we start distributing on GPUs; wrapper // end probably shouldn't be managing this MAYBE - void deallocate() { - _gradients.clear(); - } + void deallocate() { _gradients.clear(); } // Essentially no-op functions follow //! no nodes with edges - size_t getNumNodesWithEdges() { - return 0; - } + size_t getNumNodesWithEdges() { return 0; } //! No edges; not a vertex cut - bool is_vertex_cut() const { - return false; - } + bool is_vertex_cut() const { return false; } //! 
no edges, return 0 - unsigned edge_begin(uint32_t) { - return 0; - } + unsigned edge_begin(uint32_t) { return 0; } //! no edges, return 0 - unsigned edge_end(uint32_t) { - return 0; - } + unsigned edge_end(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeDst(uint32_t) { - return 0; - } + unsigned getEdgeDst(uint32_t) { return 0; } //! no edges, return 0 - unsigned getEdgeData(uint32_t) { - return 0; - } + unsigned getEdgeData(uint32_t) { return 0; } }; -} +} // namespace deepgalois #endif // end header guard diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 1d26b87007..dd2f3de6a9 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -6,14 +6,12 @@ struct GradientSync { using ValTy = float_t; - static ValTy extract(uint32_t, float_t& weight) { - return weight; - } + static ValTy extract(uint32_t, float_t& weight) { return weight; } static bool reduce(uint32_t, float_t& weight, ValTy y) { // TODO merge function here // for now make sure the weights are close enough - //if (std::abs(weight - y) > 0.00001) { + // if (std::abs(weight - y) > 0.00001) { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} weight += y; @@ -21,22 +19,19 @@ struct GradientSync { } //! reset weight to 0 - static void reset(uint32_t, float_t &weight) { - weight = 0; - } + static void reset(uint32_t, float_t& weight) { weight = 0; } //! save weight - static void setVal(uint32_t, float_t &weight, ValTy y) { - weight = y; - } + static void setVal(uint32_t, float_t& weight, ValTy y) { weight = y; } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, - DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { @@ -46,5 +41,5 @@ struct GradientSync { }; // TODO bitset; might have to do it manually -//GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +// GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); #endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index e4874e468f..cb5a33e783 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -15,7 +15,7 @@ struct GraphConvSync { // copy the node's data to vector to serialize/send for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { vecToReturn[i] = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; } // move constructor should kick in here to avoid return copy return vecToReturn; @@ -27,14 +27,14 @@ struct GraphConvSync { assert(y.size() == deepgalois::_syncVectorSize); // loop and do addition for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += y[i]; + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i] += + y[i]; } 
return true; } //! do nothing (waste of a write) - static void reset(uint32_t, char&) { - } + static void reset(uint32_t, char&) {} //! element wise set static void setVal(uint32_t node_id, char&, ValTy y) { @@ -50,8 +50,9 @@ struct GraphConvSync { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, - DataCommMode*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 90c5781189..6e5e7a5926 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -7,16 +7,16 @@ namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); -void update_all_csrmm(size_t len, Graph& g, const float_t* in, - float_t* out, bool norm, float_t* norm_factor); -} +void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, + bool norm, float_t* norm_factor); +} // namespace deepgalois #else #include "deepgalois/gtypes.h" //#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); -void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, - float_t* out, bool norm, const float_t* norm_factor); -} +void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, + bool norm, const float_t* norm_factor); +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h index c28d0ed89c..e4b59e694f 100644 --- a/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h +++ b/libdeepgalois/include/deepgalois/layers/arithmetic_layer.h @@ -25,4 +25,4 @@ class elementwise_add_layer : public layer { in_grad = out_grad; } }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 56c0de0be6..09d4233c27 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -7,7 +7,7 @@ /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer - * convention + * convention * https://docs.dgl.ai/en/0.4.x/_modules/dgl/nn/pytorch/conv/graphconv.html * * Parameters @@ -26,11 +26,11 @@ namespace deepgalois { class graph_conv_layer : public layer { public: - graph_conv_layer(unsigned level, bool act, bool norm, bool bias, - bool dropout, float_t dropout_rate, - std::vector in_dims, std::vector out_dims) - : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), - dropout_(dropout), dropout_rate_(dropout_rate) { + graph_conv_layer(unsigned level, bool act, bool norm, bool bias, bool dropout, + float_t dropout_rate, std::vector in_dims, + std::vector out_dims) + : layer(level, in_dims, out_dims), act_(act), norm_(norm), bias_(bias), + dropout_(dropout), dropout_rate_(dropout_rate) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = 
true; name_ = layer_type() + "_" + std::to_string(level); @@ -39,16 +39,17 @@ class graph_conv_layer : public layer { } graph_conv_layer(unsigned level, std::vector in_dims, std::vector out_dims) - : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, out_dims) {} + : graph_conv_layer(level, false, true, false, true, 0.5, in_dims, + out_dims) {} ~graph_conv_layer() {} void malloc_and_init(); std::string layer_type() const override { return std::string("graph_conv"); } virtual acc_t get_weight_decay_loss(); - //! Uses weights contained in this layer to update in_data (results from previous) - //! and save result to out_data + //! Uses weights contained in this layer to update in_data (results from + //! previous) and save result to out_data virtual void forward_propagation(const float_t* in_data, float_t* out_data); - //! Uses gradients from layer after this one to update both own weight gradients - //! as well as gradients for the features (in_grad) + //! Uses gradients from layer after this one to update both own weight + //! gradients as well as gradients for the features (in_grad) virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function @@ -56,11 +57,13 @@ class graph_conv_layer : public layer { virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else - virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); + virtual void aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out); void d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out); #endif // user-defined combine function - virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out); + virtual void combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out); private: bool act_; // whether to use activation function at the end @@ -72,12 +75,13 @@ class graph_conv_layer : public layer { float_t* out_temp; //!< intermediate data temporary float_t* in_temp; float_t* in_temp1; - float_t* trans_data; // y*x + float_t* trans_data; // y*x mask_t* dropout_mask; // x*y // Glorot & Bengio (AISTATS 2010) - inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed=1); + inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, + unsigned seed = 1); inline void zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix); }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h index 29e29f3474..c7167700a2 100644 --- a/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h +++ b/libdeepgalois/include/deepgalois/layers/l2_norm_layer.h @@ -5,22 +5,24 @@ namespace deepgalois { // L2 Normalization Layer class l2_norm_layer : public layer { public: - l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { + l2_norm_layer(unsigned level, float_t eps, float_t scale, dims_t in_dims, + dims_t out_dims) + : layer(level, in_dims, out_dims), epsilon_(eps), scale_(scale) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = false; - name_ = layer_type() + "_" + std::to_string(level); + name_ = layer_type() + "_" + std::to_string(level); } 
- l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) : - l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} + l2_norm_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : l2_norm_layer(level, 1e-12, 20, in_dims, out_dims) {} ~l2_norm_layer() {} std::string layer_type() const override { return std::string("l2_norm"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + protected: float_t epsilon_; float_t scale_; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index c0f03aafd3..ec35c1d8c9 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -48,16 +48,15 @@ class layer : public deepgalois::node { layer(unsigned level, std::vector in_dims, std::vector out_dims) - : level_(level), begin_(0), - end_(0), num_dims(in_dims.size()), input_dims(in_dims), - output_dims(out_dims), labels(NULL) { } + : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), + input_dims(in_dims), output_dims(out_dims), labels(NULL) {} virtual ~layer() = default; virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } @@ -73,30 +72,35 @@ class layer : public deepgalois::node { // set methods void set_netphase(net_phase ctx) { phase_ = ctx; } void set_context(ContextType* ctx) { context = ctx; } - void set_trainable(bool trainable) { trainable_ = trainable; } // is this layer trainable? - void set_labels_ptr(label_t *ptr) { labels = ptr; } - void set_norm_consts_ptr(float_t *ptr) { norm_consts = ptr; } - void set_feats_ptr(float_t *ptr) { prev_->set_data(ptr); } + void set_trainable(bool trainable) { + trainable_ = trainable; + } // is this layer trainable? + void set_labels_ptr(label_t* ptr) { labels = ptr; } + void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } + void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata #ifdef CPU_ONLY - void set_graph_ptr(Graph *ptr) { graph_cpu = ptr; } + void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else - void set_graph_ptr(GraphGPU *ptr) { graph_gpu = ptr; } + void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } #endif - void update_dim_size(size_t g_size) { input_dims[0] = output_dims[0] = g_size; } + void update_dim_size(size_t g_size) { + input_dims[0] = output_dims[0] = g_size; + } //! set the data of the previous layer connected to this one void set_in_data(float_t* data) { - prev_ = std::make_shared(this, input_dims[0], input_dims[1]); + prev_ = + std::make_shared(this, input_dims[0], input_dims[1]); prev_->set_data(data); // no need to allocate memory for gradients, since this is the input layer. 
} virtual void set_sample_mask(size_t sample_begin, size_t sample_end, size_t sample_count, mask_t* masks) { - begin_ = sample_begin; - end_ = sample_end; - count_ = sample_count; + begin_ = sample_begin; + end_ = sample_end; + count_ = sample_count; use_mask = false; if (masks != NULL) { use_mask = true; @@ -110,7 +114,8 @@ class layer : public deepgalois::node { void add_edge() { // add an outgoing edge - next_ = std::make_shared(this, output_dims[0], output_dims[1]); + next_ = std::make_shared(this, output_dims[0], + output_dims[1]); // allocate memory for intermediate feature vectors and gradients next_->alloc(); } @@ -140,10 +145,11 @@ class layer : public deepgalois::node { #ifdef CPU_ONLY // parallelize only when target size is big enough to mitigate thread // spawning overhead. - //bool parallel = (W.size() >= 512); + // bool parallel = (W.size() >= 512); opt->update(layer::weight_grad, layer::W); // W += grad #else - opt->update_gpu(input_dims[1]*output_dims[1], d_weight_grad, d_W); // W += grad + opt->update_gpu(input_dims[1] * output_dims[1], d_weight_grad, + d_W); // W += grad #endif // prev()->clear_grads(); next()->clear_grads(); @@ -174,9 +180,9 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; #ifdef CPU_ONLY - Graph *graph_cpu; + Graph* graph_cpu; #else - GraphGPU *graph_gpu; + GraphGPU* graph_gpu; #endif #ifdef GALOIS_USE_DIST @@ -186,9 +192,8 @@ class layer : public deepgalois::node { #endif }; - //! Connects tail to head's edge and sets that edge's target to tail -//inline void connect(layer* head, layer* tail) { +// inline void connect(layer* head, layer* tail) { inline void connect(layer* head, layer* tail) { tail->prev_ = head->next_; tail->prev_->add_next_node(tail); diff --git a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h index a8b6136eea..2f43e0a228 100644 --- a/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/leaky_relu_layer.h @@ -5,16 +5,18 @@ namespace deepgalois { // Leaky ReLU Layer class leaky_relu_layer : public layer { public: - leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, dims_t out_dims); - leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) : - leaky_relu_layer(level, 0.0, in_dims, out_dims) {} + leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims); + leaky_relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) + : leaky_relu_layer(level, 0.0, in_dims, out_dims) {} ~leaky_relu_layer() {} std::string layer_type() const override { return std::string("leaky_relu"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); + protected: float_t epsilon_; size_t n; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/linear_layer.h b/libdeepgalois/include/deepgalois/layers/linear_layer.h index d68ae12479..ebcc774cc1 100644 --- a/libdeepgalois/include/deepgalois/layers/linear_layer.h +++ b/libdeepgalois/include/deepgalois/layers/linear_layer.h @@ -31,4 +31,4 @@ class linear_layer : public layer { protected: float_t scale_, bias_; }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/node.h b/libdeepgalois/include/deepgalois/layers/node.h index e8699d2498..11499bbede 100644 --- 
a/libdeepgalois/include/deepgalois/layers/node.h +++ b/libdeepgalois/include/deepgalois/layers/node.h @@ -26,15 +26,18 @@ typedef std::shared_ptr edgeptr_t; // edge class node : public std::enable_shared_from_this { public: - node() { prev_= NULL; next_ = NULL; } - //node(size_t in_size, size_t out_size) { + node() { + prev_ = NULL; + next_ = NULL; + } + // node(size_t in_size, size_t out_size) { //} //: prev_(in_size), next_(out_size) {} virtual ~node() {} const edgeptr_t prev() const { return prev_; } const edgeptr_t next() const { return next_; } protected: - //node() = delete; + // node() = delete; friend void connect(layer* head, layer* tail); mutable edgeptr_t prev_; mutable edgeptr_t next_; diff --git a/libdeepgalois/include/deepgalois/layers/relu_layer.h b/libdeepgalois/include/deepgalois/layers/relu_layer.h index 601c5d67ed..4e1c47ed77 100644 --- a/libdeepgalois/include/deepgalois/layers/relu_layer.h +++ b/libdeepgalois/include/deepgalois/layers/relu_layer.h @@ -6,11 +6,13 @@ namespace deepgalois { class relu_layer : public layer { public: relu_layer(unsigned level, dims_t in_dims, dims_t out_dims) - : layer(level, in_dims, out_dims) { trainable_ = false; } + : layer(level, in_dims, out_dims) { + trainable_ = false; + } ~relu_layer() {} std::string layer_type() const override { return std::string("relu"); } virtual void forward_propagation(const float_t* in_data, float_t* out_data); virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); }; -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h index c8b1241acc..be133995c0 100644 --- a/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/sigmoid_loss_layer.h @@ -7,7 +7,9 @@ class sigmoid_loss_layer : public layer { sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~sigmoid_loss_layer(); - std::string layer_type() const override { return std::string("sigmoid_loss"); } + std::string layer_type() const override { + return std::string("sigmoid_loss"); + } void malloc_and_init(); inline label_t get_label(size_t i, size_t j); virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -15,4 +17,4 @@ class sigmoid_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_prediction_loss(); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h index 43f07728cd..7ba096a2aa 100644 --- a/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h +++ b/libdeepgalois/include/deepgalois/layers/softmax_loss_layer.h @@ -7,7 +7,9 @@ class softmax_loss_layer : public layer { softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims); ~softmax_loss_layer(); - std::string layer_type() const override { return std::string("softmax_loss"); } + std::string layer_type() const override { + return std::string("softmax_loss"); + } void malloc_and_init(); inline label_t get_label(size_t i); virtual void forward_propagation(const float_t* in_data, float_t* out_data); @@ -15,4 +17,4 @@ class softmax_loss_layer : public layer { float_t* out_grad, float_t* in_grad); virtual acc_t get_prediction_loss(); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h 
b/libdeepgalois/include/deepgalois/lgraph.h index d9e6e60d1d..53382199f4 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -13,7 +13,7 @@ namespace deepgalois { class LearningGraph { typedef std::vector IndexList; - //typedef index_t* IndexList; + // typedef index_t* IndexList; protected: bool is_device; index_t num_vertices_; @@ -21,23 +21,27 @@ class LearningGraph { IndexList rowptr_; IndexList colidx_; IndexList degrees_; - vdata_t *vertex_data_; - edata_t *edge_data_; + vdata_t* vertex_data_; + edata_t* edge_data_; - index_t *d_rowptr_; - index_t *d_colidx_; - index_t *d_degrees_; - vdata_t *d_vertex_data_; - edata_t *d_edge_data_; + index_t* d_rowptr_; + index_t* d_colidx_; + index_t* d_degrees_; + vdata_t* d_vertex_data_; + edata_t* d_edge_data_; std::vector> mirrorNodes; public: typedef size_t iterator; - LearningGraph(bool use_gpu) : is_device(use_gpu), num_vertices_(0), num_edges_(0), - vertex_data_(NULL), edge_data_(NULL) {} + LearningGraph(bool use_gpu) + : is_device(use_gpu), num_vertices_(0), num_edges_(0), vertex_data_(NULL), + edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } - void init(index_t nv, index_t ne) { num_vertices_ = nv; num_edges_ = ne; } + void init(index_t nv, index_t ne) { + num_vertices_ = nv; + num_edges_ = ne; + } size_t size() { return (size_t)num_vertices_; } size_t sizeEdges() { return (size_t)num_edges_; } index_t get_degree(index_t vid) { return degrees_[vid]; } @@ -53,14 +57,13 @@ class LearningGraph { void constructNodes() {} void readGraph(std::string dataset); - void fixEndEdge(index_t vid, index_t row_end) { - rowptr_[vid+1] = row_end; - } + void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } void allocateFrom(index_t nv, index_t ne) { - //printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, num_edges_); + // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, + // num_edges_); num_vertices_ = nv; - num_edges_ = ne; - rowptr_.resize(num_vertices_+1); + num_edges_ = ne; + rowptr_.resize(num_vertices_ + 1); colidx_.resize(num_edges_); degrees_.resize(num_vertices_); rowptr_[0] = 0; @@ -69,37 +72,42 @@ class LearningGraph { assert(dst < num_vertices_); assert(eid < num_edges_); colidx_[eid] = dst; - if (edge_data_) edge_data_[eid] = edata; + if (edge_data_) + edge_data_[eid] = edata; } void add_selfloop() { auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); for (index_t i = 0; i < num_vertices_; i++) { - auto start = rowptr_[i]; - auto end = rowptr_[i+1]; + auto start = rowptr_[i]; + auto end = rowptr_[i + 1]; bool selfloop_inserted = false; if (start == end) { - colidx_[start+i] = i; + colidx_[start + i] = i; continue; } for (auto e = start; e != end; e++) { auto dst = old_colidx_[e]; if (!selfloop_inserted) { if (i < dst) { - selfloop_inserted = true; - colidx_[e+i] = i; - colidx_[e+i+1] = dst; - } else if (e+1 == end) { - selfloop_inserted = true; - colidx_[e+i+1] = i; - colidx_[e+i] = dst; - } else colidx_[e+i] = dst; - } else colidx_[e+i+1] = dst; + selfloop_inserted = true; + colidx_[e + i] = i; + colidx_[e + i + 1] = dst; + } else if (e + 1 == end) { + selfloop_inserted = true; + colidx_[e + i + 1] = i; + colidx_[e + i] = dst; + } else + colidx_[e + i] = dst; + } else + colidx_[e + i + 1] = dst; } } - for (index_t i = 0; i <= num_vertices_; i++) rowptr_[i] += i; + for (index_t i = 0; i <= num_vertices_; i++) + rowptr_[i] += i; num_edges_ += num_vertices_; - 
printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, num_edges_); + printf("Selfloop added: num_vertices %d num_edges %d\n", num_vertices_, + num_edges_); } bool isLocal(index_t vid); @@ -114,8 +122,8 @@ class LearningGraph { #ifdef CPU_ONLY index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } - index_t edge_end(index_t vid) { return rowptr_[vid+1]; } - vdata_t getData(index_t vid) { return vertex_data_[vid]; } + index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } + vdata_t getData(index_t vid) { return vertex_data_[vid]; } index_t getDegree(index_t vid) { return degrees_[vid]; } index_t* row_start_ptr() { return &rowptr_[0]; } const index_t* row_start_ptr() const { return &rowptr_[0]; } @@ -125,26 +133,29 @@ class LearningGraph { edata_t* edge_data_ptr() { return edge_data_; } vdata_t* vertex_data_ptr() { return vertex_data_; } #else - CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } - CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } - CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src+1]; } - CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } - //CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } - //CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } - CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } - CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_rowptr_[vid+1] - d_rowptr_[vid]; } - index_t *row_start_ptr() { return d_rowptr_; } - const index_t *row_start_ptr() const { return d_rowptr_; } - index_t *edge_dst_ptr() { return d_colidx_; } - const index_t *edge_dst_ptr() const { return d_colidx_; } + CUDA_HOSTDEV index_t getEdgeDst(index_t edge) { return d_colidx_[edge]; } + CUDA_HOSTDEV index_t edge_begin(index_t src) { return d_rowptr_[src]; } + CUDA_HOSTDEV index_t edge_end(index_t src) { return d_rowptr_[src + 1]; } + CUDA_HOSTDEV vdata_t getData(index_t vid) { return d_vertex_data_[vid]; } + // CUDA_HOSTDEV index_t getDegree(index_t vid) { return d_degrees_[vid]; } + // CUDA_HOSTDEV index_t getOutDegree(index_t vid) { return d_degrees_[vid]; } + CUDA_HOSTDEV index_t getDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + CUDA_HOSTDEV index_t getOutDegree(index_t vid) { + return d_rowptr_[vid + 1] - d_rowptr_[vid]; + } + index_t* row_start_ptr() { return d_rowptr_; } + const index_t* row_start_ptr() const { return d_rowptr_; } + index_t* edge_dst_ptr() { return d_colidx_; } + const index_t* edge_dst_ptr() const { return d_colidx_; } index_t* degrees_ptr() { return d_degrees_; } - edata_t *edge_data_ptr() { return d_edge_data_; } - vdata_t *vertex_data_ptr() { return d_vertex_data_; } - //const vdata_t *vertex_data_ptr() const { return vertex_data_; } - //const edata_t *edge_data_ptr() const { return edge_data; } + edata_t* edge_data_ptr() { return d_edge_data_; } + vdata_t* vertex_data_ptr() { return d_vertex_data_; } + // const vdata_t *vertex_data_ptr() const { return vertex_data_; } + // const edata_t *edge_data_ptr() const { return edge_data; } void print_test(); #endif - }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 6e7ac10fe2..89cc3d5d9c 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -28,13 +28,14 @@ void 
sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const float* A, const float* B, const float beta, float* C); // single-precision sparse matrix dense matrix multiply, C = A * B, A is sparse -void csrmm_cpu(const int M, const int N, const int K, const int nnz, - const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nonzero_idx, - const float* B, const float beta, float* C); +void csrmm_cpu(const int M, const int N, const int K, const int nnz, + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nonzero_idx, const float* B, const float beta, float* C); // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, - const float* A, const float* x, const float beta, float* y); +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y); //! add 2 arrays for n elements void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out); @@ -48,12 +49,13 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); float_t dot(size_t n, const float_t* x, const float_t* y); // SAXPY stands for โ€œSingle-precision A*X Plus Y" -void axpy(size_t n, const float_t a, float_t *x, float_t *y); +void axpy(size_t n, const float_t a, float_t* x, float_t* y); // Returns the index of the maximum value int argmax(const size_t n, const float_t* x); // the arguments of the maxima -//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) / 2 +//! Computes half the L2 norm of a tensor without the sqrt: output = sum(t ** 2) +//! / 2 float_t l2_norm(size_t n, const float_t* a); //! clear n elements of a vector @@ -63,10 +65,13 @@ void clear_cpu(size_t n, float_t* in); void copy_cpu(size_t len, const float_t* in, float_t* out); // dropout functions randomly remove weights -void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); +void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, + const float_t* in, mask_t* mask, float_t* out); -// dropout derivative: use existing dropouts in masks instead of generating them; -void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* mask, float_t* out); +// dropout derivative: use existing dropouts in masks instead of generating +// them; +void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, + mask_t* mask, float_t* out); //! 
ReLU = keep if positive; and ReLU derivative: 1 if data > 0, 0 otherwise void relu_cpu(size_t n, const float_t* in, float_t* out); @@ -74,11 +79,13 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); -void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); +void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + const float_t* data, float_t* out); // Loss function for single-class label (one-hot) data: softmax void softmax(size_t n, const float_t* input, float_t* output); -void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); +void d_softmax(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); // Cross entropy float_t cross_entropy(size_t n, const float_t* y, const float_t* p); @@ -86,56 +93,65 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d); // Loss function for multi-class label (one-hot) data: sigmoid void sigmoid(size_t n, const float_t* input, float_t* output); -void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, const float_t* dp); +void d_sigmoid(size_t n, const float_t* y, const float_t* p, float_t* dy, + const float_t* dp); // dropout functions randomly remove weights -void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, float_t* out); -void d_dropout(const float scale, const float_t* in, mask_t* mask, float_t* out); +void dropout(float scale, float dropout_rate, const float_t* in, mask_t* mask, + float_t* out); +void d_dropout(const float scale, const float_t* in, mask_t* mask, + float_t* out); //! transposes a matrix (malloc'd array) void transpose(size_t x, size_t y, const float_t* in, float_t* out); - -} // math -} // deepgalois + +} // namespace math +} // namespace deepgalois // GPU operators -bool isnan_gpu(int n, const float_t *array); // does array contain any 'nan' element -void init_const_gpu(int n, float_t value, float_t *array); +bool isnan_gpu(int n, + const float_t* array); // does array contain any 'nan' element +void init_const_gpu(int n, float_t value, float_t* array); void copy_gpu(int len, const float_t* in, float_t* out); -void vadd_gpu(const int n, const float_t* a, const float_t* b, float_t* out); // vector add -void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y); // axpy +void vadd_gpu(const int n, const float_t* a, const float_t* b, + float_t* out); // vector add +void axpy_gpu(const int n, const float_t a, const float_t* x, + float_t* y); // axpy void relu_gpu(const int n, const float_t* in, float_t* out); // ReLU void d_relu_gpu(const int n, const float_t* in_diff, const float_t* data, float_t* out_diff); // ReLU derivative -void leaky_relu_gpu(const int n, const float_t epsilon, - const float_t* in, float_t* out); // Leaky ReLU -void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, - const float_t* data, float_t* out_diff); // Leaky ReLU derivative -void dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out); // dropout -void d_dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, const mask_t* masks, float_t* out); // dropout derivative +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out); // Leaky ReLU +void d_leaky_relu_gpu(const int n, const float_t epsilon, + 
const float_t* in_diff, const float_t* data, + float_t* out_diff); // Leaky ReLU derivative +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out); // dropout +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out); // dropout derivative void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, const int M, const int N, const int K, const float alpha, const float* A, const float* B, const float beta, float* C); void matmul_gpu(const size_t x, const size_t y, const size_t z, - const float_t* A, const float_t* B, float_t* C); + const float_t* A, const float_t* B, float_t* C); void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, const float_t* A, const float_t* B, float_t* C); // matrix multiply -void csrmm_gpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nonzero_idx, - const float* B, const float beta, float* trans_C, float* C); -void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nonzero_idx, const float* B, const float beta, + float* trans_C, float* C); +void softmax_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); void d_softmax_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); -void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in_data, - const mask_t* masks, const label_t* labels, - float_t* loss, float_t* out_data); +void sigmoid_cross_entropy_gpu(int len, int begin, int end, + const float_t* in_data, const mask_t* masks, + const label_t* labels, float_t* loss, + float_t* out_data); void d_sigmoid_cross_entropy_gpu(int len, int bengin, int end, const mask_t* masks, const label_t* labels, const float_t* out_data, float_t* diff); @@ -146,9 +162,11 @@ bool is_allocated_device(float_t* data); void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); -void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t l2_norm_gpu(int n, const float_t *in); -void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t *in_diff, float_t *out_diff); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss); +acc_t l2_norm_gpu(int n, const float_t* in); +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index aa62339a2a..117de131b2 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -29,31 +29,29 @@ namespace deepgalois { class Net { public: 
Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) : - is_single_class(single), has_l2norm(l2norm), has_dense(dense), - neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv) + : is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { assert(n_conv > 0); // TODO use galois print std::cout << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers - << ", num_epochs " << num_epochs - << ", hidden1 " << hidden1 - << ", learning_rate " << learning_rate - << ", dropout_rate " << dropout_rate + << ", num_conv_layers " << num_conv_layers << ", num_epochs " + << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " + << learning_rate << ", dropout_rate " << dropout_rate << ", weight_decay " << weight_decay << "\n"; num_layers = num_conv_layers + 1; - if (has_l2norm) num_layers ++; - if (has_dense) num_layers ++; + if (has_l2norm) + num_layers++; + if (has_dense) + num_layers++; // initialize feature metadata feature_dims.resize(num_layers + 1); - - #ifndef GALOIS_USE_DIST context = new deepgalois::Context(); context->set_dataset(dataset_str); @@ -62,56 +60,60 @@ class Net { // read graph, get num nodes num_classes = context->read_labels(); - //std::cout << "Reading label masks ... "; + // std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks + num_samples, 0); + std::fill(val_masks, val_masks + num_samples, 0); // get training and validation sets if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; + train_end = train_begin + train_count; val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) val_masks[i] = 1; + for (size_t i = train_begin; i < train_end; i++) + train_masks[i] = 1; + for (size_t i = val_begin; i < val_end; i++) + val_masks[i] = 1; } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks); + train_count = context->read_masks("train", num_samples, train_begin, + train_end, train_masks); + val_count = context->read_masks("val", num_samples, val_begin, val_end, + val_masks); } if (subgraph_sample_size > train_count) { - std::cout << "FATAL: subgraph size can not be larger than the size of training set\n"; + std::cout << "FATAL: subgraph size can not be larger than the size of " + "training set\n"; exit(1); } feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers + 1] = + num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); context->set_use_subgraph(subgraph_sample_size > 0); init(); -#endif +#endif } - Net() : is_single_class(true), has_l2norm(false), has_dense(false), - neighbor_sample_size(0), subgraph_sample_size(0), - num_threads(1), num_samples(0), num_classes(0), - num_conv_layers(0), num_layers(0), num_epochs(0), - learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - train_begin(0), train_end(0), train_count(0), - val_begin(0), val_end(0), val_count(0), - test_begin(0), test_end(0), test_count(0), - val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - train_masks(NULL), val_masks(NULL), - test_masks(NULL), context(NULL) {} + Net() + : is_single_class(true), has_l2norm(false), has_dense(false), + neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), + num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), + train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), + val_count(0), test_begin(0), test_end(0), test_count(0), + val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + 
train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} void init(); #ifdef GALOIS_USE_DIST @@ -121,27 +123,28 @@ class Net { size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } - void normalize(); // Scale gradient to counterbalance accumulation + void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - std::string header = ""; + std::string header = ""; std::string seperator = " "; #ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; + header = "[" + std::to_string(myID) + "] "; + seperator = "\n"; #endif double total_train_time = 0.0; - int num_subg_remain = 0; + int num_subg_remain = 0; #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST if (subgraph_sample_size) { context->createSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples*num_subgraphs]; + subgraphs_masks = new mask_t[num_samples * num_subgraphs]; std::cout << "\nConstruct training vertex set induced graph...\n"; - sampler->set_masked_graph(train_begin, train_end, train_count, train_masks, context->getGraphPointer()); + sampler->set_masked_graph(train_begin, train_end, train_count, + train_masks, context->getGraphPointer()); } #endif #endif @@ -160,29 +163,34 @@ class Net { #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { - //galois::do_all(galois::iterate(size_t(0), size_t(num_subgraphs)),[&](const auto sid) { + // galois::do_all(galois::iterate(size_t(0), + // size_t(num_subgraphs)),[&](const auto sid) { unsigned tid = 0; - //tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), &subgraphs_masks[sid*num_samples], tid); - }//, galois::loopname("subgraph_gen")); + // tid = galois::substrate::ThreadPool::getTID(); + sampler->subgraph_sample(subgraph_sample_size, + *(context->getSubgraphPointer(sid)), + &subgraphs_masks[sid * num_samples], tid); + } //, galois::loopname("subgraph_gen")); #endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); - //std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } #ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); - //galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " num_e ", sg_ptr->sizeEdges(), "\n"); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif //GALOIS_USE_DIST +#endif // GALOIS_USE_DIST num_subg_remain--; - int sg_id = num_subg_remain; + int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); - num_vertices_sg = subgraph_ptr->size(); - //galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", num_edges: ", subgraph_ptr->sizeEdges(), "\n"); + num_vertices_sg = subgraph_ptr->size(); + // galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", + // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); context->norm_factor_computing(1, sg_id); @@ -191,12 +199,15 @@ class Net { layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); } // update labels for subgraph - 
context->gen_subgraph_labels(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[num_layers-1]->set_labels_ptr(context->get_labels_subg_ptr()); + context->gen_subgraph_labels(num_vertices_sg, + &subgraphs_masks[sg_id * num_samples]); + layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, &subgraphs_masks[sg_id*num_samples]); - layers[0]->set_feats_ptr(context->get_feats_subg_ptr()); // feed input data + context->gen_subgraph_feats(num_vertices_sg, + &subgraphs_masks[sg_id * num_samples]); + layers[0]->set_feats_ptr( + context->get_feats_subg_ptr()); // feed input data } // training steps @@ -204,12 +215,13 @@ class Net { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - // forward: after this phase, layer edges will contain intermediate features - // for use during backprop + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight gradients + // with feature gradients whcih are then used to calculate weight + // gradients Net::bprop(); // gradient update: use gradients stored on each layer to update model for @@ -218,8 +230,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss - << " train_acc " << train_acc << seperator; + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << seperator; t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; @@ -227,17 +239,19 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss - << " val_acc " << val_acc << seperator; - std::cout << header << "time " << std::setprecision(3) << std::fixed << epoch_time + val_time - << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << seperator; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; } else { - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; } } double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; + double throughput = 1000.0 * (double)num_epochs / total_train_time; std::cout << "\nAverage training time: " << avg_train_time << " ms. 
Throughput: " << throughput << " epoch/s\n"; } @@ -251,35 +265,37 @@ class Net { mask_t* masks = NULL; if (type == "train") { begin = train_begin; - end = train_end; + end = train_end; count = train_count; masks = train_masks; if (subgraph_sample_size) { // update masks for subgraph masks = NULL; begin = 0; - end = num_vertices_sg; + end = num_vertices_sg; count = num_vertices_sg; } } else if (type == "val") { begin = val_begin; - end = val_end; + end = val_end; count = val_count; masks = val_masks; } else { begin = test_begin; - end = test_end; + end = test_end; count = test_count; masks = test_masks; } #ifdef CPU_ONLY - if (subgraph_sample_size && type != "train") { // switch to the original graph - for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_samples); + if (subgraph_sample_size && + type != "train") { // switch to the original graph + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(num_samples); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getGraphPointer()); layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); } - layers[num_layers-1]->set_labels_ptr(context->get_labels_ptr()); + layers[num_layers - 1]->set_labels_ptr(context->get_labels_ptr()); layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data } #else @@ -291,7 +307,7 @@ class Net { masks = d_test_masks; } #endif - loss = fprop(begin, end, count, masks); + loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; if (type == "train" && subgraph_sample_size) { @@ -302,7 +318,8 @@ class Net { if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); + acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, + labels); } t_eval.Stop(); return t_eval.Millisecs(); @@ -316,9 +333,10 @@ class Net { test_count = 55703; test_end = test_begin + test_count; #ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) test_masks[i] = 1; + for (size_t i = test_begin; i < test_end; i++) + test_masks[i] = 1; #else - for (size_t i = test_begin; i < test_end; i++) { + for (size_t i = test_begin; i < test_end; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } @@ -326,9 +344,11 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks); + test_count = context->read_masks("test", num_samples, test_begin, + test_end, test_masks); #else - test_count = context->read_masks("test", num_samples, test_begin, test_end, test_masks, dGraph); + test_count = context->read_masks("test", num_samples, test_begin, + test_end, test_masks, dGraph); #endif } #ifndef CPU_ONLY @@ -340,14 +360,14 @@ class Net { void construct_layers() { // append conv layers std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers-1; i++) - append_conv_layer(i, true); // conv layers, act=true - append_conv_layer(num_conv_layers-1); // the last hidden layer, act=false + for (size_t i = 0; i < num_conv_layers - 1; i++) + append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false if (has_l2norm) - append_l2norm_layer(num_conv_layers); // l2_norm layer + append_l2norm_layer(num_conv_layers); // l2_norm layer if (has_dense) - append_dense_layer(num_layers-2); // 
dense layer - append_out_layer(num_layers-1); // output layer + append_dense_layer(num_layers - 2); // dense layer + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { @@ -380,11 +400,11 @@ class Net { void append_dense_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - //layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); + in_dims[0] = num_samples; + in_dims[0] = num_samples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); } //! Add an output layer to the network @@ -402,7 +422,8 @@ class Net { } //! Add a convolution layer to the network - void append_conv_layer(size_t layer_id, bool act=false, bool norm=true, bool bias=false, bool dropout=true) { + void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, + bool bias = false, bool dropout = true) { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); @@ -410,7 +431,7 @@ class Net { in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); + dropout_rate, in_dims, out_dims); layers[layer_id]->set_graph_ptr(context->getGraphPointer()); } @@ -467,20 +488,20 @@ class Net { } protected: - bool is_single_class; // single-class (one-hot) or multi-class label - bool has_l2norm; // whether the net contains an l2_norm layer - bool has_dense; // whether the net contains an dense layer - unsigned neighbor_sample_size; // neighbor sampling - unsigned subgraph_sample_size; // subgraph sampling - int num_threads; // number of threads - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_conv_layers; // number of convolutional layers - size_t num_layers; // total number of layers (conv + output) - int num_epochs; // number of epochs - float learning_rate; // learning rate - float dropout_rate; // dropout rate - float weight_decay; // weighti decay for over-fitting + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t num_samples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting size_t train_begin, train_end, train_count; size_t val_begin, val_end, val_count; size_t test_begin, test_end, test_count; @@ -489,15 +510,15 @@ class Net { int num_vertices_sg; bool is_selfloop; - mask_t* train_masks; // masks for training - mask_t* d_train_masks; // masks for training on device - mask_t* val_masks; // masks for validation - mask_t* d_val_masks; // masks for validation on device - mask_t* test_masks; // 
masks for test - mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs - std::vector feature_dims; // feature dimnesions for each layer - std::vector layers; // all the layers in the neural network + mask_t* train_masks; // masks for training + mask_t* d_train_masks; // masks for training on device + mask_t* val_masks; // masks for validation + mask_t* d_val_masks; // masks for validation on device + mask_t* test_masks; // masks for test + mask_t* d_test_masks; // masks for test on device + mask_t* subgraphs_masks; // masks for subgraphs + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network #ifndef GALOIS_USE_DIST deepgalois::Context* context; #else @@ -507,13 +528,15 @@ class Net { #ifdef CPU_ONLY #ifndef GALOIS_USE_DIST - Sampler *sampler; + Sampler* sampler; #endif #endif // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); + acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, + float_t* preds, label_t* ground_truth); + acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth); }; } // namespace deepgalois - diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 4fd7caa800..aa0dcbaab7 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -27,8 +27,8 @@ struct optimizer { optimizer(const optimizer&) = default; optimizer(optimizer&&) = default; optimizer& operator=(const optimizer&) = default; - optimizer& operator=(optimizer&&) = default; - virtual ~optimizer() = default; + optimizer& operator=(optimizer&&) = default; + virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; #ifndef CPU_ONLY virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; @@ -40,8 +40,10 @@ struct optimizer { template struct stateful_optimizer : public optimizer { void reset() override { - for (auto& e : E_) e.clear(); + for (auto& e : E_) + e.clear(); } + protected: template vec_t& get(const vec_t& key) { @@ -53,7 +55,7 @@ struct stateful_optimizer : public optimizer { std::unordered_map E_[N]; #ifndef CPU_ONLY template - float_t *get_gpu(const size_t n, const float_t *key); + float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; #endif }; diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index e25124cbfd..9e5faf1f39 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -7,14 +7,16 @@ class Reader { private: std::string dataset_str; void progressPrint(unsigned maxii, unsigned ii); + public: Reader() : dataset_str("") {} Reader(std::string dataset) : dataset_str(dataset) {} void init(std::string dataset) { dataset_str = dataset; } size_t read_labels(bool is_single_class, label_t*& labels); size_t read_features(float_t*& feats, std::string filetype = "bin"); - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); - void readGraphFromGRFile(Graph *g); + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + 
mask_t* masks); + void readGraphFromGRFile(Graph* g); }; -} +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c5f8abd219..c559804354 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -4,7 +4,7 @@ #include "deepgalois/gtypes.h" namespace deepgalois { -#define ETA 1.5 // length factor of DB in sampling +#define ETA 1.5 // length factor of DB in sampling #define SAMPLE_CLIP 3000 // clip degree in sampling #define DEFAULT_SIZE_FRONTIER 3000 #define DEFAULT_SIZE_SUBG 9000 @@ -16,19 +16,25 @@ class Sampler { ~Sampler() {} // sample a subgraph sg of size n from graph g - void subgraph_sample(size_t n, Graph &sg, mask_t* masks, unsigned tid = 0); + void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); // !API function for user-defined selection strategy - virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet &vertex_set); - virtual void select_vertices(size_t n, int m, VertexSet &vertex_set, unsigned tid); + virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set); + virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, + unsigned tid); - //galois::runtime::iterable > neighbor_sampler(Graph &g, VertexID v); + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); - edge_iterator sampled_edge_begin(Graph &g, VertexID v) { return g.edge_begin(v); } + edge_iterator sampled_edge_begin(Graph& g, VertexID v) { + return g.edge_begin(v); + } - edge_iterator sampled_edge_end(Graph &g, VertexID v) { return g.edge_end(v); } + edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, Graph* g); + void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, + Graph* g); protected: int m_; @@ -39,19 +45,22 @@ class Sampler { int subg_deg; VertexList vertices_; std::vector node_train; - mask_t *masks_; - Graph *masked_graph; - Graph *graph; + mask_t* masks_; + Graph* masked_graph; + Graph* graph; - // Given a subset of vertices and a graph g, generate a subgraph sg from the graph g - void generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub); + // Given a subset of vertices and a graph g, generate a subgraph sg from the + // graph g + void generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub); void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); - void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector °rees); + void get_masked_degrees(size_t n, mask_t* masks, Graph* g, + std::vector& degrees); void update_masks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); - void check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size); + void check_DB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, size_t size); }; -} +} // namespace deepgalois #endif diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 87e7411689..71add8b650 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -19,9 +19,9 @@ typedef std::vector typedef std::vector FV; // feature vector typedef std::vector FV2D; // feature vectors: num_samples x feature_dim typedef float acc_t; // Accuracy type -typedef 
uint8_t label_t; // label is for classification (supervised learning) -typedef uint8_t mask_t; // mask is used to indicate different uses of labels: - // train, val, test +typedef uint8_t label_t; // label is for classification (supervised learning) +typedef uint8_t mask_t; // mask is used to indicate different uses of labels: + // train, val, test typedef uint32_t VertexID; typedef uint64_t EdgeID; typedef std::vector VertexList; @@ -44,15 +44,15 @@ enum class net_phase { train, test }; #ifdef GALOIS_USE_DIST namespace deepgalois { - // TODO only being used by graph conv layer at the moment so extern works, - // but this design is bad and needs to be revisited - - //! Set this to let sync struct know where to get data from - extern float_t* _dataToSync; - //! Set this to let sync struct know the size of the vector to use during - //! sync - extern long unsigned _syncVectorSize; -} +// TODO only being used by graph conv layer at the moment so extern works, +// but this design is bad and needs to be revisited + +//! Set this to let sync struct know where to get data from +extern float_t* _dataToSync; +//! Set this to let sync struct know the size of the vector to use during +//! sync +extern long unsigned _syncVectorSize; +} // namespace deepgalois #endif #endif diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index c8bb1d4e41..08f28126bf 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -98,10 +98,10 @@ uniform_rand(T min, T max) { // sequential prefix sum template -inline std::vector prefix_sum(const std::vector &in) { +inline std::vector prefix_sum(const std::vector& in) { std::vector prefix(in.size() + 1); OutTy total = 0; - for (size_t i = 0; i < in.size(); i ++) { + for (size_t i = 0; i < in.size(); i++) { prefix[i] = total; total += (OutTy)in[i]; } @@ -110,62 +110,66 @@ inline std::vector prefix_sum(const std::vector &in) { } template -OutTy* parallel_prefix_sum(const std::vector &in); +OutTy* parallel_prefix_sum(const std::vector& in); // Utility function to randomly select k items from [begin, end) template inline T* select_k_items(T k, T begin, T end) { - auto i = begin; - - // reservoir[] is the output array. Initialize - // it with first k vertices - T *reservoir = new T[k]; - for (; i < k; i++) reservoir[i] = i; - - // Use a different seed value so that we don't get - // same result each time we run this program - srand(time(NULL)); - - // Iterate from the (k+1)th element to nth element - for (; i < end; i++) { - // Pick a random index from 0 to i. - auto j = rand() % (i + 1); - - // If the randomly picked index is smaller than k, - // then replace the element present at the index - // with new element from stream - if (j < k) reservoir[j] = i; - } - return reservoir; + auto i = begin; + + // reservoir[] is the output array. Initialize + // it with first k vertices + T* reservoir = new T[k]; + for (; i < k; i++) + reservoir[i] = i; + + // Use a different seed value so that we don't get + // same result each time we run this program + srand(time(NULL)); + + // Iterate from the (k+1)th element to nth element + for (; i < end; i++) { + // Pick a random index from 0 to i. 
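// Reservoir-sampling invariant (explanatory note): after item i has been
// processed, each item seen so far remains in reservoir[] with probability
// k/(i+1), so on return the reservoir holds a uniform random sample of k
// items from the range. Note the initial fill starts at `begin`, so the
// routine appears to assume begin == 0; reservoir[0..begin) would otherwise
// be left uninitialized.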
+ auto j = rand() % (i + 1); + + // If the randomly picked index is smaller than k, + // then replace the element present at the index + // with new element from stream + if (j < k) + reservoir[j] = i; + } + return reservoir; } // Utility function to find ceiling of r in arr[l..h] template -inline T find_ceil(T *arr, T r, T l, T h) { - T mid; - while (l < h) { - mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 - (r > arr[mid]) ? (l = mid + 1) : (h = mid); - } - return (arr[l] >= r) ? l : -1; -} - -// Utility function to select one element from n elements given a frequency (probability) distribution +inline T find_ceil(T* arr, T r, T l, T h) { + T mid; + while (l < h) { + mid = l + ((h - l) >> 1); // Same as mid = (l+h)/2 + (r > arr[mid]) ? (l = mid + 1) : (h = mid); + } + return (arr[l] >= r) ? l : -1; +} + +// Utility function to select one element from n elements given a frequency +// (probability) distribution // https://www.geeksforgeeks.org/random-number-generator-in-arbitrary-probability-distribution-fashion/ template -T select_one_item(T n, T *dist) { - T *offsets = new T[n]; - offsets[0] = dist[0]; - // compute the prefix sum of the distribution - for (T i = 1; i < n; ++i) offsets[i] = offsets[i-1] + dist[i]; - // offsets[n-1] is sum of all frequencies - T sum = offsets[n-1]; - T r = (rand() % sum) + 1; - // find which range r falls into, and return the index of the range - return find_ceil(offsets, r, 0, n - 1); +T select_one_item(T n, T* dist) { + T* offsets = new T[n]; + offsets[0] = dist[0]; + // compute the prefix sum of the distribution + for (T i = 1; i < n; ++i) + offsets[i] = offsets[i - 1] + dist[i]; + // offsets[n-1] is sum of all frequencies + T sum = offsets[n - 1]; + T r = (rand() % sum) + 1; + // find which range r falls into, and return the index of the range + return find_ceil(offsets, r, 0, n - 1); } -acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t *masks, - size_t num_classes, label_t *ground_truth, float_t *pred); +acc_t masked_f1_score(size_t begin, size_t end, size_t count, mask_t* masks, + size_t num_classes, label_t* ground_truth, float_t* pred); -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 66a1a0885e..1da6c6c5a1 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -26,10 +26,11 @@ size_t DistContext::read_labels(std::string dataset_str) { in >> m >> num_classes >> std::ws; assert(m == dGraph->globalSize()); // size of labels should be # local nodes - h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for each vertex: N x 1 + h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 uint32_t foundVertices = 0; - unsigned v = 0; + unsigned v = 0; // each line contains a set of 0s and 1s while (std::getline(in, line)) { // only bother if local node @@ -55,8 +56,9 @@ size_t DistContext::read_labels(std::string dataset_str) { in.close(); // print the number of vertex classes - galois::gPrint("[", myID, "] Done with labels, unique label counts: ", - num_classes, "; set ", foundVertices, " nodes\n"); + galois::gPrint("[", myID, + "] Done with labels, unique label counts: ", num_classes, + "; set ", foundVertices, " nodes\n"); return num_classes; } @@ -97,8 +99,8 @@ size_t DistContext::read_features(std::string dataset_str) { } in.close(); - galois::gPrint("[", myID, "] Done with features, feature length: ", - feat_len, "\n"); + galois::gPrint("[", myID, 
"] Done with features, feature length: ", feat_len, + "\n"); return feat_len; } @@ -141,50 +143,42 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; + << ") Number of valid samples: " << sample_count << "(" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } -float_t* DistContext::get_in_ptr() { - return &h_feats[0]; -} +float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -//void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { +// void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { void DistContext::norm_factor_computing(bool, int) { // TODO: this is a distributed operation // create for now, TODO need to actually fill it in norm_factors = new float_t[localVertices]; - galois::do_all(galois::iterate((size_t)0, localVertices), - [&](auto v) { - norm_factors[v] = 1; - }, galois::loopname("NormCounting")); + galois::do_all( + galois::iterate((size_t)0, localVertices), + [&](auto v) { norm_factors[v] = 1; }, galois::loopname("NormCounting")); - //galois::do_all(galois::iterate((size_t)0, localVertices), + // galois::do_all(galois::iterate((size_t)0, localVertices), // [&](auto v) { - // auto degree = std::distance(graph_cpu->edge_begin(v), graph_cpu->edge_end(v)); - // float_t temp = std::sqrt(float_t(degree)); - // if (temp == 0.0) norm_factors[v] = 0.0; - // else norm_factors[v] = 1.0 / temp; + // auto degree = std::distance(graph_cpu->edge_begin(v), + // graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if + // (temp == 0.0) norm_factors[v] = 0.0; else norm_factors[v] = 1.0 / temp; // }, galois::loopname("NormCounting")); return; } void DistContext::initializeSyncSubstrate() { - DistContext::syncSubstrate = - new galois::graphs::GluonSubstrate( - *DistContext::graph_cpu, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, - false - ); + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; -} // deepgalois +} // namespace deepgalois diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index 757279ceba..f07da83d6d 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -11,27 +11,30 @@ namespace deepgalois { Context::Context() : Context(false) {} Context::~Context() { - if (h_labels) delete[] h_labels; - if (h_feats) delete[] h_feats; - if (norm_factors) delete[] norm_factors; - //if (h_feats_subg) delete[] h_feats_subg; - //if (h_labels_subg) delete[] h_labels_subg; - //if (norm_factors_subg) delete[] norm_factors_subg; + if (h_labels) + delete[] h_labels; + if (h_feats) + delete[] h_feats; + if (norm_factors) + delete[] norm_factors; + // if (h_feats_subg) delete[] h_feats_subg; + // if (h_labels_subg) delete[] h_labels_subg; + // if (norm_factors_subg) delete[] norm_factors_subg; } void Context::createSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) - subgraphs_cpu[i] = new Graph(); + subgraphs_cpu[i] = new Graph(); } // generate labels for the subgraph, m is 
subgraph size -void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { - //if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; +void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { + // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; if (is_single_class) { h_labels_subg.resize(m); } else { - h_labels_subg.resize(m*num_classes); + h_labels_subg.resize(m * num_classes); } size_t count = 0; for (size_t i = 0; i < n; i++) { @@ -39,23 +42,25 @@ void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { if (is_single_class) { h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels+i*num_classes, h_labels+(i+1)*num_classes, &h_labels_subg[count*num_classes]); - } - count ++; - } + std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + } + count++; + } } } // generate input features for the subgraph, m is subgraph size -void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { +void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; - //if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - h_feats_subg.resize(m*feat_len); + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats+i*feat_len, h_feats+(i+1)*feat_len, &h_feats_subg[count*feat_len]); - count ++; - } + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, + &h_feats_subg[count * feat_len]); + count++; + } } } @@ -71,32 +76,33 @@ size_t Context::read_graph(bool selfloop) { } else if (filetype == "bin") { graph_cpu->readGraph(dataset); } else if (filetype == "gr") { - graph_cpu = new Graph(); + graph_cpu = new Graph(); std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { Graph graph_temp; - //galois::graphs::readGraph(graph_temp, filename); + // galois::graphs::readGraph(graph_temp, filename); graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; - //} else galois::graphs::readGraph(*graph_cpu, filename); - } else graph_cpu->readGraph(dataset); -// TODO dist version of self loop + //} else galois::graphs::readGraph(*graph_cpu, filename); + } else + graph_cpu->readGraph(dataset); + // TODO dist version of self loop } else { printf("Unkown file format\n"); exit(1); } Tread.stop(); auto g = getGraphPointer(); - std::cout << "num_vertices " << g->size() << " num_edges " - << g->sizeEdges() << "\n"; + std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() + << "\n"; n = g->size(); return n; } -void Context::add_selfloop(Graph &og, Graph &g) { - g.allocateFrom(og.size(), og.size()+og.sizeEdges()); +void Context::add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); /* for (size_t src = 0; src < og.size(); src++) { @@ -139,19 +145,19 @@ void Context::alloc_norm_factor() { void Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - //if (norm_factors_subg == NULL) + // if (norm_factors_subg == NULL) #ifdef USE_MKL - //norm_factors_subg = new float_t[g->sizeEdges()]; - norm_factors_subg.resize(g->sizeEdges()); + // norm_factors_subg = new float_t[g->sizeEdges()]; + norm_factors_subg.resize(g->sizeEdges()); #else - norm_factors_subg.resize(g->size()); - //norm_factors_subg = new float_t[g->size()]; + 
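// Explanatory note on the two branches: without USE_MKL, aggregation applies
// one factor per vertex (1/sqrt(degree)), so a vertex-sized buffer suffices;
// the USE_MKL branch above sizes the buffer per edge because the csrmm path
// consumes a factor 1/(sqrt(deg(u))*sqrt(deg(v))) for every edge (u,v), as
// computed in norm_factor_computing() below.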
norm_factors_subg.resize(g->size()); + // norm_factors_subg = new float_t[g->size()]; #endif } void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; - float_t *constants; + float_t* constants; if (!is_subgraph) { g = getGraphPointer(); alloc_norm_factor(); @@ -164,26 +170,37 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { auto g_size = g->size(); g->degree_counting(); #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, g_size), [&](auto i) { - float_t c_i = std::sqrt(float_t(g->get_degree(i))); - for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { - const auto j = g->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) constants[e] = 0.0; - else constants[e] = 1.0 / (c_i * c_j); - } - }, galois::loopname("NormCountingEdge")); + galois::do_all( + galois::iterate((size_t)0, g_size), + [&](auto i) { + float_t c_i = std::sqrt(float_t(g->get_degree(i))); + for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { + const auto j = g->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(g->get_degree(j))); + if (c_i == 0.0 || c_j == 0.0) + constants[e] = 0.0; + else + constants[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); #else - galois::do_all(galois::iterate((size_t)0, g_size), [&](auto v) { - auto degree = g->get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) constants[v] = 0.0; - else constants[v] = 1.0 / temp; - }, galois::loopname("NormCountingVertex")); + galois::do_all( + galois::iterate((size_t)0, g_size), + [&](auto v) { + auto degree = g->get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) + constants[v] = 0.0; + else + constants[v] = 1.0 / temp; + }, + galois::loopname("NormCountingVertex")); #endif } -void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self_loop) { +void Context::read_edgelist(const char* filename, bool symmetrize, + bool add_self_loop) { std::ifstream in; std::string line; in.open(filename, std::ios::in); @@ -192,10 +209,11 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self size_t num_vertices_ = m; size_t num_edges_ = 0; std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector > vertices(m); + std::vector> vertices(m); for (size_t i = 0; i < n; i++) { std::set neighbors; - if (add_self_loop) neighbors.insert(i); + if (add_self_loop) + neighbors.insert(i); vertices.push_back(neighbors); } while (std::getline(in, line)) { @@ -204,10 +222,12 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self edge_stream >> u; edge_stream >> v; vertices[u].insert(v); - if (symmetrize) vertices[v].insert(u); + if (symmetrize) + vertices[v].insert(u); } in.close(); - for (size_t i = 0; i < n; i++) num_edges_ += vertices[i].size(); + for (size_t i = 0; i < n; i++) + num_edges_ += vertices[i].size(); std::cout << "num_edges " << num_edges_ << "\n"; std::vector degrees; @@ -224,13 +244,13 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self offsets[degrees.size()] = total; degrees.clear(); assert(num_edges_ == offsets[num_vertices_]); - EdgeID *colidx_ = new EdgeID[num_edges_]; - VertexID *rowptr_ = new VertexID[num_vertices_ + 1]; + EdgeID* colidx_ = new EdgeID[num_edges_]; + VertexID* rowptr_ = new VertexID[num_vertices_ + 1]; for (size_t i = 0; i < num_vertices_ + 1; i++) rowptr_[i] = offsets[i]; for (size_t i = 0; i < num_vertices_; i++) { for (auto dst : 
vertices[i]) - colidx_[offsets[i]++] = dst; + colidx_[offsets[i]++] = dst; } auto g = getGraphPointer(); @@ -238,7 +258,7 @@ void Context::read_edgelist(const char* filename, bool symmetrize, bool add_self g->constructNodes(); for (size_t i = 0; i < num_vertices_; i++) { auto row_begin = rowptr_[i]; - auto row_end = rowptr_[i+1]; + auto row_end = rowptr_[i + 1]; g->fixEndEdge(i, row_end); for (auto offset = row_begin; offset < row_end; offset++) g->constructEdge(offset, colidx_[offset], 0); @@ -254,4 +274,4 @@ inline void init_features(size_t dim, vec_t &x) { } */ -} // end deepgalois namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index f7a76d2db0..365bef8e50 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -27,31 +27,37 @@ int64_t cluster_seedgen(void) { namespace deepgalois { // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, + float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) norm_fac[i] = 0.0; - else norm_fac[i] = 1.0 / temp; + if (temp == 0.0) + norm_fac[i] = 0.0; + else + norm_fac[i] = 1.0 / temp; } } // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, + float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != 0.0); // should never be zero since self-loop added for each vertex - d_src = 1.0 / sqrt(d_src); - auto start = graph.edge_begin(src); + assert(d_src != + 0.0); // should never be zero since self-loop added for each vertex + d_src = 1.0 / sqrt(d_src); + auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); for (index_t e = start; e != end; e++) { index_t dst = graph.getEdgeDst(e); - //if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, dst, e, start, end); + // if (dst >= n) printf("src=%d, dst=%d, e=%d, start=%d, end=%d\n", src, + // dst, e, start, end); assert(dst < n); float_t d_dst = float_t(graph.getOutDegree(dst)); assert(d_dst != 0.0); - d_dst = 1.0 / sqrt(d_dst); + d_dst = 1.0 / sqrt(d_dst); norm_fac[e] = d_src * d_dst; } } @@ -66,10 +72,14 @@ Context::Context() : Context(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); - CUSPARSE_CHECK(cusparseSetMatType(cusparse_matdescr_,CUSPARSE_MATRIX_TYPE_GENERAL)); - CUSPARSE_CHECK(cusparseSetMatIndexBase(cusparse_matdescr_,CUSPARSE_INDEX_BASE_ZERO)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CUSPARSE_CHECK( + cusparseSetMatType(cusparse_matdescr_, CUSPARSE_MATRIX_TYPE_GENERAL)); + CUSPARSE_CHECK( + cusparseSetMatIndexBase(cusparse_matdescr_, CUSPARSE_INDEX_BASE_ZERO)); + CURAND_CHECK( + curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK( + curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } Context::~Context() { @@ -81,34 +91,36 @@ Context::~Context() { 
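// Explanatory note: the GPU Context destructor below releases the CUDA
// library state created in the constructor above (cuSPARSE matrix descriptor,
// cuRAND generator) and frees the device buffers d_labels, d_feats and
// norm_factors; the null checks presumably guard against freeing buffers that
// were never allocated because copy_data_to_device() was not called.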
CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - if (d_labels) CUDA_CHECK(cudaFree(d_labels)); - if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factors) CUDA_CHECK(cudaFree(norm_factors)); + if (d_labels) + CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) + CUDA_CHECK(cudaFree(d_feats)); + if (norm_factors) + CUDA_CHECK(cudaFree(norm_factors)); } -void Context::createSubgraphs(int n_sg) { -} +void Context::createSubgraphs(int n_sg) {} -void Context::gen_subgraph_labels(size_t m, const mask_t *masks) { -} +void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} -void Context::gen_subgraph_feats(size_t m, const mask_t *masks) { -} +void Context::gen_subgraph_feats(size_t m, const mask_t* masks) {} void Context::norm_factor_computing(bool is_subgraph, int subg_id) { std::cout << "Pre-computing normalization factor (n=" << n << ") ... "; if (!is_selfloop_added) { - std::cout << "Set -sl=1 to add selfloop\n"; + std::cout << "Set -sl=1 to add selfloop\n"; exit(0); } #ifdef USE_CUSPARSE int nnz = graph_gpu.sizeEdges(); CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); init_const_gpu(nnz, 0.0, norm_factors); - norm_factor_computing_edge<<>>(n, graph_gpu, norm_factors); + norm_factor_computing_edge<<>>( + n, graph_gpu, norm_factors); #else CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); - norm_factor_computing_node<<>>(n, graph_gpu, norm_factors); + norm_factor_computing_node<<>>( + n, graph_gpu, norm_factors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; @@ -120,10 +132,13 @@ void Context::SetDevice(const int device_id) { if (current_device == device_id) return; CUDA_CHECK(cudaSetDevice(device_id)); if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); - if (curand_generator_) CURAND_CHECK(curandDestroyGenerator(curand_generator_)); + if (curand_generator_) +CURAND_CHECK(curandDestroyGenerator(curand_generator_)); CUBLAS_CHECK(cublasCreate(&cublas_handle_)); - CURAND_CHECK(curandCreateGenerator(&curand_generator_, CURAND_RNG_PSEUDO_DEFAULT)); - CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); + CURAND_CHECK(curandCreateGenerator(&curand_generator_, +CURAND_RNG_PSEUDO_DEFAULT)); + CURAND_CHECK(curandSetPseudoRandomGeneratorSeed(curand_generator_, +cluster_seedgen())); } */ size_t Context::read_graph(bool selfloop) { @@ -151,21 +166,25 @@ size_t Context::read_graph(bool selfloop) { void Context::copy_data_to_device() { if (is_single_class) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), + cudaMemcpyHostToDevice)); } else { - CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), + cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); - //print_device_vector(10, d_feats, "d_feats"); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * 
sizeof(float_t), + cudaMemcpyHostToDevice)); + // print_device_vector(10, d_feats, "d_feats"); } -//void Context::copy_data_to_device() { - //float_malloc_device(n, d_labels); - //float_copy_device(n, h_labels, d_labels); - //float_malloc_device(n*feat_len, d_feats); - //float_copy_device(n*feat_len, &h_feats[0], d_feats); +// void Context::copy_data_to_device() { +// float_malloc_device(n, d_labels); +// float_copy_device(n, h_labels, d_labels); +// float_malloc_device(n*feat_len, d_feats); +// float_copy_device(n*feat_len, &h_feats[0], d_feats); //} -} // namespace context +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 8b9e726e8e..9c3454d550 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -2,52 +2,59 @@ #include "deepgalois/math_functions.hh" #include "galois/Galois.h" -void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, - bool norm, float_t* norm_factor) { - //std::cout << "[update_all] graph size: " << n << "\n"; - #ifndef GALOIS_USE_DIST +void deepgalois::update_all(size_t len, Graph& g, const float_t* in, + float_t* out, bool norm, float_t* norm_factor) { +// std::cout << "[update_all] graph size: " << n << "\n"; +#ifndef GALOIS_USE_DIST size_t n = g.size(); - galois::do_all(galois::iterate(size_t(0), n),[&](const auto src) { - #else + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { +#else auto& rangeObj = g.allNodesRange(); - galois::do_all(galois::iterate(rangeObj), [&](const auto src) { - #endif - auto src_idx = src * len; - // zero out the output data - math::clear_cpu(len , &out[src_idx]); - float_t a = 0.0; - float_t b = 0.0; - // get normalization factor if needed - if (norm) a = norm_factor[src]; - // gather neighbors' embeddings - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - const auto dst = g.getEdgeDst(e); - assert(dst < n); - auto dst_idx = dst * len; - if (norm) { - // normalize b as well - b = a * norm_factor[dst]; - //float_t* neighbor = new float_t[len]; // this is super slow - vec_t neighbor(len); - // scale the neighbor's data using the normalization factor - math::scale(len, b, &in[dst_idx], &neighbor[0]); - // use scaled data to update; out[src] += in[dst] - math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); - } else { - // add embeddings from neighbors together; out[src] += in[dst] - math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); - } - } - }, galois::steal(), galois::loopname("update_all")); + galois::do_all( + galois::iterate(rangeObj), + [&](const auto src) { +#endif + auto src_idx = src * len; + // zero out the output data + math::clear_cpu(len, &out[src_idx]); + float_t a = 0.0; + float_t b = 0.0; + // get normalization factor if needed + if (norm) + a = norm_factor[src]; + // gather neighbors' embeddings + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + const auto dst = g.getEdgeDst(e); + assert(dst < n); + auto dst_idx = dst * len; + if (norm) { + // normalize b as well + b = a * norm_factor[dst]; + // float_t* neighbor = new float_t[len]; // this is super slow + vec_t neighbor(len); + // scale the neighbor's data using the normalization factor + math::scale(len, b, &in[dst_idx], &neighbor[0]); + // use scaled data to update; out[src] += in[dst] + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } else { + // add embeddings from neighbors together; out[src] += in[dst] + 
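// Explanatory note: with norm enabled (branch above) each neighbor
// contribution is scaled by 1/(sqrt(deg(src))*sqrt(deg(dst))), i.e. the
// symmetric GCN normalization D^{-1/2} A D^{-1/2} * in; in this branch the
// line below accumulates the plain sum out[src] += in[dst] over neighbors.
// update_all_csrmm() further down computes the same aggregation with a single
// sparse-matrix dense-matrix product (csrmm) using per-edge factors.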
math::vadd_cpu(len, &out[src_idx], &in[dst_idx], &out[src_idx]); + } + } + }, + galois::steal(), galois::loopname("update_all")); } -void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, - bool, float_t* norm_factor) { +void deepgalois::update_all_csrmm(size_t len, Graph& g, const float_t* in, + float_t* out, bool, float_t* norm_factor) { galois::StatTimer Tcsrmm("CSRMM-MKL"); Tcsrmm.start(); unsigned n = g.size(); - math::clear_cpu(n*len, out); - math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, out); + math::clear_cpu(n * len, out); + math::csrmm_cpu(n, len, n, g.sizeEdges(), 1.0, norm_factor, + (int*)g.row_start_ptr(), (int*)g.edge_dst_ptr(), in, 0.0, + out); Tcsrmm.stop(); } - diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 158b1c2b4d..2bfe55ca46 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -15,16 +15,18 @@ __device__ void scale_add(const int n, const float_t alpha, const float_t* a, } __global__ void update_all_naive(size_t n, size_t len, GraphGPU g, - const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { CUDA_KERNEL_LOOP(src, n) { float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; + if (norm) + a = norm_factor[src]; index_type begin = g.edge_begin(src); index_type end = g.edge_end(src); for (index_type e = begin; e != end; e++) { index_type dst = g.getEdgeDst(e); - if (norm) b = a * norm_factor[dst]; + if (norm) + b = a * norm_factor[dst]; scale_add(len, b, in + dst * len, out + src * len, out + src * len); // out[src] += in[dst] } @@ -32,31 +34,36 @@ __global__ void update_all_naive(size_t n, size_t len, GraphGPU g, } __global__ void update_all_warp(size_t n, size_t len, GraphGPU g, - const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { - __shared__ index_type ptrs[BLOCK_SIZE/WARP_SIZE][2]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + const float_t* in, float_t* out, bool norm, + const float_t* norm_factor) { + __shared__ index_type ptrs[BLOCK_SIZE / WARP_SIZE][2]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - for(int src = warp_id; src < n; src += num_warps) { + for (int src = warp_id; src < n; src += num_warps) { float_t a = 0.0, b = 1.0; - if (norm) a = norm_factor[src]; + if (norm) + a = norm_factor[src]; if (thread_lane < 2) ptrs[warp_lane][thread_lane] = g.edge_begin(src + thread_lane); __syncthreads(); const index_type row_begin = ptrs[warp_lane][0]; const index_type row_end = ptrs[warp_lane][1]; - index_type base_src = src * len; - for(index_type offset = row_begin; offset < row_end; offset ++) { + index_type 
base_src = src * len; + for (index_type offset = row_begin; offset < row_end; offset++) { index_type dst = g.getEdgeDst(offset); - if (norm) b = a * norm_factor[dst]; + if (norm) + b = a * norm_factor[dst]; index_type base_dst = dst * len; for (int i = 0; i < len; i += WARP_SIZE) - if (thread_lane+i < len) - out[base_src+thread_lane+i] += in[base_dst+thread_lane+i] * b; + if (thread_lane + i < len) + out[base_src + thread_lane + i] += in[base_dst + thread_lane + i] * b; } } } @@ -65,29 +72,32 @@ void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor) { unsigned n = g.size(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //update_all_naive<<>>(n, len, g, in, out, norm, norm_factor); - update_all_warp<<<(n-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>(n, len, g, in, out, norm, norm_factor); + // update_all_naive<<>>(n, len, g, in, + // out, norm, norm_factor); + update_all_warp<<<(n - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + n, len, g, in, out, norm, norm_factor); CudaTest("solving update_all kernel failed"); } void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, - bool norm, const float_t* norm_factor) { - //g.print_test(); + bool norm, const float_t* norm_factor) { + // g.print_test(); unsigned n = g.size(); - auto nnz = g.sizeEdges(); + auto nnz = g.sizeEdges(); CUDA_CHECK(cudaMemset(out, 0, n * len * sizeof(float_t))); - //std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " nnz " << nnz << "\n"; - //print_device_vector(10, norm_factor, "norm_factor"); - float *temp; - const int *row_start = (const int*)g.row_start_ptr(); - const int *edge_dst = (const int*)g.edge_dst_ptr(); - //printf("row_start_ptr: 0x%x\n", row_start); - //printf("edge_dst_ptr: 0x%x\n", edge_dst); - //print_device_int_vector(10, row_start, "row_start"); - //print_device_int_vector(10, edge_dst, "edge_dst"); - float_malloc_device(n*len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); + // std::cout << "[debug]: update_all on GPU, n " << n << " len " << len << " + // nnz " << nnz << "\n"; print_device_vector(10, norm_factor, "norm_factor"); + float* temp; + const int* row_start = (const int*)g.row_start_ptr(); + const int* edge_dst = (const int*)g.edge_dst_ptr(); + // printf("row_start_ptr: 0x%x\n", row_start); + // printf("edge_dst_ptr: 0x%x\n", edge_dst); + // print_device_int_vector(10, row_start, "row_start"); + // print_device_int_vector(10, edge_dst, "edge_dst"); + float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, + temp, out); float_free_device(temp); } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index e46a2477a6..d50f7bfb0a 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -11,7 +11,8 @@ float_t* _dataToSync = nullptr; long unsigned _syncVectorSize = 0; #ifdef CPU_ONLY -inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { +inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); std::default_random_engine rng(seed); std::uniform_real_distribution dist(-init_range, init_range); @@ -22,7 +23,8 @@ inline void 
graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t } } -inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix) { +inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, + vec_t& matrix) { matrix.resize(dim_x * dim_y); for (size_t i = 0; i < dim_x; ++i) { for (size_t j = 0; j < dim_y; ++j) @@ -31,7 +33,8 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, vec_t } // aggregate based on graph topology -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { // normalization constant based on graph structure #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); @@ -41,7 +44,8 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_ } // since graph is symmetric, the derivative is the same -void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else @@ -49,9 +53,10 @@ void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, floa #endif } -void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, const float_t* neighbors, float_t* out) { - float_t *a = new float_t[len]; - float_t *b = new float_t[len]; +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors @@ -63,11 +68,12 @@ void graph_conv_layer::malloc_and_init() { size_t z = output_dims[1]; #ifdef GALOIS_USE_DIST // setup gluon - layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); + layer::gradientGraph = + new deepgalois::GluonGradients(layer::weight_grad, y * z); layer::syncSub = - new galois::graphs::GluonSubstrate( - *layer::gradientGraph, layer::gradientGraph->myHostID(), - layer::gradientGraph->numHosts(), false); + new galois::graphs::GluonSubstrate( + *layer::gradientGraph, layer::gradientGraph->myHostID(), + layer::gradientGraph->numHosts(), false); #endif #ifdef GALOIS_USE_DIST @@ -80,43 +86,52 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); - if (dropout_) dropout_mask = new mask_t[x * y]; - in_temp = new float_t[x * y]; - out_temp = new float_t[x * z]; + if (dropout_) + dropout_mask = new mask_t[x * y]; + in_temp = new float_t[x * y]; + out_temp = new float_t[x * z]; trans_data = new float_t[y * x]; // y*x - if (y <= z) in_temp1 = new float_t[x * y]; + if (y <= z) + in_temp1 = new float_t[x * y]; } // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) -void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - //std::cout << "layer: " << name_ << "\n"; - //std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; + // std::cout << "layer: " << name_ << "\n"; + // std::cout << "x=" << x << ", y=" << y << ", z=" 
<< z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W if (dropout_ && phase_ == net_phase::train) - math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else math::copy_cpu(x*y, in_data, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); + else + math::copy_cpu(x * y, in_data, in_temp); if (y > z) { - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); aggregate(z, *graph_cpu, out_temp, out_data); } else { aggregate(y, *graph_cpu, in_temp, in_temp1); - math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, + &layer::W[0], 0.0, out_data); } #ifdef GALOIS_USE_DIST // TODO sync of out_data required here deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync("AggSync"); + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync( + "AggSync"); #endif // run relu activation on output if specified - if (act_) math::relu_cpu(x*z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -127,8 +142,9 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? - if (act_) math::d_relu_cpu(x*z, out_grad, out_data, out_grad); - //else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying if (y > z) { d_aggregate(z, *graph_cpu, out_grad, out_temp); @@ -137,22 +153,28 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // this calculates gradients for the node predictions if (level_ != 0) // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y - // calculate weight gradients using input data; multiplied by gradients from last back prop step - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + // calculate weight gradients using input data; multiplied by gradients from + // last back prop step + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z } else { if (level_ != 0) { - math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], 0.0, in_temp); + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, &W[0], + 0.0, in_temp); d_aggregate(y, *graph_cpu, in_temp, in_grad); } - math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, + 0.0, &layer::weight_grad[0]); } #ifdef GALOIS_USE_DIST // sync agg deepgalois::_syncVectorSize = z; - 
deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync("AggSyncBack"); + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync( + "AggSyncBack"); #endif if (level_ != 0 && dropout_) @@ -160,14 +182,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, #ifdef GALOIS_USE_DIST layer::syncSub->sync("GradientSync"); - //galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); #endif } acc_t graph_conv_layer::get_weight_decay_loss() { - return math::l2_norm(input_dims[1]*output_dims[1], &layer::W[0]); + return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); } #endif // end if CPU_ONLY -} // namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cu b/libdeepgalois/src/layers/graph_conv_layer.cu index a1682847ad..f8b59d3c0e 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cu +++ b/libdeepgalois/src/layers/graph_conv_layer.cu @@ -8,32 +8,35 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - if (dropout_) CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); - float_malloc_device(x*y, in_temp); - init_const_gpu(x*y, 0.0, in_temp); + if (dropout_) + CUDA_CHECK(cudaMalloc((void**)&dropout_mask, x * y * sizeof(mask_t))); + float_malloc_device(x * y, in_temp); + init_const_gpu(x * y, 0.0, in_temp); if (y <= z) { - float_malloc_device(x*y, in_temp1); - init_const_gpu(x*y, 0.0, in_temp1); + float_malloc_device(x * y, in_temp1); + init_const_gpu(x * y, 0.0, in_temp1); } - float_malloc_device(x*z, out_temp); - init_const_gpu(x*z, 0.0, out_temp); - float_malloc_device(y*z, d_W); + float_malloc_device(x * z, out_temp); + init_const_gpu(x * z, 0.0, out_temp); + float_malloc_device(y * z, d_W); auto init_range = sqrt(6.0 / (y + z)); // Glorot & Bengio (AISTATS 2010) rng_uniform_gpu(y * z, -init_range, init_range, d_W); - float_malloc_device(y*z, layer::d_weight_grad); - init_const_gpu(y*z, 0.0, layer::d_weight_grad); + float_malloc_device(y * z, layer::d_weight_grad); + init_const_gpu(y * z, 0.0, layer::d_weight_grad); } -void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { - #ifdef USE_CUSPARSE +void graph_conv_layer::aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { +#ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); - #else +#else deepgalois::update_all(len, g, in, out, norm_, norm_consts); - #endif +#endif } -void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, float_t* out) { +void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, + float_t* out) { #ifdef USE_CUSPARSE deepgalois::update_all_csrmm(len, g, in, out, norm_, norm_consts); #else @@ -41,62 +44,74 @@ void graph_conv_layer::d_aggregate(size_t len, GraphGPU& g, const float_t* in, f #endif } -void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, const float_t* neighbors, float_t* out) { -} +void graph_conv_layer::combine(size_t dim_x, size_t dim_y, const float_t* self, + const float_t* neighbors, float_t* out) {} // GPU forward: compute output features // NOTE: in_data will be used in back-prop, so it can not be modified -void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = 
input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - - // currently only support feature length <= 128 + + // currently only support feature length <= 128 if (z > MAX_NUM_CLASSES) { - std::cout << "Currently support maximum hidden feature length of " << MAX_NUM_CLASSES << "\n"; + std::cout << "Currently support maximum hidden feature length of " + << MAX_NUM_CLASSES << "\n"; exit(0); } - init_const_gpu(x*z, 0.0, out_temp); + init_const_gpu(x * z, 0.0, out_temp); if (dropout_ && phase_ == net_phase::train) dropout_gpu(x * y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else copy_gpu(x*y, in_data, in_temp); + else + copy_gpu(x * y, in_data, in_temp); if (y > z) { - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, out_temp); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, d_W, 0.0, + out_temp); graph_conv_layer::aggregate(z, *graph_gpu, out_temp, out_data); } else { graph_conv_layer::aggregate(y, *graph_gpu, in_temp, in_temp1); - sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, out_data); + sgemm_gpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, d_W, 0.0, + out_data); } - if (act_) relu_gpu(x * z, out_data, out_data); + if (act_) + relu_gpu(x * z, out_data, out_data); } -// GPU backward: compute input gradients (in_grad) and weight gradients (d_weight_grad) -void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, +// GPU backward: compute input gradients (in_grad) and weight gradients +// (d_weight_grad) +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - - if (act_) d_relu_gpu(x * z, out_grad, out_data, out_grad); + + if (act_) + d_relu_gpu(x * z, out_grad, out_data, out_grad); if (y > z) { graph_conv_layer::d_aggregate(z, *graph_gpu, out_grad, out_temp); if (level_ != 0) - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, in_grad); - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, layer::d_weight_grad); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, d_W, 0.0, + in_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, 0.0, + layer::d_weight_grad); } else { if (level_ != 0) { - sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, in_temp); + sgemm_gpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_grad, d_W, 0.0, + in_temp); graph_conv_layer::d_aggregate(y, *graph_gpu, in_temp, in_grad); } - sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, layer::d_weight_grad); + sgemm_gpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, + layer::d_weight_grad); } if (level_ != 0 && dropout_) d_dropout_gpu(x * y, scale_, dropout_rate_, in_grad, dropout_mask, in_grad); } acc_t graph_conv_layer::get_weight_decay_loss() { - return l2_norm_gpu(input_dims[1]*output_dims[1], d_W); + return l2_norm_gpu(input_dims[1] * output_dims[1], d_W); } -} // namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cpp b/libdeepgalois/src/layers/l2_norm_layer.cpp index 864eaeb321..8de2406ede 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cpp +++ b/libdeepgalois/src/layers/l2_norm_layer.cpp @@ -4,43 +4,50 @@ namespace deepgalois { -void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void l2_norm_layer::forward_propagation(const float_t* 
in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; - galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { - //for (size_t i = 0; i < x; i++) { - float_t sum = 0.0; - size_t idx = i * y; - for (size_t j = 0; j < y; j++) { - sum += in_data[idx + j] * in_data[idx + j]; - } - sum = std::max(sum, epsilon_); - sum = sqrt(sum); - for (size_t j = 0; j < y; j++) { - out_data[idx + j] = in_data[idx + j] / sum * scale_; - } - }, galois::loopname("l2_norm")); + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum = 0.0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum += in_data[idx + j] * in_data[idx + j]; + } + sum = std::max(sum, epsilon_); + sum = sqrt(sum); + for (size_t j = 0; j < y; j++) { + out_data[idx + j] = in_data[idx + j] / sum * scale_; + } + }, + galois::loopname("l2_norm")); } void l2_norm_layer::back_propagation(const float_t* in_data, const float_t*, - float_t* out_grad, float_t* in_grad) { + float_t* out_grad, float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; - galois::do_all(galois::iterate((size_t)0, x), [&](const auto i) { - //for (size_t i = 0; i < x; i++) { - float_t sum_x2 = 0.0; - float_t coef0_axis0 = 0, coef1_axis0 = 0; - size_t idx = i * y; - for (size_t j = 0; j < y; j++) { - sum_x2 += powf(in_data[idx + j], 2); - coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; - } - coef1_axis0 = powf(sum_x2, -1.5); - for (size_t j = 0; j < y; j++) { - in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 - + out_grad[idx + j] * sum_x2 * coef1_axis0; - } - }, galois::loopname("d_l2_norm")); + galois::do_all( + galois::iterate((size_t)0, x), + [&](const auto i) { + // for (size_t i = 0; i < x; i++) { + float_t sum_x2 = 0.0; + float_t coef0_axis0 = 0, coef1_axis0 = 0; + size_t idx = i * y; + for (size_t j = 0; j < y; j++) { + sum_x2 += powf(in_data[idx + j], 2); + coef0_axis0 -= in_data[idx + j] * out_grad[idx + j]; + } + coef1_axis0 = powf(sum_x2, -1.5); + for (size_t j = 0; j < y; j++) { + in_grad[idx + j] = in_data[idx + j] * coef0_axis0 * coef1_axis0 + + out_grad[idx + j] * sum_x2 * coef1_axis0; + } + }, + galois::loopname("d_l2_norm")); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/l2_norm_layer.cu b/libdeepgalois/src/layers/l2_norm_layer.cu index e600b6fbbb..ed86cf147d 100644 --- a/libdeepgalois/src/layers/l2_norm_layer.cu +++ b/libdeepgalois/src/layers/l2_norm_layer.cu @@ -3,17 +3,19 @@ namespace deepgalois { -void l2_norm_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void l2_norm_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t x = input_dims[0]; size_t y = input_dims[1]; l2_norm_gpu(x, y, in_data, out_data); } -void l2_norm_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void l2_norm_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { size_t x = input_dims[0]; size_t y = input_dims[1]; d_l2_norm_gpu(x, y, in_data, out_grad, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index f0ea5f591e..dd4357739f 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -3,27 +3,28 @@ namespace deepgalois { 
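// Plain-ASCII reading of the formulas in the comments below (eps is the
// epsilon_ leak slope passed to the constructor):
//   forward:  y = (x > 0) ? x : eps * x
//   backward: dL/dx = (y > 0) ? dL/dy : eps * dL/dy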
-leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, - dims_t in_dims, dims_t out_dims) +leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, + dims_t out_dims) : layer(level, in_dims, out_dims), epsilon_(eps) { assert(input_dims[0] == output_dims[0]); // num_vertices trainable_ = false; - n = input_dims[0] * input_dims[1]; - name_ = layer_type() + "_" + std::to_string(level); + n = input_dims[0] * input_dims[1]; + name_ = layer_type() + "_" + std::to_string(level); } #ifdef CPU_ONLY -// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต -void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { math::leaky_relu_cpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { + float_t* out_grad, float_t* in_grad) { math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cu b/libdeepgalois/src/layers/leaky_relu_layer.cu index 6fe4d005ac..a6271086e9 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cu +++ b/libdeepgalois/src/layers/leaky_relu_layer.cu @@ -3,16 +3,18 @@ namespace deepgalois { -// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? ๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต -void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +// ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต +void leaky_relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { leaky_relu_gpu(n, epsilon_, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™ * ฮต, ๐‘–๐‘“ (๐‘ฆ[๐‘™] โ‰ค 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘–๐‘“ (๐‘ฆ[๐‘™] > 0) -void leaky_relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void leaky_relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { d_leaky_relu_gpu(n, epsilon_, out_grad, in_data, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 9e54d64975..03cd0f4652 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -5,7 +5,8 @@ namespace deepgalois { #ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t n = input_dims[0] * input_dims[1]; math::relu_cpu(n, in_data, out_data); } @@ -19,4 +20,4 @@ void relu_layer::back_propagation(const float_t*, const float_t* out_data, } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cu b/libdeepgalois/src/layers/relu_layer.cu index 0d39a9dab2..d457c994ce 100644 --- a/libdeepgalois/src/layers/relu_layer.cu +++ b/libdeepgalois/src/layers/relu_layer.cu @@ -4,17 +4,19 @@ namespace deepgalois { // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) -void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void relu_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { const size_t count = input_dims[0] * input_dims[1]; relu_gpu(count, in_data, out_data); } // ๐œ•๐ฟ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = 0, ๐‘–๐‘“ (๐‘ฆ[๐‘™] < 0) // = ๐œ•๐ฟ / ๐œ•๐‘ฆ๐‘™, ๐‘œ๐‘กโ„Ž๐‘’๐‘Ÿ๐‘ค๐‘–๐‘ ๐‘’ -void relu_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t* out_grad, float_t* in_grad) { +void relu_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t* out_grad, + float_t* in_grad) { const size_t count = input_dims[0] * input_dims[1]; d_relu_gpu(count, out_grad, in_data, in_grad); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index d7ec46378e..d20f2a769b 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -13,53 +13,65 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -sigmoid_loss_layer::~sigmoid_loss_layer() { - delete[] loss; -} +sigmoid_loss_layer::~sigmoid_loss_layer() { delete[] loss; } void sigmoid_loss_layer::malloc_and_init() { loss = new float_t[input_dims[0]]; // error for each sample } inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { - //return context->get_label(i, j); - return labels[i*input_dims[1]+j]; + // return context->get_label(i, j); + return labels[i * input_dims[1] + j]; } -void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void sigmoid_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t len = input_dims[1]; - 
galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - // output is normalized input for this layer - math::sigmoid(len, &in_data[idx], &out_data[idx]); // normalize using sigmoid - // one hot encoded vector for the labels - float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); - // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete[] ground_truth; - } - }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-fw")); + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + size_t idx = len * i; + // output is normalized input for this layer + math::sigmoid(len, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + // one hot encoded vector for the labels + float_t* ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) + ground_truth[j] = (float_t)get_label(i, j); + // loss calculation + loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); + delete[] ground_truth; + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-fw")); } -void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t*, float_t* in_grad) { +void sigmoid_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - float_t *norm_grad = new float_t[len]; - float_t *ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) ground_truth[j] = (float_t)get_label(i, j); - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); - // derviative sigmoid to gradient used in the next layer - math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], norm_grad); - delete[] norm_grad; - delete[] ground_truth; - } - }, galois::chunk_size(), galois::steal(), galois::loopname("sigmoid-loss-bw")); + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + size_t idx = len * i; + float_t* norm_grad = new float_t[len]; + float_t* ground_truth = new float_t[len]; + for (size_t j = 0; j < len; j++) + ground_truth[j] = (float_t)get_label(i, j); + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad); + // derviative sigmoid to gradient used in the next layer + math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx], + norm_grad); + delete[] norm_grad; + delete[] ground_truth; + } + }, + galois::chunk_size(), galois::steal(), + galois::loopname("sigmoid-loss-bw")); } acc_t sigmoid_loss_layer::get_prediction_loss() { @@ -68,15 +80,19 @@ acc_t sigmoid_loss_layer::get_prediction_loss() { galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); + galois::do_all( + galois::iterate(layer::begin_, 
layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("getMaskedLoss")); assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } #endif -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cu b/libdeepgalois/src/layers/sigmoid_loss_layer.cu index f00689dfc9..0f5ff9cb69 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cu +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cu @@ -13,9 +13,7 @@ sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -sigmoid_loss_layer::~sigmoid_loss_layer() { - float_free_device(loss); -} +sigmoid_loss_layer::~sigmoid_loss_layer() { float_free_device(loss); } void sigmoid_loss_layer::malloc_and_init() { float_malloc_device(input_dims[0], loss); @@ -24,19 +22,19 @@ void sigmoid_loss_layer::malloc_and_init() { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, labels, loss, out_data); + sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); } void sigmoid_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - labels, out_data, in_grad); + d_sigmoid_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); } acc_t sigmoid_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index d40ff6d411..f3eb3ee969 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -13,9 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -softmax_loss_layer::~softmax_loss_layer() { - delete[] loss; -} +softmax_loss_layer::~softmax_loss_layer() { delete[] loss; } void softmax_loss_layer::malloc_and_init() { loss = new float_t[input_dims[0]]; // error for each sample @@ -23,44 +21,58 @@ void softmax_loss_layer::malloc_and_init() { inline label_t softmax_loss_layer::get_label(size_t i) { return labels[i]; - //return context->get_label(i); + // return context->get_label(i); } // TODO: need kernel fusion optimization // ๐‘ฆ[i] = ๐‘’^๐‘ฅ[i] / ฮฃ ๐‘’^๐‘ฅ[๐‘˜] -void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { +void softmax_loss_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { size_t len = input_dims[1]; - galois::do_all(galois::iterate(begin_, end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len*i], &out_data[len*i]); // normalize using softmax - // one hot encoded vector for the labels - vec_t groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = math::cross_entropy(len, &groundTruth[0], &out_data[len*i]); - } - }, galois::chunk_size<64>(), galois::steal(), 
galois::loopname("softmax-loss-fw")); + galois::do_all( + galois::iterate(begin_, end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + // output is normalized input for this layer + math::softmax(len, &in_data[len * i], + &out_data[len * i]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + groundTruth[get_label(i)] = 1.0; // one-hot + // loss calculation + loss[i] = + math::cross_entropy(len, &groundTruth[0], &out_data[len * i]); + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-fw")); // no sync required in distributed execution since no graph topology used // in this forward pass; only a post-process pretty much } -void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, - float_t*, float_t* in_grad) { +void softmax_loss_layer::back_propagation(const float_t* in_data, + const float_t* out_data, float_t*, + float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t len = layer::input_dims[1]; - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], &in_grad[len * i], &norm_grad[0]); - } - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("softmax-loss-bw")); + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i] == 1) { // masked + vec_t norm_grad(len); + std::vector groundTruth(len, 0.0); + groundTruth[get_label(i)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], + &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(len, &in_data[len * i], &out_data[len * i], + &in_grad[len * i], &norm_grad[0]); + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("softmax-loss-bw")); // no weight sync required: this is all local graph information } @@ -71,16 +83,21 @@ acc_t softmax_loss_layer::get_prediction_loss() { galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); - galois::do_all(galois::iterate(layer::begin_, layer::end_), [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; - } - }, galois::chunk_size<64>(), galois::steal(), galois::loopname("getMaskedLoss")); - //std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; + galois::do_all( + galois::iterate(layer::begin_, layer::end_), + [&](const auto& i) { + if (!use_mask || masks_[i]) { + total_loss += loss[i]; + valid_sample_count += 1; + } + }, + galois::chunk_size<64>(), galois::steal(), + galois::loopname("getMaskedLoss")); + // std::cout << "begin = " << begin_ << " end = " << end_ << " count = " << + // count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } #endif -} // namespace +} // namespace 
deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cu b/libdeepgalois/src/layers/softmax_loss_layer.cu index 59a955526b..20b7e659d8 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cu +++ b/libdeepgalois/src/layers/softmax_loss_layer.cu @@ -13,9 +13,7 @@ softmax_loss_layer::softmax_loss_layer(unsigned level, name_ = layer_type() + "_" + std::to_string(level); } -softmax_loss_layer::~softmax_loss_layer() { - float_free_device(loss); -} +softmax_loss_layer::~softmax_loss_layer() { float_free_device(loss); } void softmax_loss_layer::malloc_and_init() { float_malloc_device(input_dims[0], loss); @@ -24,19 +22,19 @@ void softmax_loss_layer::malloc_and_init() { void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { init_const_gpu(input_dims[0], 0.0, loss); - softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, - d_masks_, labels, loss, out_data); + softmax_cross_entropy_gpu(input_dims[1], begin_, end_, in_data, d_masks_, + labels, loss, out_data); } void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { - d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, - labels, out_data, in_grad); + d_softmax_cross_entropy_gpu(input_dims[1], begin_, end_, d_masks_, labels, + out_data, in_grad); } acc_t softmax_loss_layer::get_prediction_loss() { return masked_avg_loss_gpu(begin_, end_, count_, d_masks_, loss); } -} // namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 26811280a1..572f4e5662 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -10,7 +10,7 @@ bool LearningGraph::isLocal(index_t) { return true; } index_t LearningGraph::getLID(index_t) { return 0; } -bool LearningGraph::is_vertex_cut() {return true; } +bool LearningGraph::is_vertex_cut() { return true; } std::vector>& LearningGraph::getMirrorNodes() { return mirrorNodes; @@ -26,13 +26,14 @@ void LearningGraph::readGraph(std::string dataset) { } void LearningGraph::degree_counting() { - //if (degrees_ != NULL) return; - //degrees_ = new index_t[num_vertices_]; - galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), [&] (auto v) { - degrees_[v] = rowptr_[v+1] - rowptr_[v]; - }, galois::loopname("DegreeCounting")); + // if (degrees_ != NULL) return; + // degrees_ = new index_t[num_vertices_]; + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 2c630ca7ae..679a4b6d8a 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -15,17 +15,23 @@ void LearningGraph::dealloc() { CUDA_CHECK(cudaFree(d_colidx_)); CUDA_CHECK(cudaFree(d_rowptr_)); CUDA_CHECK(cudaFree(d_degrees_)); - if (edge_data_ != NULL) CUDA_CHECK(cudaFree(d_edge_data_)); - if (vertex_data_ != NULL) CUDA_CHECK(cudaFree(d_vertex_data_)); + if (edge_data_ != NULL) + CUDA_CHECK(cudaFree(d_edge_data_)); + if (vertex_data_ != NULL) + CUDA_CHECK(cudaFree(d_vertex_data_)); } void LearningGraph::allocOnDevice(bool no_edge_data__) { - if (d_colidx_ != NULL) return; - CUDA_CHECK(cudaMalloc((void **) &d_colidx_, num_edges_ * sizeof(index_t))); - CUDA_CHECK(cudaMalloc((void **) &d_rowptr_, (num_vertices_+1) * sizeof(index_t))); - 
//CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * sizeof(index_t))); - //if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) &edge_data__, num_edges_ * sizeof(edge_data___t))); - //CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * sizeof(vdata_t))); + if (d_colidx_ != NULL) + return; + CUDA_CHECK(cudaMalloc((void**)&d_colidx_, num_edges_ * sizeof(index_t))); + CUDA_CHECK( + cudaMalloc((void**)&d_rowptr_, (num_vertices_ + 1) * sizeof(index_t))); + // CUDA_CHECK(cudaMalloc((void **) &d_degrees_, num_vertices_ * + // sizeof(index_t))); if (!no_edge_data__) CUDA_CHECK(cudaMalloc((void **) + // &edge_data__, num_edges_ * sizeof(edge_data___t))); + // CUDA_CHECK(cudaMalloc((void **) &vertex_data__, num_vertices_ * + // sizeof(vdata_t))); is_device = true; } @@ -38,22 +44,34 @@ void LearningGraph::print_test() { void LearningGraph::copy_to_gpu() { allocOnDevice(edge_data_ == NULL); - CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); - CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), (num_vertices_+1) * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_colidx_, edge_dst_host_ptr(), + num_edges_ * sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_rowptr_, row_start_host_ptr(), + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyHostToDevice)); print_test(); - //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyHostToDevice)); - //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyHostToDevice)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyHostToDevice)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__, edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyHostToDevice)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__, vertex_data__, num_vertices_ + // * sizeof(vdata_t), cudaMemcpyHostToDevice)); } void LearningGraph::copy_to_cpu() { - CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, (num_vertices_+1) * sizeof(index_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * sizeof(index_t), cudaMemcpyDeviceToHost)); - //if (edge_data__ != NULL) CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * sizeof(edata_t), cudaMemcpyDeviceToHost)); - //CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(edge_dst_host_ptr(), d_colidx_, + num_edges_ * sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(row_start_host_ptr(), d_rowptr_, + (num_vertices_ + 1) * sizeof(index_t), + cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(degrees_ptr(), d_degrees_, num_vertices_ * + // sizeof(index_t), cudaMemcpyDeviceToHost)); if (edge_data__ != NULL) + // CUDA_CHECK(cudaMemcpy(copygraph.edge_data__ptr(), edge_data__, num_edges_ * + // sizeof(edata_t), cudaMemcpyDeviceToHost)); + // CUDA_CHECK(cudaMemcpy(copygraph.vertex_data__ptr(), vertex_data__, + // num_vertices_ * sizeof(vdata_t), cudaMemcpyDeviceToHost)); } void LearningGraph::degree_counting() {} -} +} // namespace 
deepgalois diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 0923411ff2..3b96341c66 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -10,17 +10,17 @@ #ifdef USE_MKL #include -#else // If use MKL, simply include the MKL header +#else // If use MKL, simply include the MKL header extern "C" { #include } #endif -#define NOT_IMPLEMENTED \ - do { \ - std::cout << "Not Implemented Yet";\ - exit(1); \ - } while(0); +#define NOT_IMPLEMENTED \ + do { \ + std::cout << "Not Implemented Yet"; \ + exit(1); \ + } while (0); /* #include @@ -39,7 +39,7 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { */ std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0,1.0); +std::uniform_real_distribution distribution(0.0, 1.0); namespace deepgalois { @@ -57,43 +57,48 @@ void sgemm_cpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, Tmatmul.start(); int lda = (TransA == CblasNoTrans) ? K : M; int ldb = (TransB == CblasNoTrans) ? N : K; - cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, beta, C, N); + cblas_sgemm(CblasRowMajor, TransA, TransB, M, N, K, alpha, A, lda, B, ldb, + beta, C, N); Tmatmul.stop(); } #ifdef USE_MKL void csrmm_cpu(const int M, const int N, const int K, const int, - const float alpha, float* A_nonzeros, int* A_idx_ptr, int* A_nnz_idx, - const float* B, const float beta, float* C) { + const float alpha, float* A_nonzeros, int* A_idx_ptr, + int* A_nnz_idx, const float* B, const float beta, float* C) { #else -void csrmm_cpu(const int, const int, const int, const int, - const float, float*, int*, int*, - const float*, const float, float*) { +void csrmm_cpu(const int, const int, const int, const int, const float, float*, + int*, int*, const float*, const float, float*) { #endif #ifdef USE_MKL - //mkl_set_num_threads(56); - //const char *matdescra = "GXXCX";//6 bytes - //const char transa = 'N'; - //mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); + // mkl_set_num_threads(56); + // const char *matdescra = "GXXCX";//6 bytes + // const char transa = 'N'; + // mkl_scsrmm(&transa, &M , &N, &K, &alpha, matdescra, A_nonzeros, A_nnz_idx, + // A_idx_ptr, A_idx_ptr+1, B, &N, &beta, C, &N); sparse_status_t status; - bool need_trans = false; - bool is_row_major = true; - sparse_matrix_t csrA = NULL; + bool need_trans = false; + bool is_row_major = true; + sparse_matrix_t csrA = NULL; sparse_index_base_t indexing = SPARSE_INDEX_BASE_ZERO; - sparse_layout_t layout = (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); - status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, A_idx_ptr + 1, A_nnz_idx, A_nonzeros); + sparse_layout_t layout = + (is_row_major ? SPARSE_LAYOUT_ROW_MAJOR : SPARSE_LAYOUT_COLUMN_MAJOR); + status = mkl_sparse_s_create_csr(&csrA, indexing, M, K, A_idx_ptr, + A_idx_ptr + 1, A_nnz_idx, A_nonzeros); if (status != SPARSE_STATUS_SUCCESS) { std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; exit(1); } - sparse_operation_t transa = (need_trans ? SPARSE_OPERATION_TRANSPOSE : SPARSE_OPERATION_NON_TRANSPOSE); + sparse_operation_t transa = (need_trans ? 
SPARSE_OPERATION_TRANSPOSE + : SPARSE_OPERATION_NON_TRANSPOSE); struct matrix_descr descrA; descrA.type = SPARSE_MATRIX_TYPE_GENERAL; - //descrA.mode = SPARSE_FILL_MODE_UPPER; - //descrA.diag = SPARSE_DIAG_NON_UNIT; - //mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); - //mkl_sparse_optimize(csrA); - status = mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); + // descrA.mode = SPARSE_FILL_MODE_UPPER; + // descrA.diag = SPARSE_DIAG_NON_UNIT; + // mkl_sparse_set_mm_hint(csrA, transa, descrA, layout, N, 1); + // mkl_sparse_optimize(csrA); + status = + mkl_sparse_s_mm(transa, alpha, csrA, descrA, layout, B, N, N, beta, C, N); if (status != SPARSE_STATUS_SUCCESS) { std::cout << "mkl_sparse_s_create_csr status :" << status << std::endl; exit(1); @@ -105,8 +110,9 @@ void csrmm_cpu(const int, const int, const int, const int, } // matrix-vector multiply -void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float alpha, - const float* A, const float* x, const float beta, float* y) { +void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, + const float alpha, const float* A, const float* x, const float beta, + float* y) { cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } @@ -123,7 +129,7 @@ inline void rng_uniform_cpu(size_t n, float_t* r) { for (size_t i = 0; i < n; ++i) { r[i] = distribution(generator); } - //galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + // galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { // unsigned short xi[3]; // r[i] = erand48(xi); //}, galois::loopname("randomMaskGen")); @@ -137,18 +143,15 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { #ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + b[i]; -#else - for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; -#endif + _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), +_mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + +b[i]; #else for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; #endif } #if defined(__AVX__) || defined(__AVX2__) -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { - const size_t alignedN = n - n % vec_len; - const __m256 scal = _mm256_set1_ps(alpha); - for (size_t i = 0; i < alignedN; i += vec_len) +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) +{ const size_t alignedN = n - n % vec_len; const __m256 scal = +_mm256_set1_ps(alpha); for (size_t i = 0; i < alignedN; i += vec_len) _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; } @@ -176,8 +179,8 @@ float_t l2_norm(size_t n, const float_t* in) { } #else // vector multiply scalar -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) { - for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; +void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) +{ for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; } float_t l2_norm(size_t n, const float_t* a) { @@ -195,10 +198,13 @@ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { #ifdef __AVX2__ const size_t alignedN = n - n % vec_len; for (size_t i = 0; i < alignedN; i += vec_len) - 
_mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); - for (size_t i = alignedN; i < n; ++i) y[i] = a[i] + b[i]; + _mm256_storeu_ps( + &y[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), _mm256_loadu_ps(&b[i]))); + for (size_t i = alignedN; i < n; ++i) + y[i] = a[i] + b[i]; #else - for (size_t i = 0; i < n; ++i) y[i] = a[i] + b[i]; + for (size_t i = 0; i < n; ++i) + y[i] = a[i] + b[i]; #endif #endif } @@ -212,7 +218,7 @@ void scale(size_t n, const float_t alpha, const float_t* x, float_t* y) { cblas_sscal(n, alpha, y, 1); } -void axpy(size_t n, const float_t a, float_t *x, float_t *y) { +void axpy(size_t n, const float_t a, float_t* x, float_t* y) { cblas_saxpy(n, a, x, 1, y, 1); } @@ -229,9 +235,7 @@ int argmax(const size_t n, const float_t* x) { } // l2 normalization -float_t l2_norm(size_t n, const float_t* x) { - return cblas_snrm2(n, x, 1); -} +float_t l2_norm(size_t n, const float_t* x) { return cblas_snrm2(n, x, 1); } // dot product float_t dot(size_t n, const float_t* x, const float_t* y) { @@ -239,13 +243,13 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { } void clear_cpu(size_t n, float_t* in) { - //for (size_t i = 0; i < n; i++) in[i] = 0; - std::fill(in, in+n, 0); + // for (size_t i = 0; i < n; i++) in[i] = 0; + std::fill(in, in + n, 0); // memset(in, 0, n*sizeof(float_t)); } -void dropout(size_t m, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { +void dropout(size_t m, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { for (size_t i = 0; i < m; ++i) masks[i] = bernoulli(dropout_rate); for (size_t i = 0; i < m; ++i) @@ -253,75 +257,88 @@ void dropout(size_t m, float scale, float dropout_rate, } void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { + const float_t* in, mask_t* masks, float_t* out) { size_t len = n * m; -/* -#ifdef USE_MKL - vec_t rands(len); - rng_uniform_cpu(len, &rands[0]); - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - masks[i] = rands[i] > dropout_rate ? 1 : 0; - }, galois::loopname("randomMaskGen")); -*/ -/* - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - auto idx = i * m; - vec_t rands(m); - rng_uniform_cpu(m, &rands[0]); - for (size_t j = 0; j < m; ++j) - masks[idx+j] = rands[j] > dropout_rate ? 1 : 0; - }, galois::loopname("dropout")); -#else -*/ + /* + #ifdef USE_MKL + vec_t rands(len); + rng_uniform_cpu(len, &rands[0]); + galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { + masks[i] = rands[i] > dropout_rate ? 1 : 0; + }, galois::loopname("randomMaskGen")); + */ + /* + galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { + auto idx = i * m; + vec_t rands(m); + rng_uniform_cpu(m, &rands[0]); + for (size_t j = 0; j < m; ++j) + masks[idx+j] = rands[j] > dropout_rate ? 
1 : 0; + }, galois::loopname("dropout")); + #else + */ for (size_t i = 0; i < len; ++i) { masks[i] = bernoulli(dropout_rate); } -//#endif - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - out[i] = in[i] * (float_t)masks[i] * scale; - }, galois::loopname("dropout")); + //#endif + galois::do_all( + galois::iterate((size_t)0, len), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("dropout")); } -void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { +void d_dropout(size_t m, float scale, const float_t* in, mask_t* masks, + float_t* out) { for (size_t i = 0; i < m; ++i) out[i] = in[i] * (float_t)masks[i] * scale; } void d_dropout_cpu(size_t n, size_t m, float scale, const float_t* in, mask_t* masks, float_t* out) { - galois::do_all(galois::iterate((size_t)0, n*m), [&](const auto& i) { - out[i] = in[i] * (float_t)masks[i] * scale; - }, galois::loopname("d_dropout")); + galois::do_all( + galois::iterate((size_t)0, n * m), + [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, + galois::loopname("d_dropout")); } void relu_cpu(size_t n, const float_t* in, float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = std::max(in[i], float_t(0)); - }, galois::chunk_size<64>(), galois::loopname("relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = std::max(in[i], float_t(0)); }, + galois::chunk_size<64>(), galois::loopname("relu")); } -void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out) { +void d_relu_cpu(size_t n, const float_t* in, const float_t* data, + float_t* out) { // TODO: vectorize // check if original data greater than 0; if so keep grad - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = data[i] > float_t(0) ? in[i] : float_t(0); - }, galois::chunk_size<64>(), galois::loopname("d_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = data[i] > float_t(0) ? in[i] : float_t(0); + }, + galois::chunk_size<64>(), galois::loopname("d_relu")); } -void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { +void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, + float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; - }, galois::chunk_size<64>(), galois::loopname("leaky_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { out[i] = in[i] > 0 ? in[i] : epsilon * in[i]; }, + galois::chunk_size<64>(), galois::loopname("leaky_relu")); } void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out) { // TODO: vectorize - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); - }, galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto& i) { + out[i] = in[i] * (data[i] > float_t(0) ? float_t(1) : epsilon); + }, + galois::chunk_size<64>(), galois::loopname("d_leaky_relu")); } void softmax(size_t n, const float_t* input, float_t* output) { @@ -368,9 +385,10 @@ void d_cross_entropy(size_t n, const float_t* y, const float_t* p, float_t* d) { } } -// use sigmoid instead of softmax for multi-class datasets, e.g. 
ppi, yelp and amazon -// inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + 0.5; } -inline float_t sigmoid_func(float_t x) { return 1./(1.+expf(-x)); } +// use sigmoid instead of softmax for multi-class datasets, e.g. ppi, yelp and +// amazon inline float_t sigmoid_func(float_t x) { return 0.5 * tanh(0.5 * x) + +// 0.5; } +inline float_t sigmoid_func(float_t x) { return 1. / (1. + expf(-x)); } // Sigmoid void sigmoid(size_t n, const float_t* in, float_t* out) { @@ -379,15 +397,16 @@ void sigmoid(size_t n, const float_t* in, float_t* out) { } } -void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, const float_t* dp) { +void d_sigmoid(size_t n, const float_t*, const float_t* p, float_t* dy, + const float_t* dp) { for (size_t i = 0; i < n; i++) { dy[i] = dp[i] * p[i] * (float_t(1) - p[i]); } } void copy_cpu(size_t n, const float_t* in, float_t* out) { - //std::copy(in, in + n, out); - //memcpy(out, in, sizeof(float_t) * n); + // std::copy(in, in + n, out); + // memcpy(out, in, sizeof(float_t) * n); cblas_scopy(n, in, 1, out, 1); } @@ -416,4 +435,3 @@ float reduce_mean(size_t n, const float_t* x) { } // end namespace math } // end namespace deepgalois - diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 449b597621..06d854d4b7 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -5,22 +5,25 @@ #include "cub/cub.cuh" #include -__global__ void init_const_kernel(int n, float_t value, float_t *array) { +__global__ void init_const_kernel(int n, float_t value, float_t* array) { CUDA_KERNEL_LOOP(i, n) { array[i] = value; } } -void init_const_gpu(int n, float_t value, float_t *array) { +void init_const_gpu(int n, float_t value, float_t* array) { init_const_kernel<<>>(n, value, array); CudaTest("solving init_const kernel failed"); } -__global__ void isnan_test(const int n, const float *data, bool *result) { - CUDA_KERNEL_LOOP(i, n) { if (isnan(data[i])) *result = true; } +__global__ void isnan_test(const int n, const float* data, bool* result) { + CUDA_KERNEL_LOOP(i, n) { + if (isnan(data[i])) + *result = true; + } } -bool isnan_gpu(int n, const float_t *array) { - bool *d_result, h_result = false; - cudaMalloc((void **)&d_result, sizeof (bool)); +bool isnan_gpu(int n, const float_t* array) { + bool *d_result, h_result = false; + cudaMalloc((void**)&d_result, sizeof(bool)); cudaMemcpy(d_result, &h_result, sizeof(bool), cudaMemcpyHostToDevice); isnan_test<<>>(n, array, d_result); CudaTest("solving init_const kernel failed"); @@ -29,11 +32,13 @@ bool isnan_gpu(int n, const float_t *array) { } void gpu_rng_uniform(size_t n, float_t* r) { - CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + CURAND_CHECK( + curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); } void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { - CURAND_CHECK(curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + CURAND_CHECK( + curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -41,15 +46,19 @@ void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { add_scalar_gpu(n, a, r); } -void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, n, mu, sigma)); +void gpu_rng_gaussian(const int n, const float_t 
mu, const float_t sigma, + float_t* r) { + CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, + n, mu, sigma)); } bool is_allocated_device(float_t* data) { - if (data == NULL) return false; + if (data == NULL) + return false; cudaPointerAttributes attributes; CUDA_CHECK(cudaPointerGetAttributes(&attributes, data)); - if (attributes.devicePointer != NULL) return true; + if (attributes.devicePointer != NULL) + return true; return false; } @@ -57,18 +66,18 @@ void float_malloc_device(int n, float_t*& ptr) { CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(float_t))); } -void float_free_device(float_t*& ptr) { - CUDA_CHECK(cudaFree(ptr)); -} +void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } -void float_copy_device(int n, float_t* h_ptr, float_t *d_ptr) { - CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { + CUDA_CHECK( + cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK( + cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } __global__ void setup_curand_kernel(const int n, curandState* state) { @@ -79,17 +88,17 @@ __global__ void setup_curand_kernel(const int n, curandState* state) { } __global__ void dropout_kernel(int n, float scale, float threshold, - float_t *rands, const float_t* in, - mask_t* masks, float_t* out) { + float_t* rands, const float_t* in, mask_t* masks, + float_t* out) { CUDA_KERNEL_LOOP(i, n) { - masks[i] = rands[i] > threshold ? 1 : 0; - out[i] = in[i] * masks[i] * scale; + masks[i] = rands[i] > threshold ? 1 : 0; + out[i] = in[i] * masks[i] * scale; } } -void dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, mask_t* masks, float_t* out) { - float_t *rands; +void dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + mask_t* masks, float_t* out) { + float_t* rands; float_malloc_device(n, rands); gpu_rng_uniform(n, rands); dropout_kernel<<>>( @@ -99,12 +108,13 @@ void dropout_gpu(int n, float scale, float dropout_rate, } __global__ void d_dropout_kernel(int n, float scale, float threshold, - const float_t* in, const mask_t* masks, float_t* out) { + const float_t* in, const mask_t* masks, + float_t* out) { CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] * masks[i] * scale; } } -void d_dropout_gpu(int n, float scale, float dropout_rate, - const float_t* in, const mask_t* masks, float_t* out) { +void d_dropout_gpu(int n, float scale, float dropout_rate, const float_t* in, + const mask_t* masks, float_t* out) { d_dropout_kernel<<>>( n, scale, dropout_rate, in, masks, out); CudaTest("solving d_dropout kernel failed"); @@ -138,21 +148,24 @@ __global__ void leaky_relu_kernel(const int n, const float_t epsilon, CUDA_KERNEL_LOOP(i, n) { out[i] = in[i] > 0 ? 
in[i] : epsilon * in[i]; } } -void leaky_relu_gpu(const int n, const float_t epsilon, - const float_t* in, float_t* out) { - leaky_relu_kernel<<>>(n, epsilon, in, out); +void leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in, + float_t* out) { + leaky_relu_kernel<<>>(n, epsilon, in, + out); CudaTest("solving leaky_relu kernel failed"); } -__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, - const float_t* in_diff, const float_t* data, float_t* out_diff) { +__global__ void d_leaky_relu_kernel(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { CUDA_KERNEL_LOOP(i, n) { out_diff[i] = in_diff[i] * (data[i] > 0 ? 1.0 : epsilon); } } -void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff, - const float_t* data, float_t* out_diff) { +void d_leaky_relu_gpu(const int n, const float_t epsilon, + const float_t* in_diff, const float_t* data, + float_t* out_diff) { d_leaky_relu_kernel<<>>( n, epsilon, in_diff, data, out_diff); CudaTest("solving d_leaky_relu kernel failed"); @@ -160,23 +173,23 @@ void d_leaky_relu_gpu(const int n, const float_t epsilon, const float_t* in_diff __global__ void matmul_kernel(int x, int y, int z, const float_t* A, const float_t* B, float_t* C) { - int row = blockIdx.x*blockDim.x+threadIdx.x; - int col = blockIdx.y*blockDim.y+threadIdx.y; - float_t sum = 0.0f; - if (row < x && col < y) { - for (int i = 0; i < z; i++) { - sum += A[row * z + i] * B[i * y + col]; - } - } - C[row * y + col] = sum; + int row = blockIdx.x * blockDim.x + threadIdx.x; + int col = blockIdx.y * blockDim.y + threadIdx.y; + float_t sum = 0.0f; + if (row < x && col < y) { + for (int i = 0; i < z; i++) { + sum += A[row * z + i] * B[i * y + col]; + } + } + C[row * y + col] = sum; } #define TILE_SZ 16 void matmul_gpu(const size_t x, const size_t y, const size_t z, - const float_t* A, const float_t* B, float_t* C) { + const float_t* A, const float_t* B, float_t* C) { dim3 threadsPerBlock(TILE_SZ, TILE_SZ); - dim3 blocksPerGrid((y-1)/TILE_SZ+1, (x-1)/TILE_SZ+1); - matmul_kernel<<>>(x, y, z, A, B, C); + dim3 blocksPerGrid((y - 1) / TILE_SZ + 1, (x - 1) / TILE_SZ + 1); + matmul_kernel<<>>(x, y, z, A, B, C); CudaTest("solving matmul kernel failed"); } @@ -190,8 +203,9 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, cuTransA, - N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); + CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, + cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, + N)); } void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, @@ -201,57 +215,60 @@ void matmul1D1D_gpu(const size_t dim_x, const size_t dim_y, const size_t dim_z, sgemm_gpu(TransA, TransB, dim_x, dim_y, dim_z, 1.0, A, B, 0.0, C); } -// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix for vertex -// feature tensor. However, since cusparse only supports column-major, while feature -// tensor is stored in row-major, the actual computation is: C = trans(A x trans(B)). -// Currently, we use cublasSgeam to implement transposition and allocate intermediate -// workspace memory (transpose_C) for this. 
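// A minimal sketch of the layout identity the comment above relies on
// (shapes and indices here are illustrative only): a row-major M x N buffer
// holds exactly the same bytes as a column-major N x M buffer containing the
// transpose, because with leading dimension N
//   buf[i * N + j]   // row-major element (i, j)
// is also the address of column-major element (j, i). So the row-major
// feature matrix B (K x N) is handed to cuSPARSE unchanged and read as B^T
// (N x K, column-major); requesting op(B) = transpose in the SpMM call then
// multiplies by B itself, yielding A * B in column-major order, and the single
// dense transpose done by cublasSgeam afterwards converts that result into the
// row-major C the rest of the code expects.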
-void csrmm_gpu(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, - const float* B, const float beta, float *transpose_C, float* C) { - //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::Context::cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, deepgalois::Context::cusparse_matdescr(), A_nonzeros, - A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); - //transpose C - const float one = 1.0; - const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); +// C = A x B, where A is a sparse matrix in CSR format, B is the dense matrix +// for vertex feature tensor. However, since cusparse only supports +// column-major, while feature tensor is stored in row-major, the actual +// computation is: C = trans(A x trans(B)). Currently, we use cublasSgeam to +// implement transposition and allocate intermediate workspace memory +// (transpose_C) for this. +void csrmm_gpu(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, const int* A_idx_ptr, + const int* A_nnz_idx, const float* B, const float beta, + float* transpose_C, float* C) { + // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << + // ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2( + deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, + deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, + A_nnz_idx, B, N, &beta, transpose_C, M)); + // transpose C + const float one = 1.0; + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, + M, C, N)); } /* -void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, - const float alpha, const float* A_nonzeros, - const int* A_idx_ptr, const int* A_nnz_idx, +void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, + const float alpha, const float* A_nonzeros, + const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float *transpose_C, float* C) { std::cout << "[debug]: csrmm_gpu\n"; cusparseSpMatDescr_t A_descr; - CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, A_nonzeros, - CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); + CUSPARSE_CHECK(cusparseCreateCsr(&A_descr, M, K, nnz, A_idx_ptr, A_nnz_idx, +A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); cusparseDnMatDescr_t B_descr; - CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, CUSPARSE_ORDER_COL)); - cusparseDnMatDescr_t C_descr; - CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); - size_t bufferSize; + CUSPARSE_CHECK(cusparseCreateDnMat(&B_descr, K, N, K, B, CUDA_R_32F, +CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; + CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, +CUSPARSE_ORDER_COL)); size_t bufferSize; CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), - CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, - CUDA_R_32F, 
CUSPARSE_COOMM_ALG1, &bufferSize)); + CUSPARSE_OPERATION_NON_TRANSPOSE, +CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); cudaDeviceSynchronize(); void* buffer = NULL; if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, - CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); - cudaDeviceSynchronize(); + (const void*)&alpha, A_descr, B_descr, (const void*)&beta, +C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); //transpose C const float one = 1.0; - const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, - N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); + const float zero = 0.0; + CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, +CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } //*/ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, @@ -259,16 +276,18 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, &alpha, A, - N, x, 1, &beta, y, 1)); + CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, + &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { - CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { - CUBLAS_CHECK(cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); + CUBLAS_CHECK( + cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { @@ -276,8 +295,10 @@ void asum_gpu(const int n, const float* x, float* y) { } void scale_gpu(const int n, const float alpha, const float* x, float* y) { - CUBLAS_CHECK(cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); - CUBLAS_CHECK(cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); + CUBLAS_CHECK( + cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + CUBLAS_CHECK( + cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -318,12 +339,13 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, } void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { - //axpy_kernel<<>>(n, a, x, y); - CUBLAS_CHECK(cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); + // axpy_kernel<<>>(n, a, x, y); + CUBLAS_CHECK( + cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } -__global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { +__global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { CUDA_KERNEL_LOOP(i, n) { float_t product = a[i] * a[i]; atomicAdd(sum, product); @@ -332,24 +354,25 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t *sum) { acc_t l2_norm_gpu(int 
n, const float_t* x) { float_t sum = 0.0; - CUBLAS_CHECK(cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); - //float_t *d_sum; - //CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); - //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); - //l2_norm_kernel<<>>(n, x, d_sum); - //CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), cudaMemcpyDeviceToHost)); + CUBLAS_CHECK( + cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + // float_t *d_sum; + // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); + // l2_norm_kernel<<>>(n, x, d_sum); + // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(float_t), + // cudaMemcpyDeviceToHost)); return (acc_t)sum / 2.0; } -void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t *out) { -} +void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out) {} -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, - float_t *in_diff, float_t *out_diff) { -} +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff) {} void copy_gpu(int len, const float_t* in, float_t* out) { - CUDA_CHECK(cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); + CUDA_CHECK( + cudaMemcpy(out, in, len * sizeof(float_t), cudaMemcpyDeviceToDevice)); } // TODO: use warp @@ -362,14 +385,15 @@ __device__ void softmax_device(int n, const float_t* input, float_t* output) { for (int i = 0; i < n; i++) { output[i] = expf(input[i] - max); denominator += output[i]; - if (output[i] < 0.0) printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); - //assert(output[i] >= 0.0); + if (output[i] < 0.0) + printf("in[%d]=%f, out[%d]=%f\n", i, input[i], i, output[i]); + // assert(output[i] >= 0.0); } assert(denominator != 0.0); for (int i = 0; i < n; i++) { output[i] /= denominator; - //assert(output[i] >= 0.0); - //assert(output[i] <= 1.0); + // assert(output[i] >= 0.0); + // assert(output[i] <= 1.0); } } @@ -378,18 +402,25 @@ __device__ void sigmoid_device(int n, const float_t* in, float_t* out) { out[i] = 1. / (1. 
+ expf(-in[i])); } -__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, float_t& loss) { - if (p[idx] == 0.0) loss -= logf(float_t(1e-10)); - else loss -= logf(p[idx]); +__device__ void cross_entropy_device(int n, const label_t idx, const float_t* p, + float_t& loss) { + if (p[idx] == 0.0) + loss -= logf(float_t(1e-10)); + else + loss -= logf(p[idx]); } // y: ground truth // p: predictions -__device__ void cross_entropy_multi_device(int n, const label_t *y, const float_t* p, float_t& loss) { +__device__ void cross_entropy_multi_device(int n, const label_t* y, + const float_t* p, float_t& loss) { for (int i = 0; i < n; i++) { - if (y[i] == 0) continue; - if (p[i] == float_t(0)) loss -= logf(float_t(1e-10)); // avoid NaN exception - else loss -= logf(p[i]); + if (y[i] == 0) + continue; + if (p[i] == float_t(0)) + loss -= logf(float_t(1e-10)); // avoid NaN exception + else + loss -= logf(p[i]); } } @@ -401,13 +432,13 @@ __global__ void softmax_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - // normalize using softmax - softmax_device(len, in_data + len*id, out_data + len*id); - //loss[id] = 0.0; - cross_entropy_device(len, labels[id], out_data + len*id, loss[id]); + // normalize using softmax + softmax_device(len, in_data + len * id, out_data + len * id); + // loss[id] = 0.0; + cross_entropy_device(len, labels[id], out_data + len * id, loss[id]); } } } @@ -415,8 +446,9 @@ __global__ void softmax_cross_entropy_kernel(int len, int begin, int end, void softmax_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - softmax_cross_entropy_kernel<<>>( - len, begin, end, in, masks, labels, loss, out); + softmax_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); CudaTest("solving softmax_cross_entropy kernel failed"); } @@ -428,11 +460,11 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out_data) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - sigmoid_device(len, in_data + len*id, out_data + len*id); - cross_entropy_multi_device(len, labels, out_data + len*id, loss[id]); + sigmoid_device(len, in_data + len * id, out_data + len * id); + cross_entropy_multi_device(len, labels, out_data + len * id, loss[id]); } } } @@ -440,64 +472,77 @@ __global__ void sigmoid_cross_entropy_kernel(int len, int begin, int end, void sigmoid_cross_entropy_gpu(int len, int begin, int end, const float_t* in, const mask_t* masks, const label_t* labels, float_t* loss, float_t* out) { - sigmoid_cross_entropy_kernel<<>>( - len, begin, end, in, masks, labels, loss, out); + sigmoid_cross_entropy_kernel<<>>(len, begin, end, in, masks, + labels, loss, out); CudaTest("solving sigmoid_cross_entropy kernel failed"); } -__device__ void d_cross_entropy_device(int n, const label_t idx, const float_t* p, float_t* d) { +__device__ void d_cross_entropy_device(int n, const label_t idx, + const float_t* p, float_t* d) { for (int i = 0; i < n; i++) { - if (i == (int)idx) d[i] = -1.0 / (p[i] + 1e-10); - else d[i] = 0.0; + if (i == (int)idx) + d[i] = -1.0 / (p[i] + 1e-10); + else + d[i] = 0.0; } } __global__ void 
d_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { + const mask_t* masks, + const label_t* labels, + const float_t* data, float_t* grad) { int base = begin * len; - CUDA_KERNEL_LOOP(i, (end-begin)*len) { - int id = begin + i/len; + CUDA_KERNEL_LOOP(i, (end - begin) * len) { + int id = begin + i / len; if (masks[id] == 1) { // masked - if (i%len == (int)labels[id]) grad[i] = -1.0 / (data[i+base] + 1e-10); - else grad[i] = 0.0; - //d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); + if (i % len == (int)labels[id]) + grad[i] = -1.0 / (data[i + base] + 1e-10); + else + grad[i] = 0.0; + // d_cross_entropy_device(len, labels[id], data + len*id, grad + len*i); } } -} +} __global__ void d_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, const label_t* labels, + const float_t* data, float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { if (pid == (int)labels[id]) - grad[wid*len+pid] = -1.0 / (p[warp_lane][pid] + 1e-10); - else grad[wid*len+pid] = 0.0; + grad[wid * len + pid] = -1.0 / (p[warp_lane][pid] + 1e-10); + else + grad[wid * len + pid] = 0.0; } } } } } -__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, float_t* dy) { +__device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, + float_t* dy) { for (int i = 0; i < n; i++) { dy[i] = 0; for (int j = 0; j < n; j++) { @@ -510,47 +555,52 @@ __device__ void d_softmax_device(int n, const float_t* p, const float_t* dp, flo __global__ void d_softmax_kernel(int len, int begin, int end, const mask_t* masks, const float_t* data, const float_t* in_grad, float_t* out_grad) { - CUDA_KERNEL_LOOP(i, end-begin) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { // masked - d_softmax_device(len, data + len*id, in_grad + len*i, out_grad + len*id); + d_softmax_device(len, data + len * id, in_grad + len * i, + out_grad + len * id); } } -} - -__global__ 
void d_softmax_warp(int len, int begin, int end, - const mask_t* masks, const float_t* data, - const float_t* in_grad, float_t* out_grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; +} + +__global__ void d_softmax_warp(int len, int begin, int end, const mask_t* masks, + const float_t* data, const float_t* in_grad, + float_t* out_grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - p[warp_lane][pid] = data[base+pid]; - d[warp_lane][pid] = in_grad[wid*len+pid]; + p[warp_lane][pid] = data[base + pid]; + d[warp_lane][pid] = in_grad[wid * len + pid]; } } __syncthreads(); for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t sum = 0.0; + float_t sum = 0.0; float_t self = p[warp_lane][pid]; for (int j = 0; j < len; j++) { - float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; sum += df * d[warp_lane][j]; } - out_grad[base+pid] = sum; + out_grad[base + pid] = sum; } } __syncthreads(); @@ -559,36 +609,44 @@ __global__ void d_softmax_warp(int len, int begin, int end, } __global__ void d_softmax_cross_entropy_kernel(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* out, float_t* diff) { - CUDA_KERNEL_LOOP(i, end-begin) { + const mask_t* masks, + const label_t* labels, + const float_t* out, + float_t* diff) { + CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; - if (masks[id] == 1) { // masked - float_t out_grad[41]; // TODO - d_cross_entropy_device(len, labels[id], out + len*id, out_grad); - d_softmax_device(len, out + len*id, out_grad, diff + len*id); + if (masks[id] == 1) { // masked + float_t out_grad[41]; // TODO + d_cross_entropy_device(len, labels[id], out + len * id, out_grad); + d_softmax_device(len, out + len * id, out_grad, diff + len * id); } } } __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); @@ -598,7 +656,8 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, if (pid < len) { if (pid == (int)labels[id]) d[warp_lane][pid] = -1.0 / (p[warp_lane][pid] + 1e-10); - else d[warp_lane][pid] = 0.0; + else + d[warp_lane][pid] = 0.0; } } __syncthreads(); @@ -607,13 +666,14 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t sum = 0.0; + float_t sum = 0.0; float_t self = p[warp_lane][pid]; for (int j = 0; j < len; j++) { - float_t df = (j == pid) ? self * (1.0 - self) : -p[warp_lane][j] * self; + float_t df = + (j == pid) ? 
self * (1.0 - self) : -p[warp_lane][j] * self; sum += df * d[warp_lane][j]; } - grad[base+pid] = sum; + grad[base + pid] = sum; } } __syncthreads(); @@ -624,42 +684,51 @@ __global__ void d_softmax_cross_entropy_warp(int len, int begin, int end, void d_softmax_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { -// d_softmax_cross_entropy_kernel<<>>( -// len, begin, end, masks, labels, out, diff); -// CudaTest("solving d_softmax_cross_entropy kernel failed"); - //float_t *grad; - //float_malloc_device((end-begin)*len, grad); - //d_cross_entropy_kernel<<>>( - //d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // d_softmax_cross_entropy_kernel<<>>( + // len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax_cross_entropy kernel failed"); + // float_t *grad; + // float_malloc_device((end-begin)*len, grad); + // d_cross_entropy_kernel<<>>( d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, + // BLOCK_SIZE>>>( // len, begin, end, masks, labels, out, grad); - //CudaTest("solving d_cross_entropy kernel failed"); - //d_softmax_kernel<<>>( - //d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( + // CudaTest("solving d_cross_entropy kernel failed"); + // d_softmax_kernel<<>>( + // d_softmax_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( // len, begin, end, masks, out, grad, diff); - //CudaTest("solving d_softmax kernel failed"); - d_softmax_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( - len, begin, end, masks, labels, out, diff); + // CudaTest("solving d_softmax kernel failed"); + d_softmax_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); CudaTest("solving d_softmax_cross_entropy_warp kernel failed"); } __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, - const mask_t* masks, const label_t* labels, - const float_t* data, float_t* grad) { - __shared__ float_t p[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - __shared__ float_t d[BLOCK_SIZE/WARP_SIZE][MAX_NUM_CLASSES]; - const int thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index - const int thread_lane = threadIdx.x & (WARP_SIZE-1); // thread index within the warp - const int warp_id = thread_id / WARP_SIZE; // global warp index - const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA - const int num_warps = (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps - - for (int wid = warp_id; wid < end-begin; wid += num_warps) { - int id = begin + wid; - int base = id * len; + const mask_t* masks, + const label_t* labels, + const float_t* data, + float_t* grad) { + __shared__ float_t p[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ float_t d[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + const int thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const int thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const int warp_id = thread_id / WARP_SIZE; // global warp index + const int warp_lane = threadIdx.x / WARP_SIZE; // warp index within the CTA + const int num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + for (int wid = warp_id; wid < end - begin; wid += num_warps) { + int id = begin + wid; + int base = id * len; if (masks[id] == 1) { for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; - if (pid < len) 
p[warp_lane][pid] = data[base+pid]; + if (pid < len) + p[warp_lane][pid] = data[base + pid]; } __syncthreads(); @@ -667,9 +736,10 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - //if (p[warp_lane][pid] == 0) - d[warp_lane][pid] = -(float_t)labels[base+pid] / (p[warp_lane][pid] + 1e-10); - //else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; + // if (p[warp_lane][pid] == 0) + d[warp_lane][pid] = + -(float_t)labels[base + pid] / (p[warp_lane][pid] + 1e-10); + // else d[warp_lane][pid] = -(float_t)labels[pid] / 1e-10; } } __syncthreads(); @@ -678,9 +748,9 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, for (int i = 0; i < len; i += WARP_SIZE) { int pid = thread_lane + i; if (pid < len) { - float_t self = p[warp_lane][pid]; - float_t dp = d[warp_lane][pid]; - grad[base+pid] = dp * self * (float_t(1) - self); + float_t self = p[warp_lane][pid]; + float_t dp = d[warp_lane][pid]; + grad[base + pid] = dp * self * (float_t(1) - self); } } __syncthreads(); @@ -691,13 +761,15 @@ __global__ void d_sigmoid_cross_entropy_warp(int len, int begin, int end, void d_sigmoid_cross_entropy_gpu(int len, int begin, int end, const mask_t* masks, const label_t* labels, const float_t* out, float_t* diff) { - d_sigmoid_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( - len, begin, end, masks, labels, out, diff); + d_sigmoid_cross_entropy_warp<<<(end - begin - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(len, begin, end, masks, labels, + out, diff); CudaTest("solving d_sigmoid_cross_entropy_warp kernel failed"); } __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, - float_t* loss, HGAccumulator total) { + float_t* loss, + HGAccumulator total) { total.thread_entry(); __shared__ cub::BlockReduce::TempStorage local_loss; CUDA_KERNEL_LOOP(i, end - begin) { @@ -707,8 +779,10 @@ __global__ void masked_avg_loss_kernel(int begin, int end, mask_t* masks, total.thread_exit>(local_loss); } -//acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* loss); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss) { +// acc_t masked_avg_loss(int begin, int end, int count, mask_t* masks, float_t* +// loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, + float_t* loss) { assert(count > 0); HGAccumulator loss_accum; Shared total_loss = Shared(1); @@ -720,4 +794,3 @@ acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* cudaDeviceSynchronize(); return *(total_loss.cpu_rd_ptr()) / count; } - diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/net.cpp index 381539df6b..ebd19639da 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/net.cpp @@ -11,19 +11,19 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST void Net::dist_init(Graph* graph, std::string dataset_str) { - dGraph = graph; - context = new deepgalois::DistContext(); + dGraph = graph; + context = new deepgalois::DistContext(); num_samples = dGraph->size(); context->saveGraph(dGraph); // TODO self loop setup? context->initializeSyncSubstrate(); num_classes = context->read_labels(); - //std::cout << "Reading label masks ... "; + // std::cout << "Reading label masks ... 
"; train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks+num_samples, 0); - std::fill(val_masks, val_masks+num_samples, 0); + val_masks = new mask_t[num_samples]; + std::fill(train_masks, train_masks + num_samples, 0); + std::fill(val_masks, val_masks + num_samples, 0); if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, @@ -41,40 +41,43 @@ void Net::dist_init(Graph* graph, std::string dataset_str) { } } } else { - train_count = context->read_masks("train", num_samples, train_begin, train_end, train_masks, dGraph); - val_count = context->read_masks("val", num_samples, val_begin, val_end, val_masks, dGraph); + train_count = context->read_masks("train", num_samples, train_begin, + train_end, train_masks, dGraph); + val_count = context->read_masks("val", num_samples, val_begin, val_end, + val_masks, dGraph); } feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers+1] = num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers-1] = num_classes; // MLP embedding: E - feature_dims[num_layers] = num_classes; // normalized output embedding: E + feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) + feature_dims[num_conv_layers + 1] = + num_classes; // l2 normalized embedding: E + if (has_dense) + feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } #endif #ifdef CPU_ONLY void Net::init() { - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); } // add weight decay void Net::regularize() { size_t layer_id = 0; - auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; // TODO: parallel - math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), - layers[layer_id]->get_grads_ptr()); + math::axpy(n, weight_decay, layers[layer_id]->get_weights_ptr(), + layers[layer_id]->get_grads_ptr()); } // Scale gradient to counterbalance accumulation -void Net::normalize() { -} +void Net::normalize() {} /** * @@ -82,7 +85,9 @@ void Net::normalize() { * @param end GLOBAL end * @param count GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { #ifndef GALOIS_USE_DIST galois::GAccumulator accuracy_all; #else @@ -93,32 +98,37 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks accuracy_all.reset(); - galois::do_all(galois::iterate(begin, end), [&](const auto& i) { + galois::do_all( + galois::iterate(begin, end), + [&](const auto& i) { #ifndef GALOIS_USE_DIST - if (masks == NULL || masks[i] == 1) { // use sampled graph when masks is NULL - // get prediction - auto pred = math::argmax(num_classes, preds+i*num_classes); - // check prediction - if ((label_t)pred == ground_truth[i]) - accuracy_all += 1.0; - } + if (masks == NULL || + masks[i] == 1) { // use sampled graph when 
masks is NULL + // get prediction + auto pred = math::argmax(num_classes, preds + i * num_classes); + // check prediction + if ((label_t)pred == ground_truth[i]) + accuracy_all += 1.0; + } #else - // only look at owned nodes (i.e. masters); the prediction for these - // should only be handled on the owner - if (dGraph->isOwned(i)) { - sampleCount += 1; - - uint32_t localID = dGraph->getLID(i); - if (masks[localID] == 1) { - // get prediction - auto pred = math::argmax(num_classes, &preds[localID*num_classes]); - // check prediction - if ((label_t)pred == ground_truth[localID]) - accuracy_all += 1.0; - } - } + // only look at owned nodes (i.e. masters); the prediction for these + // should only be handled on the owner + if (dGraph->isOwned(i)) { + sampleCount += 1; + + uint32_t localID = dGraph->getLID(i); + if (masks[localID] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; + } + } #endif - }, galois::loopname("getMaskedLoss")); + }, + galois::loopname("getMaskedLoss")); #ifdef GALOIS_USE_DIST count = sampleCount.reduce(); @@ -129,8 +139,11 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks return accuracy_all.reduce() / (acc_t)count; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { - return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, + ground_truth, preds); } #endif diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index 98a5e82010..cd635ef07f 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -7,7 +7,7 @@ // the arguments of the maxima __device__ int argmax_device(const int n, const float_t* x) { - float_t max = x[0]; + float_t max = x[0]; int max_ind = 0; for (int i = 1; i < n; i++) { if (x[i] > max) { @@ -18,15 +18,17 @@ __device__ int argmax_device(const int n, const float_t* x) { return max_ind; } -__global__ void masked_accuracy_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, +__global__ void masked_accuracy_kernel(int num_classes, int begin, int end, + mask_t* masks, float_t* preds, + label_t* labels, HGAccumulator total) { total.thread_entry(); - __shared__ cub::BlockReduce::TempStorage local_accuracy; + __shared__ cub::BlockReduce::TempStorage + local_accuracy; CUDA_KERNEL_LOOP(i, end - begin) { if (masks[begin + i] == 1) { - label_t pred = (label_t)argmax_device(num_classes, preds + (begin + i) * num_classes); + label_t pred = (label_t)argmax_device(num_classes, + preds + (begin + i) * num_classes); if (pred == labels[begin + i]) total.reduce(1.0); } @@ -49,13 +51,11 @@ acc_t masked_accuracy_gpu(int num_classes, int begin, int end, int count, } typedef float f1count_t; -__global__ void masked_f1_score_kernel(int num_classes, int begin, - int end, mask_t* masks, - float_t* preds, label_t* labels, - f1count_t* true_positive, - f1count_t* false_positive, - f1count_t* false_negtive, - f1count_t* true_negtive) { +__global__ void +masked_f1_score_kernel(int num_classes, int begin, int end, mask_t* masks, + float_t* preds, label_t* labels, + f1count_t* true_positive, f1count_t*
false_positive, + f1count_t* false_negtive, f1count_t* true_negtive) { CUDA_KERNEL_LOOP(i, end - begin) { int id = begin + i; if (masks[id] == 1) { @@ -83,7 +83,7 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, f1count_t* h_fp = new f1count_t[num_classes]; f1count_t* h_fn = new f1count_t[num_classes]; f1count_t* h_tn = new f1count_t[num_classes]; - f1count_t* d_tp, *d_fp, *d_fn, *d_tn; + f1count_t *d_tp, *d_fp, *d_fn, *d_tn; float_malloc_device(num_classes, d_tp); float_malloc_device(num_classes, d_fp); float_malloc_device(num_classes, d_fn); @@ -95,41 +95,45 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, masked_f1_score_kernel<<>>( num_classes, begin, end, masks, preds, labels, d_tp, d_fp, d_fn, d_tn); CudaTest("solving masked_f1_score_kernel kernel failed"); - CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), cudaMemcpyDeviceToHost)); - - acc_t pNumerator = 0.0; - acc_t pDenominator = 0.0; - acc_t rNumerator = 0.0; - acc_t rDenominator = 0.0; + CUDA_CHECK(cudaMemcpy(h_tp, d_tp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fp, d_fp, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_fn, d_fn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(h_tn, d_tn, num_classes * sizeof(f1count_t), + cudaMemcpyDeviceToHost)); + + acc_t pNumerator = 0.0; + acc_t pDenominator = 0.0; + acc_t rNumerator = 0.0; + acc_t rDenominator = 0.0; acc_t precisionMacro = 0.0; - acc_t recallMacro = 0.0; + acc_t recallMacro = 0.0; for (size_t i = 0; i < num_classes; i++) { acc_t fn = (acc_t)h_fn[i]; // false negative acc_t fp = (acc_t)h_fp[i]; // false positive - acc_t tp = (acc_t)h_tp[i]; // true positive - //acc_t tn = (acc_t)h_tn[i]; // true negative + acc_t tp = (acc_t)h_tp[i]; // true positive + // acc_t tn = (acc_t)h_tn[i]; // true negative precisionMacro = precisionMacro + (tp / (tp + fp)); - recallMacro = recallMacro + (tp / (tp + fn)); - pNumerator = pNumerator + tp; - pDenominator = pDenominator + (tp + fp); - rNumerator = rNumerator + tp; - rDenominator = rDenominator + (tp + fn); + recallMacro = recallMacro + (tp / (tp + fn)); + pNumerator = pNumerator + tp; + pDenominator = pDenominator + (tp + fp); + rNumerator = rNumerator + tp; + rDenominator = rDenominator + (tp + fn); } precisionMacro = precisionMacro / num_classes; - recallMacro = recallMacro / num_classes; - acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / + recallMacro = recallMacro / num_classes; + acc_t f1_macro = (((beta * beta) + 1) * precisionMacro * recallMacro) / ((beta * beta) * precisionMacro + recallMacro); - acc_t recallMicro = rNumerator / rDenominator; + acc_t recallMicro = rNumerator / rDenominator; acc_t precisionMicro = pNumerator / pDenominator; - acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / + acc_t f1_micro = (((beta * beta) + 1) * precisionMicro * recallMicro) / ((beta * beta) * precisionMicro + recallMicro); - std::cout << std::setprecision(3) << std::fixed << - " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; - + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro << ",
f1_macro: " << f1_macro << ") "; + float_free_device(d_tp); float_free_device(d_fp); float_free_device(d_fn); @@ -146,7 +150,8 @@ namespace deepgalois { void Net::init() { copy_masks_device(num_samples, train_masks, d_train_masks); copy_masks_device(num_samples, val_masks, d_val_masks); - context->copy_data_to_device(); // copy labels and input features to the device + context + ->copy_data_to_device(); // copy labels and input features to the device } void Net::copy_test_masks_to_device() { @@ -156,21 +161,25 @@ void Net::copy_test_masks_to_device() { // add weight decay void Net::regularize() { size_t layer_id = 0; - auto n = feature_dims[layer_id] * feature_dims[layer_id+1]; - axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), - layers[layer_id]->get_grads_device_ptr()); + auto n = feature_dims[layer_id] * feature_dims[layer_id + 1]; + axpy_gpu(n, weight_decay, layers[layer_id]->get_weights_device_ptr(), + layers[layer_id]->get_grads_device_ptr()); } void Net::normalize() {} -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, label_t* ground_truth) { - return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, ground_truth); +acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_accuracy_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, label_t* ground_truth) { - return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, ground_truth); +acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, + mask_t* masks, float_t* preds, + label_t* ground_truth) { + return masked_f1_score_gpu(num_classes, begin, end, count, masks, preds, + ground_truth); } -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/node.cpp b/libdeepgalois/src/node.cpp index fbd8d2bc6a..e5e9fa7c10 100644 --- a/libdeepgalois/src/node.cpp +++ b/libdeepgalois/src/node.cpp @@ -10,7 +10,8 @@ void edge::alloc() { void edge::merge_grads(float_t* dst) { assert(grad_ != NULL); - if(dst) delete[] dst; + if (dst) + delete[] dst; dst = new float_t[ft_dim_]; std::copy(grad_, grad_ + ft_dim_, dst); // @todo consider adding parallelism and vectorization diff --git a/libdeepgalois/src/node.cu b/libdeepgalois/src/node.cu index afaceaeaea..2151162752 100644 --- a/libdeepgalois/src/node.cu +++ b/libdeepgalois/src/node.cu @@ -5,17 +5,20 @@ namespace deepgalois { void edge::alloc() { - CUDA_CHECK(cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); - CUDA_CHECK(cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&data_, num_samples_ * ft_dim_ * sizeof(float_t))); + CUDA_CHECK( + cudaMalloc((void**)&grad_, num_samples_ * ft_dim_ * sizeof(float_t))); } void edge::merge_grads(float_t* dst) { - CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&dst, grad_, ft_dim_ * sizeof(float_t), + cudaMemcpyDeviceToHost)); } void edge::clear_grads() { - //CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); + // CUDA_CHECK(cudaMemset(grad_, 0, num_samples_ * ft_dim_ * sizeof(float_t))); init_const_gpu(num_samples_ * ft_dim_, 0.0, grad_); } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp 
index a73b5cd6d2..e8455e9206 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -6,37 +6,46 @@ namespace deepgalois { void adagrad::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - }, galois::loopname("adagrad_update")); -/* - for (size_t i = 0; i < W.size(); i++) { - g[i] += dW[i] * dW[i]; - W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); - } -*/ + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + }, + galois::loopname("adagrad_update")); + /* + for (size_t i = 0; i < W.size(); i++) { + g[i] += dW[i] * dW[i]; + W[i] -= alpha * dW[i] / (std::sqrt(g[i]) + eps); + } + */ } void RMSprop::update(const vec_t& dW, vec_t& W) { vec_t& g = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), - [&](const auto& i) { - g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; - W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); - }, galois::loopname("rms_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + g[i] = mu * g[i] + (1 - mu) * dW[i] * dW[i]; + W[i] -= alpha * dW[i] / std::sqrt(g[i] + eps); + }, + galois::loopname("rms_update")); } void adam::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& vt = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; - // L2 norm based update rule - W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / - std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); - }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + vt[i] = b2 * vt[i] + (float_t(1) - b2) * dW[i] * dW[i]; + // L2 norm based update rule + W[i] -= alpha * (mt[i] / (float_t(1) - b1_t)) / + std::sqrt((vt[i] / (float_t(1) - b2_t)) + eps); + }, + galois::chunk_size<256>(), galois::steal(), + galois::loopname("adam_update")); b1_t *= b1; b2_t *= b2; } @@ -44,37 +53,47 @@ void adam::update(const vec_t& dW, vec_t& W) { void adamax::update(const vec_t& dW, vec_t& W) { vec_t& mt = get<0>(W); vec_t& ut = get<1>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; - ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); - // Lp norm based update rule - W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); - }, galois::loopname("adamax_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + mt[i] = b1 * mt[i] + (float_t(1) - b1) * dW[i]; + ut[i] = std::max(b2 * ut[i], std::abs(dW[i])); + // Lp norm based update rule + W[i] -= (alpha / (1.0 - b1_t)) * (mt[i] / (ut[i] + eps)); + }, + galois::loopname("adamax_update")); b1_t *= b1; } void gradient_descent::update(const vec_t& dW, vec_t& W) { - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); - }, galois::loopname("gradient_descent_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { W[i] = W[i] - alpha * (dW[i] + lambda * W[i]); }, + galois::loopname("gradient_descent_update")); } void momentum::update(const 
vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += V; - dWprev[i] = V; - }, galois::loopname("momentum_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += V; + dWprev[i] = V; + }, + galois::loopname("momentum_update")); } void nesterov_momentum::update(const vec_t& dW, vec_t& W) { vec_t& dWprev = get<0>(W); - galois::do_all(galois::iterate((size_t)0, W.size()), [&](const auto& i) { - float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); - W[i] += (-mu) * dWprev[i] + (1 + mu) * V; - dWprev[i] = V; - }, galois::loopname("nesterov_momentum_update")); + galois::do_all( + galois::iterate((size_t)0, W.size()), + [&](const auto& i) { + float_t V = mu * dWprev[i] - alpha * (dW[i] + W[i] * lambda); + W[i] += (-mu) * dWprev[i] + (1 + mu) * V; + dWprev[i] = V; + }, + galois::loopname("nesterov_momentum_update")); } } // namespace deepgalois diff --git a/libdeepgalois/src/optimizer.cu b/libdeepgalois/src/optimizer.cu index 0fd16803fd..15f2fe5515 100644 --- a/libdeepgalois/src/optimizer.cu +++ b/libdeepgalois/src/optimizer.cu @@ -3,14 +3,14 @@ #include "deepgalois/math_functions.hh" __global__ void update_kernel(const int n, float_t alpha, float_t b1, - float_t b2, float_t b1_t, float_t b2_t, - float_t eps, float_t* mt, float_t* vt, - const float_t* dW, float_t* W) { + float_t b2, float_t b1_t, float_t b2_t, + float_t eps, float_t* mt, float_t* vt, + const float_t* dW, float_t* W) { CUDA_KERNEL_LOOP(i, n) { mt[i] = b1 * mt[i] + (1.0 - b1) * dW[i]; vt[i] = b2 * vt[i] + (1.0 - b2) * dW[i] * dW[i]; - W[i] -= alpha * (mt[i] / (1.0 - b1_t)) / - sqrtf((vt[i] / (1.0 - b2_t)) + eps); + W[i] -= + alpha * (mt[i] / (1.0 - b1_t)) / sqrtf((vt[i] / (1.0 - b2_t)) + eps); } } @@ -18,7 +18,7 @@ namespace deepgalois { template template -float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { +float_t* stateful_optimizer::get_gpu(const size_t n, const float_t* key) { static_assert(Index < N, "index out of range"); if (!is_allocated_device(dE_[Index][key])) { float_malloc_device(n, dE_[Index][key]); @@ -29,9 +29,9 @@ float_t* stateful_optimizer::get_gpu(const size_t n, const float_t *key) { void adam::update(const vec_t& dW, vec_t& W) {} void adam::update_gpu(const size_t n, const float_t* dW, float_t* W) { - //std::cout << "updating weights on GPU, n = " << n << "\n"; - //print_device_vector(10, dW, "dW"); - float_t* cache = get_gpu<0>(n, W); + // std::cout << "updating weights on GPU, n = " << n << "\n"; + // print_device_vector(10, dW, "dW"); + float_t* cache = get_gpu<0>(n, W); float_t* velocity = get_gpu<1>(n, W); update_kernel<<>>( @@ -52,4 +52,4 @@ void momentum::update_gpu(const size_t, const float_t*, float_t*) {} void nesterov_momentum::update_gpu(const size_t, const float_t*, float_t*) {} -} +} // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 519e27496a..29f729f3a4 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -4,8 +4,8 @@ #include #include #include -#include /* For O_RDWR */ -#include /* For open(), creat() */ +#include /* For O_RDWR */ +#include /* For open(), creat() */ #include #include @@ -27,10 +27,13 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in >> m >> num_classes >> std::ws; if (is_single_class) { 
std::cout << "Using single-class (one-hot) labels\n"; - labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 + labels = + new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { std::cout << "Using multi-class labels\n"; - labels = new label_t[m*num_classes]; // multi-class label for each vertex: N x E + labels = + new label_t[m * + num_classes]; // multi-class label for each vertex: N x E } unsigned v = 0; while (std::getline(in, line)) { @@ -44,7 +47,7 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { break; } } else { - labels[v*num_classes+idx] = x; + labels[v * num_classes + idx] = x; } } v++; @@ -54,14 +57,15 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { // print the number of vertex classes std::cout << "Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << unsigned(labels[i]) << "\n"; + // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << + // unsigned(labels[i]) << "\n"; return num_classes; } //! Read features, return the length of a feature vector //! Features are stored in the Context class size_t Reader::read_features(float_t*& feats, std::string filetype) { - //filetype = "txt"; + // filetype = "txt"; std::cout << "Reading features ... "; Timer t_read; t_read.Start(); @@ -83,7 +87,7 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { feats = new float_t[m * feat_len]; if (filetype == "bin") { filename = path + dataset_str + "-feats.bin"; - in.open(filename, std::ios::binary|std::ios::in); + in.open(filename, std::ios::binary | std::ios::in); in.read((char*)feats, sizeof(float_t) * m * feat_len); } else { std::string line; @@ -101,15 +105,17 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { t_read.Stop(); std::cout << "Done, feature length: " << feat_len << ", time: " << t_read.Millisecs() << " ms\n"; - //for (auto i = 0; i < 6; i ++) - //for (auto j = 0; j < 6; j ++) - //std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << "\n"; + // for (auto i = 0; i < 6; i ++) + // for (auto j = 0; j < 6; j ++) + // std::cout << "feats[" << i << "][" << j << "] = " << feats[i*feat_len+j] << + // "\n"; return feat_len; } //! Get masks from datafile where first line tells range of //! 
set to create mask from -size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { +size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, + size_t& end, mask_t* masks) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -142,24 +148,25 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t i++; } std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count/(float)n*(float)100 << "\%)\n"; + << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } void Reader::progressPrint(unsigned max, unsigned i) { const unsigned nsteps = 10; - unsigned ineachstep = (max / nsteps); - if(ineachstep == 0) ineachstep = 1; + unsigned ineachstep = (max / nsteps); + if (ineachstep == 0) + ineachstep = 1; if (i % ineachstep == 0) { - int progress = ((size_t) i * 100) / max + 1; + int progress = ((size_t)i * 100) / max + 1; printf("\t%3d%%\r", progress); fflush(stdout); } } -void Reader::readGraphFromGRFile(Graph *g) { +void Reader::readGraphFromGRFile(Graph* g) { std::string filename = path + dataset_str + ".csgr"; std::ifstream ifs; ifs.open(filename); @@ -175,7 +182,7 @@ void Reader::readGraphFromGRFile(Graph *g) { exit(1); } size_t masterLength = buf.st_size; - int _MAP_BASE = MAP_PRIVATE; + int _MAP_BASE = MAP_PRIVATE; void* m = mmap(0, masterLength, PROT_READ, _MAP_BASE, masterFD, 0); if (m == MAP_FAILED) { m = 0; @@ -185,18 +192,19 @@ void Reader::readGraphFromGRFile(Graph *g) { Timer t; t.Start(); - uint64_t* fptr = (uint64_t*)m; + uint64_t* fptr = (uint64_t*)m; __attribute__((unused)) uint64_t version = le64toh(*fptr++); assert(version == 1); uint64_t sizeEdgeTy = le64toh(*fptr++); - uint64_t nv = le64toh(*fptr++); - uint64_t ne = le64toh(*fptr++); - uint64_t *outIdx = fptr; + uint64_t nv = le64toh(*fptr++); + uint64_t ne = le64toh(*fptr++); + uint64_t* outIdx = fptr; fptr += nv; - uint32_t *fptr32 = (uint32_t*)fptr; - uint32_t *outs = fptr32; + uint32_t* fptr32 = (uint32_t*)fptr; + uint32_t* outs = fptr32; fptr32 += ne; - if (ne % 2) fptr32 += 1; + if (ne % 2) + fptr32 += 1; if (sizeEdgeTy != 0) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); @@ -206,12 +214,13 @@ void Reader::readGraphFromGRFile(Graph *g) { auto rowptr = g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { g->fixEndEdge(vid, le64toh(outIdx[vid])); - auto degree = rowptr[vid+1] - rowptr[vid]; + auto degree = rowptr[vid + 1] - rowptr[vid]; for (unsigned jj = 0; jj < degree; ++jj) { unsigned eid = rowptr[vid] + jj; unsigned dst = le32toh(outs[eid]); if (dst >= nv) { - printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, eid); + printf("\tinvalid edge from %d to %d at index %d(%d).\n", vid, dst, jj, + eid); exit(0); } g->constructEdge(eid, dst); @@ -220,30 +229,30 @@ void Reader::readGraphFromGRFile(Graph *g) { } ifs.close(); -/* - std::string file_dims = path + dataset + "-dims.bin"; - std::string file_rowptr = path + dataset + "-rowptr.bin"; - std::string file_colidx = path + dataset + "-colidx.bin"; - index_t dims[2]; - ifs.open(file_dims, std::ios::binary|std::ios::in); - ifs.read((char*)dims, sizeof(index_t) * 2); - ifs.close(); - num_vertices_ = dims[0]; - num_edges_ = dims[1]; - degrees_ = new index_t[num_vertices_]; - rowptr_ = new 
index_t[num_vertices_+1]; - colidx_ = new index_t[num_edges_]; - ifs.open(file_rowptr, std::ios::binary|std::ios::in); - ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); - ifs.close(); - ifs.open(file_colidx, std::ios::binary|std::ios::in); - ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); - ifs.close(); -*/ + /* + std::string file_dims = path + dataset + "-dims.bin"; + std::string file_rowptr = path + dataset + "-rowptr.bin"; + std::string file_colidx = path + dataset + "-colidx.bin"; + index_t dims[2]; + ifs.open(file_dims, std::ios::binary|std::ios::in); + ifs.read((char*)dims, sizeof(index_t) * 2); + ifs.close(); + num_vertices_ = dims[0]; + num_edges_ = dims[1]; + degrees_ = new index_t[num_vertices_]; + rowptr_ = new index_t[num_vertices_+1]; + colidx_ = new index_t[num_edges_]; + ifs.open(file_rowptr, std::ios::binary|std::ios::in); + ifs.read((char*)rowptr_, sizeof(index_t) * (num_vertices_+1)); + ifs.close(); + ifs.open(file_colidx, std::ios::binary|std::ios::in); + ifs.read((char*)colidx_, sizeof(index_t) * num_edges_); + ifs.close(); + */ t.Stop(); double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength/1000.0/runtime << " MB/s)\n\n"; + std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } -} +} // namespace deepgalois diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index f61f1bcaa4..0ac77526f3 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -1,113 +1,134 @@ #include "deepgalois/utils.h" #include "deepgalois/sampler.h" #include "galois/Galois.h" -#include +#include #include #define PARALLEL_GEN namespace deepgalois { -inline unsigned getDegree(Graph *g, index_t v) { - //return g->get_degree(v); - //return std::distance(g->edge_begin(v), g->edge_end(v)); +inline unsigned getDegree(Graph* g, index_t v) { + // return g->get_degree(v); + // return std::distance(g->edge_begin(v), g->edge_end(v)); return g->edge_end(v) - g->edge_begin(v); } -void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, mask_t *masks, Graph *g) { - //galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", count=", count, "\n"); +void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, + mask_t* masks, Graph* g) { + // galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", + // count=", count, "\n"); begin_ = begin; - end_ = end; + end_ = end; count_ = count; masks_ = masks; - graph = g; + graph = g; #ifndef GALOIS_USE_DIST masked_graph = new Graph(); #endif - //generate_masked_graph(g->size(), masks, g, *masked_graph); + // generate_masked_graph(g->size(), masks, g, *masked_graph); std::vector degrees(g->size(), 0); get_masked_degrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); - size_t ne = offsets[g->size()]; + size_t ne = offsets[g->size()]; for (size_t i = 0; i < g->size(); i++) { - if (masks[i] == 1) node_train.push_back(i); + if (masks[i] == 1) + node_train.push_back(i); } masked_graph->allocateFrom(g->size(), ne); masked_graph->constructNodes(); - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - masked_graph->fixEndEdge(src, offsets[src+1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) masked_graph->constructEdge(idx++, 
dst, 0); - } - } - }, galois::loopname("gen_subgraph")); + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + masked_graph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + masked_graph->constructEdge(idx++, dst, 0); + } + } + }, + galois::loopname("gen_subgraph")); masked_graph->degree_counting(); - avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + avg_deg = masked_graph->sizeEdges() / masked_graph->size(); subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - //galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); + // galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " + // num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); size_t idx = 0; vertices_.resize(count); for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) vertices_[idx++] = i; + if (masks_[i] == 1) + vertices_[idx++] = i; } } -void Sampler::get_masked_degrees(size_t n, mask_t *masks, Graph *g, std::vector °rees) { +void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, + std::vector& degrees) { assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) degrees[src] ++; + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + degrees[src]++; + } + } } - } - } #ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); + , + galois::loopname("update_degrees")); #endif } -void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& sub) { +void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, + Graph& sub) { std::vector degrees(n, 0); get_masked_degrees(n, masks, g, degrees); - //auto offsets = deepgalois::parallel_prefix_sum(degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + size_t ne = offsets[n]; + // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", + // ne, "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(n, ne); sub.constructNodes(); #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, n), [&](const auto src) { + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - sub.fixEndEdge(src, offsets[src+1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) sub.constructEdge(idx++, dst, 0); + sub.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + sub.constructEdge(idx++, dst, 0); + } + } } - } - } #ifdef PARALLEL_GEN - , 
galois::loopname("gen_subgraph")); + , + galois::loopname("gen_subgraph")); #endif #endif } -void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vector &DB2, size_t size) { +void Sampler::check_DB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, size_t size) { if (DB0.capacity() < size) { - DB0.reserve(DB0.capacity()*2); - DB1.reserve(DB1.capacity()*2); - DB2.reserve(DB2.capacity()*2); + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); } DB0.resize(size); DB1.resize(size); @@ -116,25 +137,26 @@ void Sampler::check_DB(std::vector &DB0, std::vector &DB1, std::vect void print_vertex_set(VertexSet vertex_set) { unsigned counter = 0; - unsigned n = vertex_set.size(); + unsigned n = vertex_set.size(); galois::gPrint("( "); for (int i : vertex_set) { - counter ++; - if (counter > 16 && counter < n-16) continue; + counter++; + if (counter > 16 && counter < n - 16) + continue; galois::gPrint(i, " "); } galois::gPrint(")\n"); } -void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { - //unsigned myseed = time(NULL); +void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { + // unsigned myseed = time(NULL); unsigned myseed = tid + time(NULL); - //unsigned myseed = tid; - //DBx: Dashboard line x, IAx: Index array line x + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; - DB0.reserve(subg_deg*m*ETA); - DB1.reserve(subg_deg*m*ETA); - DB2.reserve(subg_deg*m*ETA); + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); IA0.reserve(n); IA1.reserve(n); IA2.reserve(n); @@ -145,11 +167,11 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA2.resize(m); IA3.resize(m); - //galois::gPrint("seed ", myseed, " m ", m, "\n"); - //galois::gPrint("node_train size: ", node_train.size(), "\n"); - //printf("( "); - //for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; - //printf(")\n"); + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("node_train size: ", node_train.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + // printf(")\n"); for (int i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % node_train.size(); db_t v = IA3[i] = node_train[rand_idx]; @@ -159,61 +181,67 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA1[i] = 1; IA2[i] = 0; } - // calculate prefix sum for IA0 and store in IA2 to compute the address for each frontier in DB + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB IA2[0] = IA0[0]; - for (int i = 1; i < m; i++) IA2[i] = IA2[i-1] + IA0[i]; + for (int i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly - check_DB(DB0, DB1, DB2, IA2[m-1]); + check_DB(DB0, DB1, DB2, IA2[m - 1]); for (int i = 0; i < m; i++) { - db_t DB_start = (i==0) ? 0 : IA2[i-1]; - db_t DB_end = IA2[i]; + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; for (auto j = DB_start; j < DB_end; j++) { DB0[j] = IA3[i]; - DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); DB2[j] = i + 1; } } db_t choose, neigh_v, newsize, tmp; - for (size_t itr = 0; itr < n-m; itr++) { + for (size_t itr = 0; itr < n - m; itr++) { choose = db_t(-1); while (choose == db_t(-1)) { tmp = rand_r(&myseed) % DB0.size(); if (size_t(tmp) < DB0.size()) - if (DB0[tmp] != db_t(-1)) choose = tmp; + if (DB0[tmp] != db_t(-1)) + choose = tmp; } - choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); - db_t v = DB0[choose]; + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; auto degree = getDegree(masked_graph, v); - neigh_v = (degree!=0) ? rand_r(&myseed)%degree : db_t(-1); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v)+neigh_v); + neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); - IA1[DB2[choose]-1] = 0; - IA0[DB2[choose]-1] = 0; - for (auto i = choose; i < choose-DB1[choose]; i++) DB0[i] = db_t(-1); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); newsize = getDegree(masked_graph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; - } - else newsize = 0; - //shrink DB to remove sampled nodes, also shrink IA accordingly + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly bool cond = DB0.size() + newsize > DB0.capacity(); if (cond) { // compute prefix sum for the location in shrinked DB IA4.resize(IA0.size()); - IA4[0]=IA0[0]; - for (size_t i = 1; i < IA0.size(); i++) IA4[i] = IA4[i-1] + IA0[i]; + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; nDB0.resize(IA4.back()); nDB1.resize(IA4.back()); nDB2.resize(IA4.back()); IA2.assign(IA4.begin(), IA4.end()); for (size_t i = 0; i < IA0.size(); i++) { - if (IA1[i] == 0) continue; - db_t DB_start = (i==0) ? 0 : IA4[i-1]; - db_t DB_end = IA4[i]; + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; for (auto j = DB_start; j < DB_end; j++) { nDB0[j] = IA3[i]; - nDB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + nDB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); nDB2[j] = i + 1; } } @@ -221,18 +249,19 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA4.resize(IA1.size()); IA4[0] = IA1[0]; for (size_t i = 1; i < IA1.size(); i++) - IA4[i] = IA4[i-1] + IA1[i]; + IA4[i] = IA4[i - 1] + IA1[i]; DB0.assign(nDB0.begin(), nDB0.end()); DB1.assign(nDB1.begin(), nDB1.end()); DB2.assign(nDB2.begin(), nDB2.end()); - for (auto i = DB2.begin(); i < DB2.end(); i++) *i = IA4[*i - 1]; - db_t curr=0; + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; for (size_t i = 0; i < IA0.size(); i++) { if (IA0[i] != 0) { - IA0[curr]=IA0[i]; - IA1[curr]=IA1[i]; - IA2[curr]=IA2[i]; - IA3[curr]=IA3[i]; + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; curr++; } } @@ -241,21 +270,21 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { IA2.resize(curr); IA3.resize(curr); } - check_DB(DB0, DB1, DB2, newsize+DB0.size()); + check_DB(DB0, DB1, DB2, newsize + DB0.size()); IA0.push_back(newsize); IA1.push_back(1); IA2.push_back(IA2.back() + IA0.back()); IA3.push_back(neigh_v); db_t DB_start = (*(IA2.end() - 2)); - db_t DB_end = IA2.back(); + db_t DB_end = IA2.back(); for (auto j = DB_start; j < DB_end; j++) { DB0[j] = IA3.back(); - DB1[j] = (j==DB_start) ? (j-DB_end) : (j-DB_start); + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); DB2[j] = IA3.size(); } } - //galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); - //print_vertex_set(st); + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); } // !API function for user-defined selection strategy @@ -263,48 +292,53 @@ void Sampler::select_vertices(size_t n, int m, VertexSet &st, unsigned tid) { // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
-void Sampler::select_vertices(size_t nv, size_t n, int m, Graph *g, VertexList vertices, VertexSet &vertex_set) { - //galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, graph size: ", g->size(), "\n"); +void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set) { + // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, + // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); - auto frontier_indices = deepgalois::select_k_items(m, 0, (int)nv); // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items( + m, 0, (int)nv); // randomly select m vertices from vertices as frontier VertexList frontier(m); for (int i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); - //galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); - int *degrees = new int[m]; + // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); + int* degrees = new int[m]; for (int i = 0; i < m; i++) { - //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { degrees[i] = (int)getDegree(g, frontier[i]); - }//, galois::loopname("compute_degrees")); + } //, galois::loopname("compute_degrees")); for (size_t i = 0; i < n - m; i++) { - auto pos = select_one_item((int)m, degrees); - auto u = frontier[pos]; + auto pos = select_one_item((int)m, degrees); + auto u = frontier[pos]; auto degree = degrees[pos]; - int j =0; - for (; j < degree; j ++) { + int j = 0; + for (; j < degree; j++) { auto neighbor_id = rand() % degree; // randomly select a neighbor - auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); + auto dst = g->getEdgeDst(g->edge_begin(u) + neighbor_id); if (vertex_set.find(dst) == vertex_set.end()) { frontier[pos] = dst; - degrees[pos] = getDegree(g, frontier[pos]); + degrees[pos] = getDegree(g, frontier[pos]); vertex_set.insert(dst); break; } } - if (j == degree) galois::gPrint("Not found from ", degree, " neighbors\n"); + if (j == degree) + galois::gPrint("Not found from ", degree, " neighbors\n"); } /* - assert(n == vertex_set.size()); // size of vertex_set could be slightly smaller than n - galois::gPrint("Done selection, vertex_set size: ", vertex_set.size(), ", set: "); - print_vertex_set(vertex_set); + assert(n == vertex_set.size()); // size of vertex_set could be slightly + smaller than n galois::gPrint("Done selection, vertex_set size: ", + vertex_set.size(), ", set: "); print_vertex_set(vertex_set); */ } -void Sampler::update_masks(size_t n, VertexSet vertices, mask_t *masks) { - //galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); - std::fill(masks, masks+n, 0); - for (auto v : vertices) masks[v] = 1; +void Sampler::update_masks(size_t n, VertexSet vertices, mask_t* masks) { + // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); + std::fill(masks, masks + n, 0); + for (auto v : vertices) + masks[v] = 1; } inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { @@ -316,55 +350,64 @@ inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { return new_ids; } -// Given a subset of vertices and a graph g, generate a subgraph sg from the graph g -void Sampler::generate_subgraph(VertexSet &vertex_set, Graph &g, Graph &sub) { - //auto n = g.size(); // old graph size - auto nv = vertex_set.size(); // new 
graph (subgraph) size +// Given a subset of vertices and a graph g, generate a subgraph sg from the +// graph g +void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { + // auto n = g.size(); // old graph size + auto nv = vertex_set.size(); // new graph (subgraph) size VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : vertex_set) { - degrees[new_ids[v]] = getDegree(&g, v); + degrees[new_ids[v]] = getDegree(&g, v); } - //auto offsets = deepgalois::parallel_prefix_sum(degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); - auto ne = offsets[nv]; - //galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, "\n"); + auto ne = offsets[nv]; + // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, + // "\n"); #ifndef GALOIS_USE_DIST sub.allocateFrom(nv, ne); sub.constructNodes(); VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + galois::do_all( + galois::iterate((size_t)0, nv), + [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif - sub.fixEndEdge(i, offsets[i+1]); - unsigned j = 0; - auto old_id = old_ids[i]; - for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - auto dst = new_ids[g.getEdgeDst(e)]; - assert(dst < nv); - sub.constructEdge(offsets[i]+j, dst, 0); - j ++; - } - } + sub.fixEndEdge(i, offsets[i + 1]); + unsigned j = 0; + auto old_id = old_ids[i]; + for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { + auto dst = new_ids[g.getEdgeDst(e)]; + assert(dst < nv); + sub.constructEdge(offsets[i] + j, dst, 0); + j++; + } + } #ifdef PARALLEL_GEN - , galois::loopname("construct_graph")); + , + galois::loopname("construct_graph")); #endif #endif } -void Sampler::subgraph_sample(size_t n, Graph&sg, mask_t *masks, unsigned tid) { +void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, + unsigned tid) { VertexSet vertex_set; // n = 9000 by default - //select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = 1000 by default + // select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = + // 1000 by default select_vertices(n, m_, vertex_set, tid); // m = 1000 by default - update_masks(graph->size(), vertex_set, masks); // set masks for vertices in the vertex_set + update_masks(graph->size(), vertex_set, + masks); // set masks for vertices in the vertex_set #ifndef GALOIS_USE_DIST Graph masked_sg; - generate_masked_graph(graph->size(), masks, masked_graph, masked_sg); // remove edges whose destination is not masked + generate_masked_graph( + graph->size(), masks, masked_graph, + masked_sg); // remove edges whose destination is not masked generate_subgraph(vertex_set, masked_sg, sg); #endif } -} // end namespace - +} // namespace deepgalois diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 00a7d5696a..3f67974c67 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -5,60 +5,66 @@ namespace deepgalois { // parallel prefix sum template -OutTy* parallel_prefix_sum(const std::vector &in) { - const size_t block_size = 1<<20; +OutTy* parallel_prefix_sum(const std::vector& in) { + const size_t block_size = 1 << 20; const size_t num_blocks = (in.size() + block_size - 1) / block_size; std::vector local_sums(num_blocks); // 
count how many bits are set on each thread - galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { - OutTy lsum = 0; - size_t block_end = std::min((block + 1) * block_size, in.size()); - for (size_t i=block * block_size; i < block_end; i++) - lsum += in[i]; - local_sums[block] = lsum; - }); - std::vector bulk_prefix(num_blocks+1); + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy lsum = 0; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) + lsum += in[i]; + local_sums[block] = lsum; + }); + std::vector bulk_prefix(num_blocks + 1); OutTy total = 0; - for (size_t block=0; block < num_blocks; block++) { + for (size_t block = 0; block < num_blocks; block++) { bulk_prefix[block] = total; total += local_sums[block]; } bulk_prefix[num_blocks] = total; - OutTy *prefix = new OutTy[in.size() + 1]; - galois::do_all(galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { - OutTy local_total = bulk_prefix[block]; - size_t block_end = std::min((block + 1) * block_size, in.size()); - for (size_t i=block * block_size; i < block_end; i++) { - prefix[i] = local_total; - local_total += in[i]; - } - }); + OutTy* prefix = new OutTy[in.size() + 1]; + galois::do_all( + galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { + OutTy local_total = bulk_prefix[block]; + size_t block_end = std::min((block + 1) * block_size, in.size()); + for (size_t i = block * block_size; i < block_end; i++) { + prefix[i] = local_total; + local_total += in[i]; + } + }); prefix[in.size()] = bulk_prefix[num_blocks]; return prefix; } -template uint32_t* parallel_prefix_sum(const std::vector &in); +template uint32_t* +parallel_prefix_sum(const std::vector& in); // Compute the F1 score, also known as balanced F-score or F-measure -// The F1 score can be interpreted as a weighted average of the precision and recall, -// where an F1 score reaches its best value at 1 and worst score at 0. +// The F1 score can be interpreted as a weighted average of the precision and +// recall, where an F1 score reaches its best value at 1 and worst score at 0. // The relative contribution of precision and recall to the F1 score are equal. // The formula for the F1 score is: // F1 = 2 * (precision * recall) / (precision + recall) // where precision = TP / (TP + FP), recall = TP / (TP + FN) // TP: true positive; FP: false positive; FN: false negative. -// In the multi-class and multi-label case, this is the weighted average of the F1 score of each class. -// Please refer to https://sebastianraschka.com/faq/docs/multiclass-metric.html, -// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf (p.1672) -// and https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp -acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t *masks, - size_t num_classes, label_t *ground_truth, float_t *pred) { +// In the multi-class and multi-label case, this is the weighted average of the +// F1 score of each class. 
Please refer to +// https://sebastianraschka.com/faq/docs/multiclass-metric.html, +// http://pageperso.lif.univ-mrs.fr/~francois.denis/IAAM1/scikit-learn-docs.pdf +// (p.1672) and +// https://github.com/ashokpant/accuracy-evaluation-cpp/blob/master/src/evaluation.hpp +acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, + size_t num_classes, label_t* ground_truth, + float_t* pred) { double precision_cls(0.), recall_cls(0.), f1_accum(0.); int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); for (size_t col = 0; col < num_classes; col++) { int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); - for (size_t row = begin; row < end; row ++) { - //galois::do_all(galois::iterate(begin, end), [&](const auto& row) { + for (size_t row = begin; row < end; row++) { + // galois::do_all(galois::iterate(begin, end), [&](const auto& row) { if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { @@ -81,18 +87,31 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t *masks, fn_accum += fn_cls; fp_accum += fp_cls; tn_accum += tn_cls; - precision_cls = tp_cls + fp_cls > 0 ? (double)tp_cls/(double)(tp_cls+fp_cls) : 0.; - recall_cls = tp_cls+fn_cls > 0 ? (double)tp_cls/(double)(tp_cls+fn_cls) : 0.; - f1_accum += recall_cls+precision_cls > 0. ? 2.*(recall_cls*precision_cls)/(recall_cls+precision_cls) : 0.; + precision_cls = + tp_cls + fp_cls > 0 ? (double)tp_cls / (double)(tp_cls + fp_cls) : 0.; + recall_cls = + tp_cls + fn_cls > 0 ? (double)tp_cls / (double)(tp_cls + fn_cls) : 0.; + f1_accum += + recall_cls + precision_cls > 0. + ? 2. * (recall_cls * precision_cls) / (recall_cls + precision_cls) + : 0.; } - double f1_macro = f1_accum/(double)num_classes; - //double accuracy_mic = (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); - double precision_mic = tp_accum+fp_accum > 0 ? (double)tp_accum/(double)(tp_accum+fp_accum) : 0.; - double recall_mic = tp_accum+fn_accum > 0 ? (double)tp_accum/(double)(tp_accum+fn_accum) : 0.; - double f1_micro = recall_mic+precision_mic > 0. ? 2.*(recall_mic*precision_mic)/(recall_mic+precision_mic) : 0.; - std::cout << std::setprecision(3) << std::fixed << - " (f1_micro: " << f1_micro << ", f1_macro: " << f1_macro << ") "; + double f1_macro = f1_accum / (double)num_classes; + // double accuracy_mic = + // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); + double precision_mic = tp_accum + fp_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fp_accum) + : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; + double f1_micro = + recall_mic + precision_mic > 0. + ? 2. 
* (recall_mic * precision_mic) / (recall_mic + precision_mic) + : 0.; + std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro + << ", f1_macro: " << f1_macro << ") "; return f1_micro; } -} // end namespace +} // namespace deepgalois diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 97e1d71447..f2d08d3cb3 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -14,14 +14,14 @@ int main(int argc, char** argv) { LonestarGnnStart(argc, argv, name, desc, url); // the neural network to train: loads the entire graph on CPU - deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, - hidden1, learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); std::vector dummyVec; deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); + galois::graphs::constructSymmetricGraph(dummyVec); network.dist_init(dGraph, dataset); // read network, features, ground truth, initialize metadata @@ -30,7 +30,7 @@ int main(int argc, char** argv) { network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak memory usage - // the optimizer used to update parameters, + // the optimizer used to update parameters, // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); From 9d254bbe603928fec343dcc6661f89b3afbf9d72 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Fri, 8 May 2020 18:03:32 -0500 Subject: [PATCH 263/660] add sampler.cu --- libdeepgalois/src/sampler.cu | 145 +++++++++++++++++++++++++++++++++++ libgpu/include/graph_gpu.h | 17 +++- libgpu/src/csr_graph.cu | 9 +++ 3 files changed, 170 insertions(+), 1 deletion(-) create mode 100644 libdeepgalois/src/sampler.cu diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/sampler.cu new file mode 100644 index 0000000000..cecfa6c9e0 --- /dev/null +++ b/libdeepgalois/src/sampler.cu @@ -0,0 +1,145 @@ +#include +#include +#include "deepgalois/sampler.h" + +namespace deepgalois { + +// set the masks of vertices in a given vertex set +// n is the size of the vertex set +__global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[vertices[i]] = 1; } +} + +// compute the degrees of a masked graph +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t *masks, GraphGPU g, index_t* degrees) { + CUDA_KERNEL_LOOP(src, n) { + if (masks[src] == 1) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) degrees[src] ++; + } + } + } +} + +// Given a graph, remove any edge which has end-point masked, and generate the subgraph +// n is the size of the original graph and the subgraph +// offset was computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, const index_t* offsets, GraphGPU g, GraphGPU subg) { + CUDA_KERNEL_LOOP(src, n) { + subg.fixEndEdge(src, offsets[src+1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + if (masks[dst] == 1) subg.constructEdge(idx++, dst); + 
}
+      }
+    }
+  }
+}
+
+// compute the degrees of the subgraph induced by the vertex set
+// n is the size of the vertex set
+// new_ids array maps vertex ID in the original graph to the vertex ID in the subgraph
+__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, GraphGPU g, index_t* degrees) {
+  CUDA_KERNEL_LOOP(i, n) {
+    auto v = vertices[i];
+    degrees[new_ids[v]] = g.getOutDegree(v);
+  }
+}
+
+// Given a masked graph, keep only the masked (sampled) vertices, reindex them, and generate the subgraph
+// offset was computed by using prefix-sum of the new degrees
+// n is the size of the old_ids and the subgraph
+__global__ void generate_graph_kernel(index_t n, const index_t* offsets, const index_t* old_ids, const index_t* new_ids, GraphGPU g, GraphGPU subg) {
+  CUDA_KERNEL_LOOP(i, n) {
+    subg.fixEndEdge(i, offsets[i+1]);
+    index_t j = 0;
+    auto src = old_ids[i];
+    for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) {
+      auto dst = new_ids[g.getEdgeDst(e)];
+      assert(dst < n);
+      subg.constructEdge(offsets[i] + j, dst);
+      j++;
+    }
+  }
+}
+
+void Sampler::update_masks(size_t n, index_t* vertices, mask_t *masks) {
+  // launch configuration assumed to follow the CUDA_GET_BLOCKS/CUDA_NUM_THREADS
+  // convention used by the other deepgalois kernels
+  set_masks<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, vertices, masks);
+}
+
+void Sampler::indexing(size_t n, index_t* vertices, index_t *new_indices) {
+  index_t vid = 0;
+  for (index_t i = 0; i < n; i++) {
+    auto v = vertices[i];
+    new_indices[v] = vid ++;
+  }
+}
+
+inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) {
+  VertexList new_ids(n, 0);
+  int vid = 0;
+  for (auto v : vertex_set) {
+    new_ids[v] = vid++; // reindex
+  }
+  return new_ids;
+}
+
+void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, GraphGPU *subg) {
+  index_t *degrees, *offsets;
+  CUDA_CHECK(cudaMalloc((void**)&degrees, sizeof(index_t)*n));
+  get_masked_degrees<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, masks, *g, degrees);
+  CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1)));
+  // inclusive scan into offsets+1 (with offsets[0] = 0) so that offsets[n] holds the total edge count
+  CUDA_CHECK(cudaMemset(offsets, 0, sizeof(index_t)));
+  thrust::inclusive_scan(thrust::device, degrees, degrees+n, offsets+1);
+  CUDA_CHECK(cudaFree(degrees)); // free only after the scan has consumed it
+  index_t ne;
+  CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost));
+  subg->allocateFrom(n, ne); // TODO: avoid reallocation
+  generate_masked_graph_kernel<<<CUDA_GET_BLOCKS(n), CUDA_NUM_THREADS>>>(n, masks, offsets, *g, *subg);
+  CUDA_CHECK(cudaFree(offsets));
+}
+
+// use a random walk to select vertex subset
+void Sampler::select_vertices(size_t n, int m, VertexSet &st) {
+}
+
+// n: size of the original graph
+// nv: size of the subgraph; i.e. size of vertex_set
+// masks, graph g and subgraph sub are on the device (GPU)
+void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, GraphGPU *g, GraphGPU *sub) {
+  // convert the vertex_set to a vertex_list and copy it to the device
+  VertexList vertex_list(vertex_set.begin(), vertex_set.end());
+  index_t *d_vertex_list;
+  cudaMalloc((void **) &d_vertex_list, nv*sizeof(index_t));
+  CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv*sizeof(index_t), cudaMemcpyHostToDevice));
+
+  index_t n = graph->size();
+  update_masks(nv, d_vertex_list, masks); // set masks for the nv vertices in the vertex_set
+  GraphGPU masked_sg; // size is the same as original graph, but masked dst removed
+  generate_masked_graph(n, masks, g, &masked_sg); // remove edges whose destination is not masked
+
+  // re-index the subgraph
+  index_t *d_new_ids; // Given an old vertex ID ∈ [0, n), returns a new vertex ID ∈ [0, nv)
+  cudaMalloc((void **) &d_new_ids, n*sizeof(index_t));
+  auto new_ids = reindexing_vertices(n, vertex_set); // the map is indexed by old (original-graph) IDs
+  CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n*sizeof(index_t), cudaMemcpyHostToDevice));
+
+  // generate the offsets for the re-indexed subgraph
+  index_t *degrees, *offsets;
+  CUDA_CHECK(cudaMalloc((void**)&degrees, sizeof(index_t)*nv));
+  get_new_degrees<<<CUDA_GET_BLOCKS(nv), CUDA_NUM_THREADS>>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees);
+  CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1)));
+  CUDA_CHECK(cudaMemset(offsets, 0, sizeof(index_t)));
+  thrust::inclusive_scan(thrust::device, degrees, degrees+nv, offsets+1);
+  CUDA_CHECK(cudaFree(degrees)); // free only after the scan has consumed it
+  index_t ne;
+  CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost));
+
+  // allocate memory for the subgraph
+  sub->allocateFrom(nv, ne); // avoid reallocation
+  // generate the subgraph
+  generate_graph_kernel<<<CUDA_GET_BLOCKS(nv), CUDA_NUM_THREADS>>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub);
+}
+
+}
diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h
index b47ed326b1..449e38a7b5 100644
--- a/libgpu/include/graph_gpu.h
+++ b/libgpu/include/graph_gpu.h
@@ -167,7 +167,22 @@ struct CSRGraph {
   CUDA_HOSTDEV const node_data_type *node_data_ptr() const { return node_data; }
   CUDA_HOSTDEV edge_data_type *edge_data_ptr() { return edge_data; }
   CUDA_HOSTDEV const edge_data_type *edge_data_ptr() const { return edge_data; }
-
+  CUDA_HOSTDEV void fixEndEdge(index_type vid, index_type row_end) { row_start[vid + 1] = row_end; }
+  CUDA_HOSTDEV void constructEdge(index_type eid, index_type dst, edge_data_type edata = 0) {
+    assert(dst < nnodes);
+    assert(eid < nedges);
+    edge_dst[eid] = dst;
+    //if (edge_data) edge_data[eid] = edata;
+  }
+  // pass the pointer by reference so the device allocation is returned to the caller
+  void malloc_index_device(index_type n, index_type*& ptr);
+  void set_index(index_type pos, index_type value, index_type *ptr);
+  void allocateFrom(index_type nv, index_type ne) {
+    nnodes = nv;
+    nedges = ne;
+    malloc_index_device(nedges, edge_dst);
+    malloc_index_device(nnodes+1, row_start);
+    set_index(0, 0, row_start);
+  }
   size_t size() { return size_t(nnodes); }
   size_t sizeEdges() { return size_t(nedges); }
   void degree_counting() {}
diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu
index 593451d788..e7be218138 100644
--- a/libgpu/src/csr_graph.cu
+++ b/libgpu/src/csr_graph.cu
@@ -46,6 +46,15 @@ unsigned CSRGraph::allocOnHost(bool no_edge_data) {
   return ((no_edge_data || edge_data) && row_start && edge_dst && node_data);
 }
 
+void CSRGraph::malloc_index_device(index_type n, index_type*& ptr) {
+  check_cuda(cudaMalloc((void **) &ptr, n * sizeof(index_type)));
+}
+
+void CSRGraph::set_index(index_type pos, index_type value, index_type *ptr)
{ + index_type h_value = value; + check_cuda(cudaMemcpy(ptr+pos, &h_value, sizeof(index_type), cudaMemcpyHostToDevice)); +} + unsigned CSRGraph::allocOnDevice(bool no_edge_data) { if(edge_dst != NULL) // already allocated return true; From e01dbc54d627a2fda134d16fa081a1b4cd1be766 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 10 May 2020 19:16:17 -0500 Subject: [PATCH 264/660] =?UTF-8?q?fix=20error:=20=E2=80=98CSRGraph?= =?UTF-8?q?=E2=80=99=20does=20not=20name=20a=20type?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- libdeepgalois/include/deepgalois/gtypes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h index ff4a6e4e46..a2535f93a3 100644 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ b/libdeepgalois/include/deepgalois/gtypes.h @@ -1,5 +1,4 @@ #pragma once -#define USE_CSRGRAPH #include "deepgalois/types.h" #ifdef GALOIS_USE_DIST @@ -11,6 +10,7 @@ //#include "galois/graphs/LCGraph.h" #include "deepgalois/lgraph.h" #else +#define USE_CSRGRAPH #ifdef USE_CSRGRAPH #include "deepgalois/lgraph.h" #include "graph_gpu.h" From a83c74665ace438497a60d481b57374a5be5467b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 09:18:30 -0500 Subject: [PATCH 265/660] add fat and sage --- lonestar/gnn/CMakeLists.txt | 2 ++ lonestar/gnn/gat/CMakeLists.txt | 14 +++++++++ lonestar/gnn/gat/gat.cpp | 16 ++++++++++ lonestar/gnn/graphsage/gs-mean.cpp | 45 --------------------------- lonestar/gnn/include/engine.h | 50 ++++++++++++++++++++++++++++++ lonestar/gnn/sage/CMakeLists.txt | 14 +++++++++ lonestar/gnn/sage/sage.cpp | 20 ++++++++++++ 7 files changed, 116 insertions(+), 45 deletions(-) create mode 100644 lonestar/gnn/gat/CMakeLists.txt create mode 100644 lonestar/gnn/gat/gat.cpp delete mode 100644 lonestar/gnn/graphsage/gs-mean.cpp create mode 100644 lonestar/gnn/include/engine.h create mode 100644 lonestar/gnn/sage/CMakeLists.txt create mode 100644 lonestar/gnn/sage/sage.cpp diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 1f5d35b5f1..773df6a819 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -24,3 +24,5 @@ if(ENABLE_DIST_GALOIS) endif() add_subdirectory(gcn) +add_subdirectory(sage) +add_subdirectory(gat) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt new file mode 100644 index 0000000000..f9f1efdc6f --- /dev/null +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(gat gat.cpp) +target_link_libraries(gat PRIVATE Galois::shmem lonestar) + +if(ENABLE_HETERO_GALOIS) + set_property(TARGET gat PROPERTY CUDA_STANDARD 14) + set_property(TARGET gat PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(gat PRIVATE dg_gpu dg_cpu) + target_link_libraries(gat PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() +target_link_libraries(gat PRIVATE dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(gat PRIVATE distgraphloader) +endif() +endif() diff --git a/lonestar/gnn/gat/gat.cpp b/lonestar/gnn/gat/gat.cpp new file mode 100644 index 0000000000..6f652e84c7 --- /dev/null +++ b/lonestar/gnn/gat/gat.cpp @@ -0,0 +1,16 @@ +// Graph Attension Networks (GAT) +// Xuhao Chen +#include "lonestargnn.h" + +const char* name = "Graph Attention Networks (GAT)"; +const char* desc = "Graph Attention Networks on an undirected graph: "; +const char* url = 0; + +// define aggregator here + +// math: h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} 
\alpha_{i,j} W^{(l)} h_j^{(l)} +// where :math:`\alpha_{ij}` is the attention score bewteen node :math:`i` and node :math:`j`: +// .. math:: \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l}) +// e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right) + +#include "engine.h" diff --git a/lonestar/gnn/graphsage/gs-mean.cpp b/lonestar/gnn/graphsage/gs-mean.cpp deleted file mode 100644 index 4bd80e6203..0000000000 --- a/lonestar/gnn/graphsage/gs-mean.cpp +++ /dev/null @@ -1,45 +0,0 @@ -// Graph Neural Networks -// Xuhao Chen -#include "gnn.h" - -const char* name = "GraphSage"; -const char* desc = "A graph neural network variant: GraphSAGE"; -const char* url = 0; - -class GraphSageMean : public graph_conv_layer { - // user-defined combine function -}; - -int main(int argc, char** argv) { - galois::SharedMemSys G; - LonestarStart(argc, argv, name, desc, url); - Net network; // the neural network to train - network.init(); // default setting for now; see its implementation to find how - // to customize it by the user - ResourceManager rm; - - // the optimizer used to update parameters, see optimizer.h for more details - // optimizer *opt = new gradient_descent(); - // optimizer *opt = new adagrad(); - optimizer* opt = new adam(); - galois::StatTimer Ttrain("Train"); - Ttrain.start(); - network.train(opt); // do training using training samples - Ttrain.stop(); - - // test using test samples - acc_t test_loss = 0.0, test_acc = 0.0; - size_t test_begin = 2312, test_end = 3312; // [2312, 3327) test size = 1015 - // TODO: replace ad-hoc settings - galois::StatTimer Ttest("Test"); - Ttest.start(); - double test_time = - network.evaluate(test_begin, test_end, test_loss, test_acc); - std::cout << "\nTesting: test_loss = " << test_loss - << " test_acc = " << test_acc << " test_time = " << test_time - << "\n"; - Ttest.stop(); - - std::cout << "\n" << rm.get_peak_memory() << "\n\n"; - return 0; -} diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h new file mode 100644 index 0000000000..7d0691de0f --- /dev/null +++ b/lonestar/gnn/include/engine.h @@ -0,0 +1,50 @@ +#ifdef GALOIS_USE_DIST +#include "DistributedGraphLoader.h" +#endif + +int main(int argc, char** argv) { + galois::DistMemSys G; + LonestarGnnStart(argc, argv, name, desc, url); + + // the neural network to train: loads the entire graph on CPU + deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, + learning_rate, dropout_rate, weight_decay, + add_selfloop, is_single_class, add_l2norm, add_dense, + neighbor_sample_sz, subgraph_sample_sz, val_interval); + + std::vector dummyVec; + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph, dataset); + + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user + network.construct_layers(); + network.print_layers_info(); + deepgalois::ResourceManager rm; // tracks peak memory usage + + // the optimizer used to update parameters, + // see optimizer.h for more details + // optimizer *opt = new gradient_descent(); + // optimizer *opt = new adagrad(); + deepgalois::optimizer* opt = new deepgalois::adam(); + galois::StatTimer Ttrain("TrainAndVal"); + Ttrain.start(); + network.train(opt, do_validate); // do training using training samples + Ttrain.stop(); + + if (do_test) { + // test using test samples + galois::gPrint("\n"); + network.read_test_masks(dataset); + galois::StatTimer Ttest("Test"); + Ttest.start(); + acc_t 
test_loss = 0.0, test_acc = 0.0; + double test_time = network.evaluate("test", test_loss, test_acc); + galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); + Ttest.stop(); + } + galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); + return 0; +} diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt new file mode 100644 index 0000000000..94b6d234b7 --- /dev/null +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -0,0 +1,14 @@ +add_executable(sage sage.cpp) +target_link_libraries(sage PRIVATE Galois::shmem lonestar) + +if(ENABLE_HETERO_GALOIS) + set_property(TARGET sage PROPERTY CUDA_STANDARD 14) + set_property(TARGET sage PROPERTY CUDA_SEPARABLE_COMPILATION ON) + target_link_libraries(sage PRIVATE dg_gpu dg_cpu) + target_link_libraries(sage PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) +else() +target_link_libraries(sage PRIVATE dg_cpu) +if(ENABLE_DIST_GALOIS) + target_link_libraries(sage PRIVATE distgraphloader) +endif() +endif() diff --git a/lonestar/gnn/sage/sage.cpp b/lonestar/gnn/sage/sage.cpp new file mode 100644 index 0000000000..a6f6b8621e --- /dev/null +++ b/lonestar/gnn/sage/sage.cpp @@ -0,0 +1,20 @@ +// GraphSAGE +// Xuhao Chen +#include "lonestargnn.h" + +const char* name = "GraphSAGE"; +const char* desc = "GraphSAGE on an undirected graph: "; +const char* url = 0; + +// define aggregator here +// .. math:: +// h_{\mathcal{N}(i)}^{(l+1)} & = \mathrm{aggregate} +// \left(\{h_{j}^{l}, \forall j \in \mathcal{N}(i) \}\right) +// +// h_{i}^{(l+1)} & = \sigma \left(W \cdot \mathrm{concat} +// (h_{i}^{l}, h_{\mathcal{N}(i)}^{l+1} + b) \right) +// +// h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l}) + + +#include "engine.h" From b5f22a77c6b7ac3c87d0387966f7a8db5d326f13 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 11:16:31 -0500 Subject: [PATCH 266/660] update gat and sage --- libdeepgalois/include/deepgalois/types.h | 1 + lonestar/gnn/CMakeLists.txt | 2 +- lonestar/gnn/gat/gat.cpp | 22 ++++++++++++-- lonestar/gnn/sage/sage.cpp | 37 +++++++++++++++++++++++- 4 files changed, 58 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 71add8b650..e3165abc8a 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -31,6 +31,7 @@ typedef std::vector dims_t; // dimentions type typedef uint32_t index_t; // index type typedef float_t edata_t; // edge data type typedef float_t vdata_t; // vertex data type +typedef float_t* emb_t; // embedding (feature vector) type enum class net_phase { train, test }; diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 773df6a819..40eac53052 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -24,5 +24,5 @@ if(ENABLE_DIST_GALOIS) endif() add_subdirectory(gcn) -add_subdirectory(sage) +#add_subdirectory(sage) add_subdirectory(gat) diff --git a/lonestar/gnn/gat/gat.cpp b/lonestar/gnn/gat/gat.cpp index 6f652e84c7..10647924b7 100644 --- a/lonestar/gnn/gat/gat.cpp +++ b/lonestar/gnn/gat/gat.cpp @@ -6,11 +6,29 @@ const char* name = "Graph Attention Networks (GAT)"; const char* desc = "Graph Attention Networks on an undirected graph: "; const char* url = 0; -// define aggregator here - // math: h_i^{(l+1)} = \sum_{j\in \mathcal{N}(i)} \alpha_{i,j} W^{(l)} h_j^{(l)} // where :math:`\alpha_{ij}` is the attention score bewteen node :math:`i` and node :math:`j`: // .. 
math:: \alpha_{ij}^{l} & = \mathrm{softmax_i} (e_{ij}^{l}) // e_{ij}^{l} & = \mathrm{LeakyReLU}\left(\vec{a}^T [W h_{i} \| W h_{j}]\right) +/* +namespace deepgalois { + +// define aggregator here +class AppAggregator: public Aggregator { +public: + emb_t applyEdge(VertexID, VertexID u, emb_t in) { + auto ilen = get_in_feat_len(); + return &in[ilen*u]; + } + + emb_t applyVertex(VertexID v, emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + } +}; +} +//*/ #include "engine.h" diff --git a/lonestar/gnn/sage/sage.cpp b/lonestar/gnn/sage/sage.cpp index a6f6b8621e..5f078dff63 100644 --- a/lonestar/gnn/sage/sage.cpp +++ b/lonestar/gnn/sage/sage.cpp @@ -1,4 +1,4 @@ -// GraphSAGE +// GraphSAGE: // Xuhao Chen #include "lonestargnn.h" @@ -16,5 +16,40 @@ const char* url = 0; // // h_{i}^{(l+1)} & = \mathrm{norm}(h_{i}^{l}) +namespace deepgalois { + +class AppAggregator: public Aggregator { +public: + emb_t applyEdge(VertexID, VertexID u, emb_t in) { + auto ilen = get_in_feat_len(); + return &in[ilen*u]; + } + emb_t applyVertex(VertexID v, emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + math::mvmul(CblasTrans, olen, ilen, 1.0, W, &accum[v*ilen], 0.0, a); // a = W * accum[v]; [olen x ilen] * [ilen x 1] = [olen x 1] + math::mvmul(CblasTrans, olen, ilen, 1.0, Q, &in[v*ilen], 0.0, b); // b = Q * in; [olen x ilen] * [ilen x 1] = [olen x 1] + math::vadd_cpu(olen, a, b, c); // c = a + b; [olen x 1] + return c; // the feature vector to update h[v] + } +/* + emb_t applyVertex(emb_t in, emb_t accum) { + auto n = get_num_samples(); + auto ilen = get_in_feat_len(); + auto olen = get_out_feat_len(); + emb_t a, b, c; + math::matmul(n, olen, ilen, accum, W, a); // a = accum * W; [n x ilen] * [ilen x olen] = [n x olen] + math::matmul(n, olen, ilen, in, Q, b); // b = in * Q; [n x ilen] * [ilen x olen] = [n x olen] + math::vadd(n*olen, a, b, c); // c = a + b; [n x olen] + return c; // all the feature vectors to update the entire h + } +*/ + //void update_all(size_t len, Graph& g, const emb_t in, emb_t out) { + //} +}; + +} #include "engine.h" From 15f9aa6a514423e8f4d63326e7eb1daba0c3b0ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:25:30 -0500 Subject: [PATCH 267/660] gtypes -> graph types; clean up file as well --- .../include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/GraphTypes.h | 16 +++++ libdeepgalois/include/deepgalois/context.h | 3 +- libdeepgalois/include/deepgalois/gtypes.h | 53 --------------- .../include/deepgalois/layers/aggregator.h | 4 +- libdeepgalois/include/deepgalois/net.h | 64 +++++++++++-------- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/include/deepgalois/sampler.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 2 +- 9 files changed, 61 insertions(+), 87 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/GraphTypes.h delete mode 100644 libdeepgalois/include/deepgalois/gtypes.h diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 2f65360106..7069c1a0d7 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,7 +5,7 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { diff --git 
a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h new file mode 100644 index 0000000000..0ef3fb4a77 --- /dev/null +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -0,0 +1,16 @@ +#pragma once + +#include "deepgalois/types.h" +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" +#include "deepgalois/lgraph.h" + +#ifdef __GALOIS_HET_CUDA__ +// TODO reintroduce GPU as necessary here +#endif + +namespace deepgalois { +using index_t = edge_iterator; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/context.h index 77c15ee890..106427ccf7 100644 --- a/libdeepgalois/include/deepgalois/context.h +++ b/libdeepgalois/include/deepgalois/context.h @@ -7,7 +7,7 @@ #include #include "deepgalois/types.h" #include "deepgalois/reader.h" -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #ifdef __GALOIS_HET_CUDA__ #include "deepgalois/cutils.h" @@ -18,6 +18,7 @@ namespace deepgalois { class Context { public: Context(); + //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) : is_device(use_gpu), n(0), num_classes(0), feat_len(0), is_single_class(true), is_selfloop_added(false), use_subgraph(false), diff --git a/libdeepgalois/include/deepgalois/gtypes.h b/libdeepgalois/include/deepgalois/gtypes.h deleted file mode 100644 index a2535f93a3..0000000000 --- a/libdeepgalois/include/deepgalois/gtypes.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include "deepgalois/types.h" -#ifdef GALOIS_USE_DIST -#include "galois/Galois.h" -#include "galois/graphs/NewGeneric.h" -#else -#ifdef CPU_ONLY -//#include "galois/Galois.h" -//#include "galois/graphs/LCGraph.h" -#include "deepgalois/lgraph.h" -#else -#define USE_CSRGRAPH -#ifdef USE_CSRGRAPH -#include "deepgalois/lgraph.h" -#include "graph_gpu.h" -#else -#include "deepgalois/lgraph.h" -#endif -#endif -#endif - -#ifndef GALOIS_USE_DIST - -namespace deepgalois { -typedef index_t edge_iterator; -//#ifdef EDGE_LABEL -// typedef galois::graphs::LC_CSR_Graph:: -// with_numa_alloc::type ::with_no_lockable::type LCGraph; -//#else -// typedef galois::graphs::LC_CSR_Graph:: -// with_numa_alloc::type ::with_no_lockable::type LCGraph; -//#endif -// typedef LCGraph Graph; -// typedef Graph::edge_iterator edge_iterator; -typedef LearningGraph Graph; -#ifdef USE_CSRGRAPH -typedef CSRGraph GraphGPU; -#else -typedef LearningGraph GraphGPU; -#endif -} // namespace deepgalois - -#else - -namespace deepgalois { -// TODO check if this needs changing -typedef index_t edge_iterator; -using Graph = galois::graphs::DistGraph; -} // namespace deepgalois - -#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 6e5e7a5926..cc6e22db00 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -3,7 +3,7 @@ //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) #ifdef CPU_ONLY -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); @@ -11,7 +11,7 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); } // namespace deepgalois #else -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" //#include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 117de131b2..4928f61f1d 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -16,7 +16,7 @@ #ifndef GALOIS_USE_DIST #include "deepgalois/context.h" #else -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" #endif @@ -45,28 +45,32 @@ class Net { << learning_rate << ", dropout_rate " << dropout_rate << ", weight_decay " << weight_decay << "\n"; num_layers = num_conv_layers + 1; + + // additional layers to add if (has_l2norm) num_layers++; if (has_dense) num_layers++; + // initialize feature metadata feature_dims.resize(num_layers + 1); -#ifndef GALOIS_USE_DIST + // initialze context context = new deepgalois::Context(); context->set_dataset(dataset_str); + // read graph, get num nodes num_samples = context->read_graph(selfloop); context->set_label_class(is_single_class); - // read graph, get num nodes + // read ground truth labels num_classes = context->read_labels(); - // std::cout << "Reading label masks ... "; + // get training and validation sets train_masks = new mask_t[num_samples]; val_masks = new mask_t[num_samples]; std::fill(train_masks, train_masks + num_samples, 0); std::fill(val_masks, val_masks + num_samples, 0); - // get training and validation sets + // reddit is hard coded if (dataset_str == "reddit") { train_begin = 0, train_count = 153431, train_end = train_begin + train_count; @@ -83,42 +87,52 @@ class Net { val_masks); } + // make sure sampel size isn't greater than what we have to train with if (subgraph_sample_size > train_count) { - std::cout << "FATAL: subgraph size can not be larger than the size of " - "training set\n"; - exit(1); + GALOIS_DIE("subgraph size can not be larger than the size of training " + "set\n"); } + // read features of vertices feature_dims[0] = context->read_features(); // input feature dimension: D + for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (has_l2norm) feature_dims[num_conv_layers + 1] = num_classes; // l2 normalized embedding: E + if (has_dense) feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); + + // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); - init(); -#endif } - Net() - : is_single_class(true), has_l2norm(false), has_dense(false), - neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), - num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), - val_count(0), 
test_begin(0), test_end(0), test_count(0), - val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - - void init(); + //! Default net constructor + //Net() + // : is_single_class(true), has_l2norm(false), has_dense(false), + // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), + // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), + // train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), + // val_count(0), test_begin(0), test_end(0), test_count(0), + // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), + // train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} + + //! save graph pointer to context object + void saveDistGraph(Graph* dGraph); + #ifdef GALOIS_USE_DIST void dist_init(Graph* graph, std::string dataset_str); #endif + size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } size_t get_nnodes() { return num_samples; } @@ -127,13 +141,9 @@ class Net { void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - std::string header = ""; - std::string seperator = " "; -#ifdef GALOIS_USE_DIST unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - header = "[" + std::to_string(myID) + "] "; - seperator = "\n"; -#endif + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 9e5faf1f39..1bcda0b4b7 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,5 +1,5 @@ #pragma once -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index c559804354..ab0fb03a25 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -1,7 +1,7 @@ #ifndef GALOIS_USE_DIST #pragma once -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" namespace deepgalois { #define ETA 1.5 // length factor of DB in sampling diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 08f28126bf..7093897af2 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -8,7 +8,7 @@ #include #include #ifdef GALOIS_USE_DIST -#include "deepgalois/gtypes.h" +#include "deepgalois/GraphTypes.h" #else #include "deepgalois/types.h" #endif From a2d21a0ea5f5cecc0baf415c0f67424ff2f8c7d3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:29:35 -0500 Subject: [PATCH 268/660] context.h -> Context.h --- libdeepgalois/include/deepgalois/{context.h => Context.h} | 0 libdeepgalois/include/deepgalois/layers/layer.h | 2 +- libdeepgalois/include/deepgalois/net.h | 2 +- libdeepgalois/src/context.cpp | 2 +- 4 files changed, 3 insertions(+), 3 deletions(-) rename libdeepgalois/include/deepgalois/{context.h => Context.h} (100%) diff --git a/libdeepgalois/include/deepgalois/context.h b/libdeepgalois/include/deepgalois/Context.h similarity index 100% rename from libdeepgalois/include/deepgalois/context.h rename to libdeepgalois/include/deepgalois/Context.h 
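The feature_dims bookkeeping in the Net constructor (see the Net.h hunk in the previous patch) is easier to follow with concrete numbers. A small self-contained sketch, assuming citeseer-like sizes (3703 input features, 6 classes, hidden1 = 16, two conv layers, an L2-norm layer and no dense layer), reproduces the layer-dimension chain; the values here are assumptions for illustration, not read from the dataset.

// layer_dims_sketch.cpp -- illustrative only; sizes are assumed, not loaded from disk
#include <cstdio>
#include <vector>

int main() {
  size_t D = 3703, hidden1 = 16, E = 6, num_conv_layers = 2;
  bool has_l2norm = true, has_dense = false;
  size_t num_layers = num_conv_layers + 1;
  if (has_l2norm) num_layers++;
  if (has_dense)  num_layers++;
  std::vector<size_t> feature_dims(num_layers + 1);
  feature_dims[0] = D;                               // input feature dimension
  for (size_t i = 1; i < num_conv_layers; i++)
    feature_dims[i] = hidden1;                       // hidden embeddings
  feature_dims[num_conv_layers] = E;                 // conv output = number of classes
  if (has_l2norm) feature_dims[num_conv_layers + 1] = E;
  if (has_dense)  feature_dims[num_layers - 1] = E;
  feature_dims[num_layers] = E;                      // output of the loss layer
  for (size_t d : feature_dims) std::printf("%zu ", d);
  std::printf("\n");                                 // prints: 3703 16 6 6 6
  return 0;
}

Layer i then maps feature_dims[i] to feature_dims[i+1], which is exactly how get_in_dim and get_out_dim above index into this vector.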
diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index ec35c1d8c9..a1c2ef630a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,7 +11,7 @@ #include #include "deepgalois/gtypes.h" #ifndef GALOIS_USE_DIST -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #else #include "deepgalois/DistContext.h" #endif diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/net.h index 4928f61f1d..ac934d0c7d 100644 --- a/libdeepgalois/include/deepgalois/net.h +++ b/libdeepgalois/include/deepgalois/net.h @@ -14,7 +14,7 @@ #include "deepgalois/sampler.h" #endif #ifndef GALOIS_USE_DIST -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #else #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/context.cpp index f07da83d6d..d5cc9435ee 100644 --- a/libdeepgalois/src/context.cpp +++ b/libdeepgalois/src/context.cpp @@ -1,7 +1,7 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ -#include "deepgalois/context.h" +#include "deepgalois/Context.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" #include "galois/Galois.h" From 4eaed77a7c641f3ea09e08ef5c6b606cc212d02e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 7 May 2020 15:35:25 -0500 Subject: [PATCH 269/660] net -> Net; also getting context --- libdeepgalois/CMakeLists.txt | 58 ++++++------------- .../include/deepgalois/{net.h => Net.h} | 0 .../src/{context.cpp => Context.cpp} | 0 libdeepgalois/src/{net.cpp => Net.cpp} | 2 +- libdeepgalois/src/net.cu | 2 +- 5 files changed, 20 insertions(+), 42 deletions(-) rename libdeepgalois/include/deepgalois/{net.h => Net.h} (100%) rename libdeepgalois/src/{context.cpp => Context.cpp} (100%) rename libdeepgalois/src/{net.cpp => Net.cpp} (99%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 41e5130818..9a20111e0b 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -72,46 +72,24 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(NOT ENABLE_HETERO_GALOIS) - if(ENABLE_DIST_GALOIS) - # do not link regular context.cpp; TODO do this conditional in cleaner way - # also don't link sampler - set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/DistContext.cpp - src/optimizer.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp - ) - else() - set(sources - src/layers/softmax_loss_layer.cpp - src/layers/sigmoid_loss_layer.cpp - src/layers/graph_conv_layer.cpp - src/layers/leaky_relu_layer.cpp - src/layers/l2_norm_layer.cpp - src/layers/relu_layer.cpp - src/layers/aggregator.cpp - src/math_functions.cpp - src/optimizer.cpp - src/context.cpp - src/sampler.cpp - src/reader.cpp - src/lgraph.cpp - src/utils.cpp - src/node.cpp - src/net.cpp - ) - endif(ENABLE_DIST_GALOIS) + set(sources + src/layers/softmax_loss_layer.cpp + src/layers/sigmoid_loss_layer.cpp + src/layers/graph_conv_layer.cpp + src/layers/leaky_relu_layer.cpp + src/layers/l2_norm_layer.cpp + src/layers/relu_layer.cpp + src/layers/aggregator.cpp + src/math_functions.cpp + src/optimizer.cpp + src/Context.cpp + src/sampler.cpp + src/reader.cpp + 
src/lgraph.cpp + src/utils.cpp + src/node.cpp + src/Net.cpp + ) else() # dummy sources set for dg_cpu for HETERO build # TODO fix this diff --git a/libdeepgalois/include/deepgalois/net.h b/libdeepgalois/include/deepgalois/Net.h similarity index 100% rename from libdeepgalois/include/deepgalois/net.h rename to libdeepgalois/include/deepgalois/Net.h diff --git a/libdeepgalois/src/context.cpp b/libdeepgalois/src/Context.cpp similarity index 100% rename from libdeepgalois/src/context.cpp rename to libdeepgalois/src/Context.cpp diff --git a/libdeepgalois/src/net.cpp b/libdeepgalois/src/Net.cpp similarity index 99% rename from libdeepgalois/src/net.cpp rename to libdeepgalois/src/Net.cpp index ebd19639da..ede45fe2a3 100644 --- a/libdeepgalois/src/net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -4,7 +4,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" -#include "deepgalois/net.h" +#include "deepgalois/Net.h" #include "deepgalois/math_functions.hh" namespace deepgalois { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/net.cu index cd635ef07f..f1bbe97c94 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/net.cu @@ -1,4 +1,4 @@ -#include "deepgalois/net.h" +#include "deepgalois/Net.h" #include "deepgalois/cutils.h" #include "deepgalois/math_functions.hh" #include "gg.h" From 19a772fd0ce94e9466666fea27f041d6bedb41b0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 14:54:29 -0500 Subject: [PATCH 270/660] mainly sampler function commenting, renaming to get a better understanding now it works --- libdeepgalois/include/deepgalois/Context.h | 3 +- libdeepgalois/include/deepgalois/Net.h | 14 +- libdeepgalois/include/deepgalois/sampler.h | 37 +++-- libdeepgalois/src/Context.cpp | 117 ++++++++------- libdeepgalois/src/DistContext.cpp | 5 +- libdeepgalois/src/context.cu | 2 +- libdeepgalois/src/sampler.cpp | 158 +++++++++++---------- 7 files changed, 177 insertions(+), 159 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 106427ccf7..519a75d7f3 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -58,7 +58,8 @@ class Context { void norm_factor_computing(bool is_subgraph, int subg_id = 0); void gen_subgraph_labels(size_t m, const mask_t* masks); void gen_subgraph_feats(size_t m, const mask_t* masks); - void createSubgraphs(int num_subgraphs); + //! 
Allocate subgraphs (but don't actually do sampling yet) + void allocateSubgraphs(int num_subgraphs); #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index ac934d0c7d..d478d83e4c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -147,18 +147,16 @@ class Net { double total_train_time = 0.0; int num_subg_remain = 0; -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST + if (subgraph_sample_size) { - context->createSubgraphs(num_subgraphs); + context->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples * num_subgraphs]; - std::cout << "\nConstruct training vertex set induced graph...\n"; - sampler->set_masked_graph(train_begin, train_end, train_count, - train_masks, context->getGraphPointer()); + galois::gPrint(header, " Construct training vertex set induced graph...\n"; + sampler->initializeMaskedGraph(train_count, train_masks, context->getGraphPointer()); } -#endif -#endif + std::cout << "\nStart training...\n"; + Timer t_epoch; // run epochs for (int ep = 0; ep < num_epochs; ep++) { diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/sampler.h index ab0fb03a25..7934b28aa7 100644 --- a/libdeepgalois/include/deepgalois/sampler.h +++ b/libdeepgalois/include/deepgalois/sampler.h @@ -15,10 +15,12 @@ class Sampler { Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} - // sample a subgraph sg of size n from graph g + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); - // !API function for user-defined selection strategy + //! API function for user-defined selection strategy + // TODO how to expose this? virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet& vertex_set); virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, @@ -33,31 +35,38 @@ class Sampler { edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - void set_masked_graph(size_t begin, size_t end, size_t count, mask_t* masks, - Graph* g); + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. + void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g); protected: int m_; size_t count_; - size_t begin_; - size_t end_; + + //! averaged degree of masked graph int avg_deg; + //! average degree cut off to a clip int subg_deg; - VertexList vertices_; + //! list of vertices active in the graph being maintained (masked_graph) + //VertexList vertices_; + //! List of training nodes; sampling set std::vector node_train; mask_t* masks_; + //! masked original graph; typically to the training set Graph* masked_graph; Graph* graph; - // Given a subset of vertices and a graph g, generate a subgraph sg from the - // graph g - void generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub); - void generate_masked_graph(size_t n, mask_t* masks, Graph* g, Graph& mg); + //! Reindex a graph to only contain those in the vertex set + void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! 
mg + void getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& mg); void get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector& degrees); - void update_masks(size_t n, VertexSet vertices, mask_t* masks); - inline VertexList reindexing_vertice(size_t n, VertexSet vertex_set); - void check_DB(std::vector& DB0, std::vector& DB1, + //! Set masks bitset with IDs in the vertices VertexSet + void getMasks(size_t n, VertexSet vertices, mask_t* masks); + inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); }; diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index d5cc9435ee..58526d7a96 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -22,46 +22,52 @@ Context::~Context() { // if (norm_factors_subg) delete[] norm_factors_subg; } -void Context::createSubgraphs(int num_subgraphs) { +void Context::allocateSubgraphs(int num_subgraphs) { subgraphs_cpu.resize(num_subgraphs); for (int i = 0; i < num_subgraphs; i++) subgraphs_cpu[i] = new Graph(); } -// generate labels for the subgraph, m is subgraph size +//! generate labels for the subgraph, m is subgraph size, mask +//! tells which vertices to use void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - if (is_single_class) { - h_labels_subg.resize(m); + if (Context::is_single_class) { + Context::h_labels_subg.resize(m); } else { - h_labels_subg.resize(m * num_classes); + Context::h_labels_subg.resize(m * Context::num_classes); } + size_t count = 0; + // see which labels to copy over for this subgraph for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - if (is_single_class) { - h_labels_subg[count] = h_labels[i]; + if (Context::is_single_class) { + Context::h_labels_subg[count] = h_labels[i]; } else { - std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, - &h_labels_subg[count * num_classes]); + std::copy(Context::h_labels + i * Context::num_classes, Context::h_labels + (i + 1) * Context::num_classes, + &Context::h_labels_subg[count * Context::num_classes]); } count++; } } + assert(count == m); } -// generate input features for the subgraph, m is subgraph size +//! generate input features for the subgraph, m is subgraph size, +//! 
masks tells which vertices to use void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - h_feats_subg.resize(m * feat_len); + Context::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, - &h_feats_subg[count * feat_len]); + std::copy(Context::h_feats + i * Context::feat_len, Context::h_feats + (i + 1) * Context::feat_len, + &Context::h_feats_subg[count * Context::feat_len]); count++; } } + assert(count == m); } size_t Context::read_graph(bool selfloop) { @@ -80,63 +86,64 @@ size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; printf("Reading .gr file: %s\n", filename.c_str()); if (selfloop) { + galois::gWarn("SELF LOOPS NOT SUPPORTED AT THIS TIME"); Graph graph_temp; // galois::graphs::readGraph(graph_temp, filename); graph_temp.readGraph(dataset); add_selfloop(graph_temp, *graph_cpu); is_selfloop_added = selfloop; //} else galois::graphs::readGraph(*graph_cpu, filename); - } else + } else { graph_cpu->readGraph(dataset); + } // TODO dist version of self loop } else { - printf("Unkown file format\n"); - exit(1); + GALOIS_DIE("unknown file format for readgraph"); } Tread.stop(); + auto g = getGraphPointer(); - std::cout << "num_vertices " << g->size() << " num_edges " << g->sizeEdges() - << "\n"; - n = g->size(); - return n; + galois::gPrint("num_vertices ", g->size(), " num_edges ", g->sizeEdges(), + "\n"); + return g->size(); } void Context::add_selfloop(Graph& og, Graph& g) { + // TODO not actually implemented yet g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); - /* - for (size_t src = 0; src < og.size(); src++) { - //g.getData(src) = 1; - auto begin = og.edge_begin(src); - auto end = og.edge_end(src); - g.fixEndEdge(src, end+src+1); - bool self_inserted = false; - if (begin == end) { - new_edge_dst[begin+i] = i; - continue; - } - for (auto e = begin; e != end; e++) { - auto dst = og.getEdgeDst(e); - if (!self_inserted) { - if (dst > src) { - g.constructEdge(e+src, src, 0); - g.constructEdge(e+src+1, dst, 0); - self_inserted = true; - } else if (e+1 == end) { - g.constructEdge(e+src+1, src, 0); - g.constructEdge(e+src, dst, 0); - self_inserted = true; - } else g.constructEdge(e+src, dst, 0); - } else g.constructEdge(e+src+1, dst, 0); - } - } - //*/ + //for (size_t src = 0; src < og.size(); src++) { + // //g.getData(src) = 1; + // auto begin = og.edge_begin(src); + // auto end = og.edge_end(src); + // g.fixEndEdge(src, end+src+1); + // bool self_inserted = false; + // if (begin == end) { + // new_edge_dst[begin+i] = i; + // continue; + // } + // for (auto e = begin; e != end; e++) { + // auto dst = og.getEdgeDst(e); + // if (!self_inserted) { + // if (dst > src) { + // g.constructEdge(e+src, src, 0); + // g.constructEdge(e+src+1, dst, 0); + // self_inserted = true; + // } else if (e+1 == end) { + // g.constructEdge(e+src+1, src, 0); + // g.constructEdge(e+src, dst, 0); + // self_inserted = true; + // } else g.constructEdge(e+src, dst, 0); + // } else g.constructEdge(e+src+1, dst, 0); + // } + //} } void Context::alloc_norm_factor() { Graph* g = getGraphPointer(); if (norm_factors == NULL) #ifdef USE_MKL + // TODO why does MKL use size edges norm_factors = new float_t[g->sizeEdges()]; #else norm_factors = new float_t[g->size()]; @@ -145,19 +152,19 @@ void Context::alloc_norm_factor() { void 
Context::alloc_subgraph_norm_factor(int subg_id) { Graph* g = getSubgraphPointer(subg_id); - // if (norm_factors_subg == NULL) #ifdef USE_MKL - // norm_factors_subg = new float_t[g->sizeEdges()]; norm_factors_subg.resize(g->sizeEdges()); #else norm_factors_subg.resize(g->size()); - // norm_factors_subg = new float_t[g->size()]; #endif + norm_factors_subg.clear(); } void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t* constants; + + // grab orig or subgraph pointer as necessary if (!is_subgraph) { g = getGraphPointer(); alloc_norm_factor(); @@ -167,6 +174,7 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { alloc_subgraph_norm_factor(subg_id); constants = get_norm_factors_subg_ptr(); } + auto g_size = g->size(); g->degree_counting(); #ifdef USE_MKL @@ -265,13 +273,4 @@ void Context::read_edgelist(const char* filename, bool symmetrize, } } -/* -inline void init_features(size_t dim, vec_t &x) { - std::default_random_engine rng; - std::uniform_real_distribution dist(0, 0.1); - for (size_t i = 0; i < dim; ++i) - x[i] = dist(rng); -} -*/ - } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 1da6c6c5a1..3332aeabaf 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,13 +6,12 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -void DistContext::saveGraph(Graph* dGraph) { +void DistContext::saveGraph(DGraph* dGraph) { graph_cpu = dGraph; - localVertices = graph_cpu->size(); } -size_t DistContext::read_labels(std::string dataset_str) { +size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/context.cu index 365bef8e50..05a1b0cd8f 100644 --- a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/context.cu @@ -99,7 +99,7 @@ Context::~Context() { CUDA_CHECK(cudaFree(norm_factors)); } -void Context::createSubgraphs(int n_sg) {} +void Context::allocateSubgraphs(int n_sg) {} void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/sampler.cpp index 0ac77526f3..727a95eb55 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/sampler.cpp @@ -12,57 +12,57 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::set_masked_graph(size_t begin, size_t end, size_t count, - mask_t* masks, Graph* g) { - // galois::gPrint("Set masked graph: begin=", begin, ", end=", end, ", - // count=", count, "\n"); - begin_ = begin; - end_ = end; - count_ = count; - masks_ = masks; - graph = g; -#ifndef GALOIS_USE_DIST - masked_graph = new Graph(); -#endif - // generate_masked_graph(g->size(), masks, g, *masked_graph); +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { + this->count_ = count; + this->masks_ = masks; + // save original graph + Sampler::graph = g; + // allocate the object for the new masked graph + Sampler::masked_graph = new Graph(); + std::vector degrees(g->size(), 0); - get_masked_degrees(g->size(), masks, g, degrees); + // get degrees of nodes that will be in new graph + this->get_masked_degrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = 
offsets[g->size()]; + + // save ids (on original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { if (masks[i] == 1) - node_train.push_back(i); + Sampler::node_train.push_back(i); } - masked_graph->allocateFrom(g->size(), ne); - masked_graph->constructNodes(); + + Sampler::masked_graph->allocateFrom(g->size(), ne); + Sampler::masked_graph->constructNodes(); + // same as original graph, except keep only edges involved in masks galois::do_all( galois::iterate((size_t)0, g->size()), [&](const auto src) { - masked_graph->fixEndEdge(src, offsets[src + 1]); + Sampler::masked_graph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) - masked_graph->constructEdge(idx++, dst, 0); + Sampler::masked_graph->constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); - masked_graph->degree_counting(); - avg_deg = masked_graph->sizeEdges() / masked_graph->size(); - subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - // galois::gPrint("Train graph: num_vertices ", masked_graph->size(), " - // num_edges ", masked_graph->sizeEdges(), " avg_degree ", avg_deg, "\n"); - size_t idx = 0; - vertices_.resize(count); - for (size_t i = begin; i < end; i++) { - if (masks_[i] == 1) - vertices_[idx++] = i; - } + Sampler::masked_graph->degree_counting(); + Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + + //size_t idx = 0; + //vertices_.resize(count); + //for (size_t i = begin; i < end; i++) { + // if (masks_[i] == 1) + // vertices_[idx++] = i; + //} } +//! determine degree of each vertex in a masked graph (given by masks and g) void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, std::vector& degrees) { assert(degrees.size() == n); @@ -87,18 +87,22 @@ void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, #endif } -void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, +//! returns a graph in the variable sub: it is g with the mask applied +void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); - get_masked_degrees(n, masks, g, degrees); + this->get_masked_degrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", // ne, "\n"); -#ifndef GALOIS_USE_DIST + // + + // note this constructs the full graph's nodes; just trims edges sub.allocateFrom(n, ne); sub.constructNodes(); + #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, n), @@ -123,7 +127,9 @@ void Sampler::generate_masked_graph(size_t n, mask_t* masks, Graph* g, #endif } -void Sampler::check_DB(std::vector& DB0, std::vector& DB1, + +// helper function for graph saint implementation below +void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); @@ -135,6 +141,7 @@ void Sampler::check_DB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } +//! 
debug function: prints out sets of vertices void print_vertex_set(VertexSet vertex_set) { unsigned counter = 0; unsigned n = vertex_set.size(); @@ -148,9 +155,11 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { - // unsigned myseed = time(NULL); - unsigned myseed = tid + time(NULL); +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { + unsigned myseed = seed; + // unsigned myseed = tid; // DBx: Dashboard line x, IAx: Index array line x std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; @@ -172,11 +181,12 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { // printf("( "); // for (size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; // printf(")\n"); + for (int i = 0; i < m; i++) { - auto rand_idx = rand_r(&myseed) % node_train.size(); - db_t v = IA3[i] = node_train[rand_idx]; - st.insert(v); - IA0[i] = getDegree(masked_graph, v); + auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); + db_t v = IA3[i] = Sampler::node_train[rand_idx]; + st.iisert(v); + IA0[i] = getDegree(Sampler::masked_graph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; IA2[i] = 0; @@ -187,7 +197,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { for (int i = 1; i < m; i++) IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly - check_DB(DB0, DB1, DB2, IA2[m - 1]); + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); for (int i = 0; i < m; i++) { db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; db_t DB_end = IA2[i]; @@ -209,16 +219,16 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { } choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); db_t v = DB0[choose]; - auto degree = getDegree(masked_graph, v); + auto degree = getDegree(Sampler::masked_graph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = masked_graph->getEdgeDst(masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::masked_graph->getEdgeDst(Sampler::masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; for (auto i = choose; i < choose - DB1[choose]; i++) DB0[i] = db_t(-1); - newsize = getDegree(masked_graph, neigh_v); + newsize = getDegree(Sampler::masked_graph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; } else newsize = 0; @@ -270,7 +280,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { IA2.resize(curr); IA3.resize(curr); } - check_DB(DB0, DB1, DB2, newsize + DB0.size()); + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); IA0.push_back(newsize); IA1.push_back(1); IA2.push_back(IA2.back() + IA0.back()); @@ -287,11 +297,12 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned tid) { // print_vertex_set(st); } -// !API function for user-defined selection strategy +// API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. // nv: number of vertices in the original graph; // n: number of vertices in the subgraph; // m: number of vertices in the frontier. 
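Stripped of the dashboard bookkeeping (the DB*/IA* arrays, which exist to make the degree-weighted slot choice and its updates cheap), the routine above is the GraphSAINT frontier sampler: seed a frontier of m training vertices, repeatedly replace one frontier slot with a random neighbour of the vertex it holds, and record every visited vertex until n distinct vertices are collected. A simplified, unweighted sketch of that idea (illustrative names, not the Sampler API; rand_r is POSIX):

#include <cstdlib>
#include <set>
#include <vector>

// Simplified frontier sampling in the spirit of GraphSAINT. The code above
// additionally weights the slot choice by clipped degree; here every slot
// is equally likely. adj[v] is the neighbour list of v. Illustrative only.
std::set<uint32_t> frontierSample(const std::vector<std::vector<uint32_t>>& adj,
                                  const std::vector<uint32_t>& trainNodes,
                                  size_t n, size_t m, unsigned seed) {
  std::set<uint32_t> sample;
  std::vector<uint32_t> frontier(m);
  for (size_t i = 0; i < m; ++i) { // seed the frontier from the training set
    frontier[i] = trainNodes[rand_r(&seed) % trainNodes.size()];
    sample.insert(frontier[i]);
  }
  size_t attempts = 0, maxAttempts = 100 * n; // bail out on tiny components
  while (sample.size() < n && attempts++ < maxAttempts) {
    size_t slot = rand_r(&seed) % m;
    uint32_t v  = frontier[slot];
    if (adj[v].empty()) continue;               // dead end; pick another slot
    uint32_t u = adj[v][rand_r(&seed) % adj[v].size()];
    frontier[slot] = u;                         // walk to a random neighbour
    sample.insert(u);
  }
  return sample;
}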
+// our implementation of GraphSAINT sampling void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, @@ -334,14 +345,14 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, */ } -void Sampler::update_masks(size_t n, VertexSet vertices, mask_t* masks) { +void Sampler::getMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { +inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { @@ -352,13 +363,13 @@ inline VertexList Sampler::reindexing_vertice(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { - // auto n = g.size(); // old graph size - auto nv = vertex_set.size(); // new graph (subgraph) size - VertexList new_ids = reindexing_vertice(graph->size(), vertex_set); +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { + // auto n = origGraph.size(); // old graph size + auto nv = keptVertices.size(); // new graph (subgraph) size + VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); std::vector degrees(nv, 0); // degrees of vertices in the subgraph - for (auto v : vertex_set) { - degrees[new_ids[v]] = getDegree(&g, v); + for (auto v : keptVertices) { + degrees[new_ids[v]] = getDegree(&origGraph, v); } // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); @@ -366,9 +377,9 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, // "\n"); #ifndef GALOIS_USE_DIST - sub.allocateFrom(nv, ne); - sub.constructNodes(); - VertexList old_ids(vertex_set.begin(), vertex_set.end()); // vertex ID mapping + reindexGraph.allocateFrom(nv, ne); + reindexGraph.constructNodes(); + VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, nv), @@ -376,13 +387,13 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { #else for (size_t i = 0; i < nv; i++) { #endif - sub.fixEndEdge(i, offsets[i + 1]); + reindexGraph.fixEndEdge(i, offsets[i + 1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e = g.edge_begin(old_id); e != g.edge_end(old_id); e++) { - auto dst = new_ids[g.getEdgeDst(e)]; + for (auto e = origGraph.edge_begin(old_id); e != origGraph.edge_end(old_id); e++) { + auto dst = new_ids[origGraph.getEdgeDst(e)]; assert(dst < nv); - sub.constructEdge(offsets[i] + j, dst, 0); + reindexGraph.constructEdge(offsets[i] + j, dst, 0); j++; } } @@ -395,19 +406,20 @@ void Sampler::generate_subgraph(VertexSet& vertex_set, Graph& g, Graph& sub) { void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid) { - VertexSet vertex_set; // n = 9000 by default - // select_vertices(count_, n, m_, masked_graph, vertices_, vertex_set); // m = - // 1000 by default - select_vertices(n, m_, vertex_set, tid); // m = 1000 by default - update_masks(graph->size(), vertex_set, - 
masks); // set masks for vertices in the vertex_set -#ifndef GALOIS_USE_DIST + VertexSet sampledSet; + // n = 9000 by default + // this->select_vertices(count_, n, m_, masked_graph, vertices_, sampledSet); + + // do the sampling of vertices from training set + using masked graph + this->select_vertices(n, m_, sampledSet, tid); // m = 1000 by default + + // create the masks on the masked_graph + getMasks(Sampler::graph->size(), sampledSet, masks); + Graph masked_sg; - generate_masked_graph( - graph->size(), masks, masked_graph, + this->getMaskedGraph(Sampler::graph->size(), masks, Sampler::masked_graph, masked_sg); // remove edges whose destination is not masked - generate_subgraph(vertex_set, masked_sg, sg); -#endif + this->reindexSubgraph(sampledSet, masked_sg, sg); } } // namespace deepgalois From 541745380f9cd1ddcb081ff320a72cbdc513b90f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 16:01:23 -0500 Subject: [PATCH 271/660] sampler renaming --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Net.h | 7 ++----- libdeepgalois/include/deepgalois/{sampler.h => Sampler.h} | 0 libdeepgalois/src/{sampler.cpp => Sampler.cpp} | 2 +- 4 files changed, 4 insertions(+), 7 deletions(-) rename libdeepgalois/include/deepgalois/{sampler.h => Sampler.h} (100%) rename libdeepgalois/src/{sampler.cpp => Sampler.cpp} (99%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 9a20111e0b..58309084b1 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -83,7 +83,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/math_functions.cpp src/optimizer.cpp src/Context.cpp - src/sampler.cpp + src/Sampler.cpp src/reader.cpp src/lgraph.cpp src/utils.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d478d83e4c..61fb1034c7 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -10,13 +10,10 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #include "deepgalois/utils.h" -#ifdef CPU_ONLY -#include "deepgalois/sampler.h" -#endif -#ifndef GALOIS_USE_DIST +#include "deepgalois/Sampler.h" #include "deepgalois/Context.h" -#else #include "deepgalois/GraphTypes.h" + #include "deepgalois/DistContext.h" #endif diff --git a/libdeepgalois/include/deepgalois/sampler.h b/libdeepgalois/include/deepgalois/Sampler.h similarity index 100% rename from libdeepgalois/include/deepgalois/sampler.h rename to libdeepgalois/include/deepgalois/Sampler.h diff --git a/libdeepgalois/src/sampler.cpp b/libdeepgalois/src/Sampler.cpp similarity index 99% rename from libdeepgalois/src/sampler.cpp rename to libdeepgalois/src/Sampler.cpp index 727a95eb55..aa6fb6d686 100644 --- a/libdeepgalois/src/sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -1,5 +1,5 @@ #include "deepgalois/utils.h" -#include "deepgalois/sampler.h" +#include "deepgalois/Sampler.h" #include "galois/Galois.h" #include #include From a4f1c063f994c394db7258d27e2596af84a1d119 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 17:55:27 -0500 Subject: [PATCH 272/660] renaming net things for easier understanding; getting ready for dist sampling --- .../include/deepgalois/DistContext.h | 16 -- libdeepgalois/include/deepgalois/Net.h | 216 ++++++++++-------- libdeepgalois/src/DistContext.cpp | 20 -- libdeepgalois/src/Net.cpp | 63 ++--- lonestar/gnn/gcn/gcn.cpp | 12 +- 5 files changed, 160 insertions(+), 167 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h 
b/libdeepgalois/include/deepgalois/DistContext.h index 7069c1a0d7..e1b76fa00c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,7 +10,6 @@ namespace deepgalois { class DistContext { -protected: size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D @@ -23,12 +22,6 @@ class DistContext { label_t* h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - float_t* norm_factors_subg; // normalization constant for subgraph public: DistContext(); @@ -36,23 +29,14 @@ class DistContext { //! save graph pointer to context object void saveDistGraph(Graph* dGraph); - //! read labels of local nodes only size_t read_labels(std::string dataset_str); - //! read features of local nodes only size_t read_features(std::string dataset_str); - //! read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); - //! find norm factor by looking at degree - // TODO this is a distributed operation - void norm_factor_computing(bool is_subgraph, int subg_id = 0); - // void createSubgraphs(int num_subgraphs) {} - // void gen_subgraph_labels(size_t m, const mask_t *masks) {} - // void gen_subgraph_feats(size_t m, const mask_t *masks) {} // TODO define these void createSubgraphs(int) {} void gen_subgraph_labels(size_t, const mask_t*) {} diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 61fb1034c7..59674abc41 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -24,6 +24,67 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; + + bool is_single_class; // single-class (one-hot) or multi-class label + bool has_l2norm; // whether the net contains an l2_norm layer + bool has_dense; // whether the net contains an dense layer + unsigned neighbor_sample_size; // neighbor sampling + unsigned subgraph_sample_size; // subgraph sampling + int num_threads; // number of threads + size_t num_samples; // number of samples: N + size_t distNumSamples; // number of samples: N + size_t num_classes; // number of vertex classes: E + size_t num_conv_layers; // number of convolutional layers + size_t num_layers; // total number of layers (conv + output) + int num_epochs; // number of epochs + float learning_rate; // learning rate + float dropout_rate; // dropout rate + float weight_decay; // weighti decay for over-fitting + // begins/ends below are global ids + size_t globalTrainBegin; + size_t globalTrainEnd; + size_t globalTrainCount; + size_t globalValBegin; + size_t globalValEnd; + size_t globalValCount; + size_t globalTestBegin; + size_t globalTestEnd; + size_t globalTestCount; + int val_interval; + int num_subgraphs; + int num_vertices_sg; + bool is_selfloop; + + mask_t* 
globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation + mask_t* distTrainMasks; + mask_t* distValMasks; + mask_t* test_masks; // masks for test + + + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device + + mask_t* subgraphs_masks; // masks for subgraphs + std::vector feature_dims; // feature dimnesions for each layer + std::vector layers; // all the layers in the neural network + + // one context is for entire graph; other is for partitioned graph + // TODO optimize single host case + + //! context holds all of the graph data + deepgalois::Context* context; + //! dist context holds graph data of the partitioned graph only + deepgalois::DistContext* distContext; + + DGraph* dGraph; + + Sampler* sampler; + public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, @@ -34,13 +95,19 @@ class Net { num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host + this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = "\n"; + assert(n_conv > 0); + // TODO use galois print - std::cout << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers << ", num_epochs " - << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " - << learning_rate << ", dropout_rate " << dropout_rate - << ", weight_decay " << weight_decay << "\n"; + galois>>gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); num_layers = num_conv_layers + 1; // additional layers to add @@ -62,30 +129,34 @@ class Net { num_classes = context->read_labels(); // get training and validation sets - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks + num_samples, 0); - std::fill(val_masks, val_masks + num_samples, 0); + globalTrainMasks = new mask_t[num_samples]; + globalValMasks = new mask_t[num_samples]; + std::fill(globalTrainMasks, globalTrainMasks + num_samples, 0); + std::fill(globalValMasks, globalValMasks + num_samples, 0); // reddit is hard coded if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + // TODO do all can be used below - for (size_t i = train_begin; i < train_end; i++) - train_masks[i] = 1; - for (size_t i = val_begin; i < val_end; i++) - val_masks[i] = 1; + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) + globalValMasks[i] = 1; } else { - train_count = context->read_masks("train", num_samples, train_begin, - train_end, train_masks); - val_count = 
context->read_masks("val", num_samples, val_begin, val_end, - val_masks); + globalTrainCount = context->read_masks("train", num_samples, globalTrainBegin, + globalTrainEnd, globalTrainMasks); + globalValCount = context->read_masks("val", num_samples, globalValBegin, globalValEnd, + globalValMasks); } // make sure sampel size isn't greater than what we have to train with - if (subgraph_sample_size > train_count) { + if (subgraph_sample_size > globalTrainCount) { GALOIS_DIE("subgraph size can not be larger than the size of training " "set\n"); } @@ -118,17 +189,13 @@ class Net { // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - // train_begin(0), train_end(0), train_count(0), val_begin(0), val_end(0), - // val_count(0), test_begin(0), test_end(0), test_count(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), globalValBegin(0), globalValEnd(0), + // globalValCount(0), globalTestBegin(0), globalTestEnd(0), globalTestCount(0), // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - // train_masks(NULL), val_masks(NULL), test_masks(NULL), context(NULL) {} - - //! save graph pointer to context object - void saveDistGraph(Graph* dGraph); + // globalTrainMasks(NULL), globalValMasks(NULL), test_masks(NULL), context(NULL) {} -#ifdef GALOIS_USE_DIST - void dist_init(Graph* graph, std::string dataset_str); -#endif + //! Initializes metadata for the partition + void partitionInit(DGraph* graph, std::string dataset_str); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -138,9 +205,6 @@ class Net { void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -149,7 +213,7 @@ class Net { context->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[num_samples * num_subgraphs]; galois::gPrint(header, " Construct training vertex set induced graph...\n"; - sampler->initializeMaskedGraph(train_count, train_masks, context->getGraphPointer()); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } std::cout << "\nStart training...\n"; @@ -269,10 +333,10 @@ class Net { size_t begin = 0, end = 0, count = 0; mask_t* masks = NULL; if (type == "train") { - begin = train_begin; - end = train_end; - count = train_count; - masks = train_masks; + begin = globalTrainBegin; + end = globalTrainEnd; + count = globalTrainCount; + masks = globalTrainMasks; if (subgraph_sample_size) { // update masks for subgraph masks = NULL; @@ -281,14 +345,14 @@ class Net { count = num_vertices_sg; } } else if (type == "val") { - begin = val_begin; - end = val_end; - count = val_count; - masks = val_masks; + begin = globalValBegin; + end = globalValEnd; + count = globalValCount; + masks = globalValMasks; } else { - begin = test_begin; - end = test_end; - count = test_count; + begin = globalTestBegin; + end = globalTestEnd; + count = globalTestCount; masks = test_masks; } #ifdef CPU_ONLY @@ -334,14 +398,14 @@ class Net { void read_test_masks(std::string dataset) { test_masks = new mask_t[num_samples]; if (dataset == "reddit") { - test_begin = 177262; - test_count 
= 55703; - test_end = test_begin + test_count; + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; #ifndef GALOIS_USE_DIST - for (size_t i = test_begin; i < test_end; i++) + for (size_t i = globalTestBegin; i < globalTestEnd; i++) test_masks[i] = 1; #else - for (size_t i = test_begin; i < test_end; i++) { + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } @@ -349,11 +413,11 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - test_count = context->read_masks("test", num_samples, test_begin, - test_end, test_masks); + globalTestCount = context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks); #else - test_count = context->read_masks("test", num_samples, test_begin, - test_end, test_masks, dGraph); + globalTestCount = context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks, dGraph); #endif } #ifndef CPU_ONLY @@ -492,50 +556,6 @@ class Net { layers[i]->print_layer_info(); } -protected: - bool is_single_class; // single-class (one-hot) or multi-class label - bool has_l2norm; // whether the net contains an l2_norm layer - bool has_dense; // whether the net contains an dense layer - unsigned neighbor_sample_size; // neighbor sampling - unsigned subgraph_sample_size; // subgraph sampling - int num_threads; // number of threads - size_t num_samples; // number of samples: N - size_t num_classes; // number of vertex classes: E - size_t num_conv_layers; // number of convolutional layers - size_t num_layers; // total number of layers (conv + output) - int num_epochs; // number of epochs - float learning_rate; // learning rate - float dropout_rate; // dropout rate - float weight_decay; // weighti decay for over-fitting - size_t train_begin, train_end, train_count; - size_t val_begin, val_end, val_count; - size_t test_begin, test_end, test_count; - int val_interval; - int num_subgraphs; - int num_vertices_sg; - bool is_selfloop; - - mask_t* train_masks; // masks for training - mask_t* d_train_masks; // masks for training on device - mask_t* val_masks; // masks for validation - mask_t* d_val_masks; // masks for validation on device - mask_t* test_masks; // masks for test - mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs - std::vector feature_dims; // feature dimnesions for each layer - std::vector layers; // all the layers in the neural network -#ifndef GALOIS_USE_DIST - deepgalois::Context* context; -#else - deepgalois::DistContext* context; - Graph* dGraph; -#endif - -#ifdef CPU_ONLY -#ifndef GALOIS_USE_DIST - Sampler* sampler; -#endif -#endif // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 3332aeabaf..f7ad18bc22 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -150,26 +150,6 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, float_t* DistContext::get_in_ptr() { return &h_feats[0]; } -// void DistContext::norm_factor_computing(bool is_subgraph, int subg_id) { -void DistContext::norm_factor_computing(bool, int) { - // TODO: this is a distributed operation - - // create for now, TODO need to actually fill it in - norm_factors = new float_t[localVertices]; - galois::do_all( - 
galois::iterate((size_t)0, localVertices), - [&](auto v) { norm_factors[v] = 1; }, galois::loopname("NormCounting")); - - // galois::do_all(galois::iterate((size_t)0, localVertices), - // [&](auto v) { - // auto degree = std::distance(graph_cpu->edge_begin(v), - // graph_cpu->edge_end(v)); float_t temp = std::sqrt(float_t(degree)); if - // (temp == 0.0) norm_factors[v] = 0.0; else norm_factors[v] = 1.0 / temp; - // }, galois::loopname("NormCounting")); - - return; -} - void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index ede45fe2a3..a986ec194d 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -10,44 +10,51 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST -void Net::dist_init(Graph* graph, std::string dataset_str) { - dGraph = graph; - context = new deepgalois::DistContext(); - num_samples = dGraph->size(); - context->saveGraph(dGraph); - // TODO self loop setup? - context->initializeSyncSubstrate(); - num_classes = context->read_labels(); +void Net::partitionInit(DGraph* graph, std::string dataset_str) { + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); + this->distContext->saveDistGraph(dGraph); + this->distNumSamples = this->dGraph->size(); + + // TODO self loop setup would have to be done before this during partitioning + // or on master node only + + this->distContext->initializeSyncSubstrate(); + num_classes = this->distContext->read_labels(); // std::cout << "Reading label masks ... "; - train_masks = new mask_t[num_samples]; - val_masks = new mask_t[num_samples]; - std::fill(train_masks, train_masks + num_samples, 0); - std::fill(val_masks, val_masks + num_samples, 0); + this->distTrainMasks = new mask_t[this->distNumSamples]; + this->distValMasks = new mask_t[this->distNumSamples]; + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, 0); + std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); if (dataset_str == "reddit") { - train_begin = 0, train_count = 153431, - train_end = train_begin + train_count; - val_begin = 153431, val_count = 23831, val_end = val_begin + val_count; + //this->globalTrainBegin = 0; + //this->globalTrainCount = 153431; + //this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + //this->globalValBegin = 153431; + //this->globalValCount = 23831; + //this->globalValEnd = this->globalValBegin + this->globalValCount; + // find local ID from global ID, set if it exists - for (size_t i = train_begin; i < train_end; i++) { - if (dGraph->isLocal(i)) { - train_masks[dGraph->getLID(i)] = 1; + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distTrainMasks[this->dGraph->getLID(i)] = 1; } } - for (size_t i = val_begin; i < val_end; i++) { - if (dGraph->isLocal(i)) { - val_masks[dGraph->getLID(i)] = 1; + for (size_t i = globalValBegin; i < globalValEnd; i++) { + if (this->dGraph->isLocal(i)) { + this->distValMasks[this->dGraph->getLID(i)] = 1; } } } else { - train_count = context->read_masks("train", num_samples, train_begin, - train_end, train_masks, dGraph); - val_count = context->read_masks("val", num_samples, val_begin, val_end, - val_masks, dGraph); + globalTrainCount = this->distContext->read_masks("train", this->distNumSamples, globalTrainBegin, + globalTrainEnd, this->distTrainMasks, 
this->dGraph); + globalValCount = this->distContext->read_masks("val", this->distNumSamples, globalValBegin, globalValEnd, + this->distValMasks, this->dGraph); } - feature_dims[0] = context->read_features(); // input feature dimension: D + feature_dims[0] = this->distContext->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -113,10 +120,10 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, #else // only look at owned nodes (i.e. masters); the prediction for these // should only by handled on the owner - if (dGraph->isOwned(i)) { + if (this->dGraph->isOwned(i)) { sampleCount += 1; - uint32_t localID = dGraph->getLID(i); + uint32_t localID = this->dGraph->getLID(i); if (masks[localID] == 1) { // get prediction auto pred = diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index f2d08d3cb3..702fc63516 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -13,16 +13,18 @@ int main(int argc, char** argv) { galois::DistMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train: loads the entire graph on CPU + // Get a partitioned graph first + std::vector dummyVec; + deepgalois::Graph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + network.dist_init(dGraph, dataset); + + // initialize entire on CPU deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); - std::vector dummyVec; - deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); // read network, features, ground truth, initialize metadata // default setting for now; can be customized by the user From 2f90bf93ce4f9b27d01c46adfa13479de740d5b1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 17:58:02 -0500 Subject: [PATCH 273/660] re-clang formatting --- libdeepgalois/include/deepgalois/GraphTypes.h | 4 +- libdeepgalois/include/deepgalois/Net.h | 79 ++++++++++--------- libdeepgalois/include/deepgalois/Sampler.h | 4 +- libdeepgalois/src/Context.cpp | 10 ++- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/Net.cpp | 32 ++++---- libdeepgalois/src/Sampler.cpp | 29 ++++--- libdeepgalois/src/math_functions.cu | 4 +- 8 files changed, 88 insertions(+), 76 deletions(-) diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 0ef3fb4a77..ba241c53f5 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -11,6 +11,6 @@ namespace deepgalois { using index_t = edge_iterator; -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 59674abc41..195c524a2d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -24,9 +24,9 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - std::string header = "[" 
+ std::to_string(myID) + "] "; - std::string seperator = "\n"; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + std::string header = "[" + std::to_string(myID) + "] "; + std::string seperator = "\n"; bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer @@ -35,7 +35,7 @@ class Net { unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads size_t num_samples; // number of samples: N - size_t distNumSamples; // number of samples: N + size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) @@ -58,16 +58,15 @@ class Net { int num_vertices_sg; bool is_selfloop; - mask_t* globalTrainMasks; // masks for training - mask_t* globalValMasks; // masks for validation + mask_t* globalTrainMasks; // masks for training + mask_t* globalValMasks; // masks for validation mask_t* distTrainMasks; mask_t* distValMasks; - mask_t* test_masks; // masks for test + mask_t* test_masks; // masks for test - - mask_t* d_train_masks; // masks for training on device - mask_t* d_val_masks; // masks for validation on device - mask_t* d_test_masks; // masks for test on device + mask_t* d_train_masks; // masks for training on device + mask_t* d_val_masks; // masks for validation on device + mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs std::vector feature_dims; // feature dimnesions for each layer @@ -96,18 +95,18 @@ class Net { learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host - this->myID = galois::runtime::getSystemNetworkInterface().ID; - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = "\n"; + this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = "\n"; assert(n_conv > 0); // TODO use galois print - galois>>gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + galois >> gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); num_layers = num_conv_layers + 1; // additional layers to add @@ -138,10 +137,10 @@ class Net { if (dataset_str == "reddit") { this->globalTrainBegin = 0; this->globalTrainCount = 153431; - this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - this->globalValBegin = 153431; - this->globalValCount = 23831; - this->globalValEnd = this->globalValBegin + this->globalValCount; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; // TODO do all can be used below for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) @@ -149,10 +148,11 @@ class Net { for (size_t i = globalValBegin; i < globalValEnd; i++) globalValMasks[i] = 1; } else { - globalTrainCount = 
context->read_masks("train", num_samples, globalTrainBegin, - globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", num_samples, globalValBegin, globalValEnd, - globalValMasks); + globalTrainCount = + context->read_masks("train", num_samples, globalTrainBegin, + globalTrainEnd, globalTrainMasks); + globalValCount = context->read_masks("val", num_samples, globalValBegin, + globalValEnd, globalValMasks); } // make sure sampel size isn't greater than what we have to train with @@ -165,7 +165,7 @@ class Net { feature_dims[0] = context->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E @@ -184,15 +184,17 @@ class Net { } //! Default net constructor - //Net() + // Net() // : is_single_class(true), has_l2norm(false), has_dense(false), // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), - // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), weight_decay(0.0), - // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), globalValBegin(0), globalValEnd(0), - // globalValCount(0), globalTestBegin(0), globalTestEnd(0), globalTestCount(0), - // val_interval(1), num_subgraphs(1), num_vertices_sg(9000), - // globalTrainMasks(NULL), globalValMasks(NULL), test_masks(NULL), context(NULL) {} + // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), + // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), + // globalTrainCount(0), globalValBegin(0), globalValEnd(0), + // globalValCount(0), globalTestBegin(0), globalTestEnd(0), + // globalTestCount(0), val_interval(1), num_subgraphs(1), + // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), + // test_masks(NULL), context(NULL) {} //! Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str); @@ -413,11 +415,12 @@ class Net { #endif } else { #ifndef GALOIS_USE_DIST - globalTestCount = context->read_masks("test", num_samples, globalTestBegin, - globalTestEnd, test_masks); + globalTestCount = context->read_masks( + "test", num_samples, globalTestBegin, globalTestEnd, test_masks); #else - globalTestCount = context->read_masks("test", num_samples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestCount = + context->read_masks("test", num_samples, globalTestBegin, + globalTestEnd, test_masks, dGraph); #endif } #ifndef CPU_ONLY diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 7934b28aa7..578bb6abf7 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -48,7 +48,7 @@ class Sampler { //! average degree cut off to a clip int subg_deg; //! list of vertices active in the graph being maintained (masked_graph) - //VertexList vertices_; + // VertexList vertices_; //! 
List of training nodes; sampling set std::vector node_train; mask_t* masks_; @@ -67,7 +67,7 @@ class Sampler { void getMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size); + std::vector& DB2, size_t size); }; } // namespace deepgalois diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 58526d7a96..2fbf8e6617 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -45,7 +45,8 @@ void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { if (Context::is_single_class) { Context::h_labels_subg[count] = h_labels[i]; } else { - std::copy(Context::h_labels + i * Context::num_classes, Context::h_labels + (i + 1) * Context::num_classes, + std::copy(Context::h_labels + i * Context::num_classes, + Context::h_labels + (i + 1) * Context::num_classes, &Context::h_labels_subg[count * Context::num_classes]); } count++; @@ -54,7 +55,7 @@ void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { assert(count == m); } -//! generate input features for the subgraph, m is subgraph size, +//! generate input features for the subgraph, m is subgraph size, //! masks tells which vertices to use void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { size_t count = 0; @@ -62,7 +63,8 @@ void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { Context::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < n; i++) { if (masks[i] == 1) { - std::copy(Context::h_feats + i * Context::feat_len, Context::h_feats + (i + 1) * Context::feat_len, + std::copy(Context::h_feats + i * Context::feat_len, + Context::h_feats + (i + 1) * Context::feat_len, &Context::h_feats_subg[count * Context::feat_len]); count++; } @@ -112,7 +114,7 @@ void Context::add_selfloop(Graph& og, Graph& g) { // TODO not actually implemented yet g.allocateFrom(og.size(), og.size() + og.sizeEdges()); g.constructNodes(); - //for (size_t src = 0; src < og.size(); src++) { + // for (size_t src = 0; src < og.size(); src++) { // //g.getData(src) = 1; // auto begin = og.edge_begin(src); // auto end = og.edge_end(src); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index f7ad18bc22..ee47917347 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} void DistContext::saveGraph(DGraph* dGraph) { - graph_cpu = dGraph; + graph_cpu = dGraph; localVertices = graph_cpu->size(); } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index a986ec194d..f7882d1209 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -11,8 +11,8 @@ namespace deepgalois { #ifdef GALOIS_USE_DIST void Net::partitionInit(DGraph* graph, std::string dataset_str) { - this->dGraph = graph; - this->distContext = new deepgalois::DistContext(); + this->dGraph = graph; + this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); this->distNumSamples = this->dGraph->size(); @@ -25,16 +25,17 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; this->distValMasks = new mask_t[this->distNumSamples]; - std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, 0); + std::fill(this->distTrainMasks, this->distTrainMasks + this->distNumSamples, + 0); std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); if (dataset_str == "reddit") { - //this->globalTrainBegin = 0; - //this->globalTrainCount = 153431; - //this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - //this->globalValBegin = 153431; - //this->globalValCount = 23831; - //this->globalValEnd = this->globalValBegin + this->globalValCount; + // this->globalTrainBegin = 0; + // this->globalTrainCount = 153431; + // this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + // this->globalValBegin = 153431; + // this->globalValCount = 23831; + // this->globalValEnd = this->globalValBegin + this->globalValCount; // find local ID from global ID, set if it exists for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { @@ -48,13 +49,16 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } } } else { - globalTrainCount = this->distContext->read_masks("train", this->distNumSamples, globalTrainBegin, - globalTrainEnd, this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks("val", this->distNumSamples, globalValBegin, globalValEnd, - this->distValMasks, this->dGraph); + globalTrainCount = this->distContext->read_masks( + "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, + this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + "val", this->distNumSamples, globalValBegin, globalValEnd, + this->distValMasks, this->dGraph); } - feature_dims[0] = this->distContext->read_features(); // input feature dimension: D + feature_dims[0] = + this->distContext->read_features(); // input feature dimension: D for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index aa6fb6d686..dbf54a7b4b 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -16,7 +16,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { this->count_ = count; this->masks_ = masks; // save original graph - Sampler::graph = g; + Sampler::graph = g; // allocate the object for the new masked graph Sampler::masked_graph = new Graph(); @@ -54,9 +54,9 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - //size_t idx = 0; - //vertices_.resize(count); - //for (size_t i = begin; i < end; i++) { + // size_t idx = 0; + // vertices_.resize(count); + // for (size_t i = begin; i < end; i++) { // if (masks_[i] == 1) // vertices_[idx++] = i; //} @@ -88,8 +88,7 @@ void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, } //! 
returns a graph in the variable sub: it is g with the mask applied -void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, - Graph& sub) { +void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { std::vector degrees(n, 0); this->get_masked_degrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); @@ -127,10 +126,9 @@ void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, #endif } - // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size) { + std::vector& DB2, size_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -222,7 +220,8 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { auto degree = getDegree(Sampler::masked_graph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::masked_graph->getEdgeDst(Sampler::masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::masked_graph->getEdgeDst( + Sampler::masked_graph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; @@ -363,7 +362,8 @@ inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); @@ -379,7 +379,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& #ifndef GALOIS_USE_DIST reindexGraph.allocateFrom(nv, ne); reindexGraph.constructNodes(); - VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping + VertexList old_ids(keptVertices.begin(), + keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN galois::do_all( galois::iterate((size_t)0, nv), @@ -390,7 +391,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph.fixEndEdge(i, offsets[i + 1]); unsigned j = 0; auto old_id = old_ids[i]; - for (auto e = origGraph.edge_begin(old_id); e != origGraph.edge_end(old_id); e++) { + for (auto e = origGraph.edge_begin(old_id); + e != origGraph.edge_end(old_id); e++) { auto dst = new_ids[origGraph.getEdgeDst(e)]; assert(dst < nv); reindexGraph.constructEdge(offsets[i] + j, dst, 0); @@ -417,7 +419,8 @@ void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, getMasks(Sampler::graph->size(), sampledSet, masks); Graph masked_sg; - this->getMaskedGraph(Sampler::graph->size(), masks, Sampler::masked_graph, + this->getMaskedGraph( + Sampler::graph->size(), masks, Sampler::masked_graph, masked_sg); // remove edges whose destination is not masked this->reindexSubgraph(sampledSet, masked_sg, sg); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 06d854d4b7..8b5ab8100f 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -691,8 +691,8 @@ void d_softmax_cross_entropy_gpu(int len, int begin, int end, // float_t *grad; // float_malloc_device((end-begin)*len, grad); // d_cross_entropy_kernel<<>>( 
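subgraph_sample above chains the pieces: pick a vertex set, turn it into masks, drop edges to unmasked vertices with getMaskedGraph, then reindexSubgraph renumbers the kept vertices to 0..nv-1 and rewrites every surviving edge through that old-to-new mapping. A compact sketch of the renumbering step with STL containers (illustrative names, not the Sampler API):

#include <cstdint>
#include <set>
#include <unordered_map>
#include <vector>

// Renumber a sampled vertex set to 0..nv-1 and rewrite its induced edges.
// adj is the (already masked) adjacency of the original graph. Illustrative.
std::vector<std::vector<uint32_t>>
compactSubgraph(const std::vector<std::vector<uint32_t>>& adj,
                const std::set<uint32_t>& kept) {
  std::vector<uint32_t> oldID(kept.begin(), kept.end()); // new -> old
  std::unordered_map<uint32_t, uint32_t> newID;          // old -> new
  for (size_t i = 0; i < oldID.size(); ++i)
    newID[oldID[i]] = static_cast<uint32_t>(i);
  std::vector<std::vector<uint32_t>> sub(oldID.size());
  for (size_t i = 0; i < oldID.size(); ++i)
    for (uint32_t dst : adj[oldID[i]]) {
      auto it = newID.find(dst);
      if (it != newID.end())    // keep an edge only if its endpoint was sampled
        sub[i].push_back(it->second);
    }
  return sub;
}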
d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, - // BLOCK_SIZE>>>( + // CUDA_NUM_THREADS>>>( + // d_cross_entropy_warp<<<(end-begin-1)/WARPS_PER_BLOCK+1, BLOCK_SIZE>>>( // len, begin, end, masks, labels, out, grad); // CudaTest("solving d_cross_entropy kernel failed"); // d_softmax_kernel<<>>( From 596dbb050dee39c5a645cc5ad4093dcf67d9d669 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 18:41:36 -0500 Subject: [PATCH 274/660] context/net cleanup, reorg --- libdeepgalois/include/deepgalois/Context.h | 134 +++++++++--------- .../include/deepgalois/DistContext.h | 3 - libdeepgalois/include/deepgalois/Net.h | 58 +++----- libdeepgalois/src/Context.cpp | 72 +--------- libdeepgalois/src/DistContext.cpp | 5 - libdeepgalois/src/Net.cpp | 21 +-- lonestar/gnn/gcn/gcn.cpp | 10 +- 7 files changed, 103 insertions(+), 200 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 519a75d7f3..41751badcf 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -16,6 +16,69 @@ namespace deepgalois { class Context { + std::string dataset; + bool is_device; // is this on device or host + size_t n; // number of samples: N + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + bool is_single_class; // single-class (one-hot) or multi-class label + bool is_selfloop_added; // whether selfloop is added to the input graph + bool use_subgraph; // whether to use subgraph + label_t* h_labels; // labels for classification. Single-class label: Nx1, + // multi-class label: NxE + float_t* h_feats; // input features: N x D + // label_t *h_labels_subg; // labels for subgraph + // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device + float_t* norm_factors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector norm_factors_subg; // normalization constant for subgraph + // float_t* norm_factors_subg; // normalization constant for subgraph + Reader reader; + + void alloc_norm_factor(); + void alloc_subgraph_norm_factor(int subg_id); + +#ifndef __GALOIS_HET_CUDA__ + Graph* graph_cpu; // the input graph, |V| = N + std::vector subgraphs_cpu; + void add_selfloop(Graph& og, Graph& g); + //! 
returns pointer to the graph + Graph* getGraphPointer() { return graph_cpu; } + Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } +#else + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU + + GraphGPU graph_gpu; // the input graph, |V| = N + std::vector subgraphs_gpu; + GraphGPU* getGraphPointer() { return &graph_gpu; } + GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } +#endif + public: Context(); //! initializer for gpu; goes ahead and sets a few things @@ -43,6 +106,7 @@ class Context { label_t get_label(size_t i) { return h_labels[i]; } // single-class (one-hot) label + // label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } // // multi-class label float_t* get_norm_factors_ptr() { return norm_factors; } @@ -52,6 +116,7 @@ class Context { dataset = dataset_str; reader.init(dataset); } + void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features @@ -60,75 +125,6 @@ class Context { void gen_subgraph_feats(size_t m, const mask_t* masks); //! Allocate subgraphs (but don't actually do sampling yet) void allocateSubgraphs(int num_subgraphs); - -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; // the input graph, |V| = N - std::vector subgraphs_cpu; - void add_selfloop(Graph& og, Graph& g); - //! 
returns pointer to the graph - Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } -#else - GraphGPU graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - GraphGPU* getGraphPointer() { return &graph_gpu; } - GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; - float_t* get_feats_ptr() { return d_feats; } - float_t* get_feats_subg_ptr() { return d_feats_subg; } - label_t* get_labels_ptr() { return d_labels; } - label_t* get_labels_subg_ptr() { return d_labels_subg; } - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { - return cusparse_matdescr_; - } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } -#endif - -protected: - std::string dataset; - bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label - bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D - // label_t *h_labels_subg; // labels for subgraph - // float_t* h_feats_subg; // input features for subgraph - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - // float_t* norm_factors_subg; // normalization constant for subgraph - Reader reader; - - void alloc_norm_factor(); - void alloc_subgraph_norm_factor(int subg_id); - -#ifndef __GALOIS_HET_CUDA__ - void read_edgelist(const char* filename, bool symmetrize = false, - bool add_self_loop = false); -#else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE - static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU -#endif }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index e1b76fa00c..571a873e83 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -10,7 +10,6 @@ namespace deepgalois { class DistContext { - size_t localVertices; // number of samples: N size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; @@ -27,8 +26,6 @@ class DistContext { DistContext(); ~DistContext(); - //! save graph pointer to context object - void saveDistGraph(Graph* dGraph); //! 
read labels of local nodes only size_t read_labels(std::string dataset_str); //! read features of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 195c524a2d..7547252b86 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -34,7 +34,7 @@ class Net { unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads - size_t num_samples; // number of samples: N + size_t globalSamples; // number of samples: N size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers @@ -107,31 +107,31 @@ class Net { num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, ", dropout_rate ", dropout_rate, ", weight_decay ", weight_decay, "\n"); - num_layers = num_conv_layers + 1; + this->num_layers = num_conv_layers + 1; // additional layers to add if (has_l2norm) - num_layers++; + this->num_layers++; if (has_dense) - num_layers++; - + this->num_layers++; // initialize feature metadata feature_dims.resize(num_layers + 1); - // initialze context + // initialze global graph context context = new deepgalois::Context(); context->set_dataset(dataset_str); - // read graph, get num nodes - num_samples = context->read_graph(selfloop); + // read *entire* graph, get num nodes + globalSamples = context->read_graph(selfloop); context->set_label_class(is_single_class); // read ground truth labels num_classes = context->read_labels(); - // get training and validation sets - globalTrainMasks = new mask_t[num_samples]; - globalValMasks = new mask_t[num_samples]; - std::fill(globalTrainMasks, globalTrainMasks + num_samples, 0); - std::fill(globalValMasks, globalValMasks + num_samples, 0); + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); // reddit is hard coded if (dataset_str == "reddit") { @@ -149,9 +149,9 @@ class Net { globalValMasks[i] = 1; } else { globalTrainCount = - context->read_masks("train", num_samples, globalTrainBegin, + context->read_masks("train", globalSamples, globalTrainBegin, globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", num_samples, globalValBegin, + globalValCount = context->read_masks("val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } @@ -161,23 +161,8 @@ class Net { "set\n"); } - // read features of vertices - feature_dims[0] = context->read_features(); // input feature dimension: D - - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 - - feature_dims[num_conv_layers] = num_classes; // output embedding: E - - if (has_l2norm) - feature_dims[num_conv_layers + 1] = - num_classes; // l2 normalized embedding: E - - if (has_dense) - feature_dims[num_layers - 1] = num_classes; // MLP embedding: E - - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); + // features are read in distcontext, not this context (this context only + // used for sampling) // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); @@ -187,7 +172,7 @@ class 
Net { // Net() // : is_single_class(true), has_l2norm(false), has_dense(false), // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - // num_samples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), // globalTrainCount(0), globalValBegin(0), globalValEnd(0), @@ -201,13 +186,10 @@ class Net { size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } - size_t get_nnodes() { return num_samples; } - void normalize(); // Scale gradient to counterbalance accumulation void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { - double total_train_time = 0.0; int num_subg_remain = 0; @@ -416,10 +398,10 @@ class Net { } else { #ifndef GALOIS_USE_DIST globalTestCount = context->read_masks( - "test", num_samples, globalTestBegin, globalTestEnd, test_masks); + "test", globalSamples, globalTestBegin, globalTestEnd, test_masks); #else globalTestCount = - context->read_masks("test", num_samples, globalTestBegin, + context->read_masks("test", globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); #endif } diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 2fbf8e6617..b44331fe1f 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -77,11 +77,7 @@ size_t Context::read_graph(bool selfloop) { std::string filetype = "gr"; galois::StatTimer Tread("GraphReadingTime"); Tread.start(); - if (filetype == "el") { - filename = path + dataset + ".el"; - printf("Reading .el file: %s\n", filename.c_str()); - read_edgelist(filename.c_str(), true); // symmetrize - } else if (filetype == "bin") { + if (filetype == "bin") { graph_cpu->readGraph(dataset); } else if (filetype == "gr") { graph_cpu = new Graph(); @@ -209,70 +205,4 @@ void Context::norm_factor_computing(bool is_subgraph, int subg_id) { #endif } -void Context::read_edgelist(const char* filename, bool symmetrize, - bool add_self_loop) { - std::ifstream in; - std::string line; - in.open(filename, std::ios::in); - size_t m, n; - in >> m >> n >> std::ws; - size_t num_vertices_ = m; - size_t num_edges_ = 0; - std::cout << "num_vertices " << num_vertices_ << "\n"; - std::vector> vertices(m); - for (size_t i = 0; i < n; i++) { - std::set neighbors; - if (add_self_loop) - neighbors.insert(i); - vertices.push_back(neighbors); - } - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - VertexID u, v; - edge_stream >> u; - edge_stream >> v; - vertices[u].insert(v); - if (symmetrize) - vertices[v].insert(u); - } - in.close(); - for (size_t i = 0; i < n; i++) - num_edges_ += vertices[i].size(); - std::cout << "num_edges " << num_edges_ << "\n"; - - std::vector degrees; - degrees.resize(num_vertices_); - std::fill(degrees.begin(), degrees.end(), 0); - for (size_t i = 0; i < num_vertices_; i++) - degrees[i] = vertices[i].size(); - std::vector offsets(degrees.size() + 1); - uint32_t total = 0; - for (size_t n = 0; n < degrees.size(); n++) { - offsets[n] = total; - total += degrees[n]; - } - offsets[degrees.size()] = total; - degrees.clear(); - assert(num_edges_ == offsets[num_vertices_]); - EdgeID* colidx_ = new EdgeID[num_edges_]; - VertexID* rowptr_ = new VertexID[num_vertices_ + 1]; - for (size_t i = 0; i < num_vertices_ + 1; i++) - rowptr_[i] = offsets[i]; - 
for (size_t i = 0; i < num_vertices_; i++) { - for (auto dst : vertices[i]) - colidx_[offsets[i]++] = dst; - } - - auto g = getGraphPointer(); - g->allocateFrom(num_vertices_, num_edges_); - g->constructNodes(); - for (size_t i = 0; i < num_vertices_; i++) { - auto row_begin = rowptr_[i]; - auto row_end = rowptr_[i + 1]; - g->fixEndEdge(i, row_end); - for (auto offset = row_begin; offset < row_end; offset++) - g->constructEdge(offset, colidx_[offset], 0); - } -} - } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index ee47917347..21236ef638 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,11 +6,6 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -void DistContext::saveGraph(DGraph* dGraph) { - graph_cpu = dGraph; - localVertices = graph_cpu->size(); -} - size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { Graph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index f7882d1209..4ba6c23fe1 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -57,16 +57,20 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { this->distValMasks, this->dGraph); } - feature_dims[0] = - this->distContext->read_features(); // input feature dimension: D + // input feature dimension: D + feature_dims[0] = this->distContext->read_features(); for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = hidden1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E - if (has_l2norm) - feature_dims[num_conv_layers + 1] = - num_classes; // l2 normalized embedding: E - if (has_dense) - feature_dims[num_layers - 1] = num_classes; // MLP embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } @@ -87,9 +91,6 @@ void Net::regularize() { layers[layer_id]->get_grads_ptr()); } -// Scale gradient to counterbalance accumulation -void Net::normalize() {} - /** * * @param begin GLOBAL begin diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 702fc63516..0a47913a0f 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -17,17 +17,19 @@ int main(int argc, char** argv) { std::vector dummyVec; deepgalois::Graph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); - // initialize entire on CPU + // initialize network + whole context on CPU + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); + // initialize distributed context + network.partitionInit(dGraph, dataset); - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user + // construct layers from distributed context network.construct_layers(); network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak 
memory usage From 42317772c26a7316e56b4fce5f10aae46cf1164e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:29:04 -0500 Subject: [PATCH 275/660] library builds: mostly distcinting between DGraph and Graph --- libdeepgalois/include/deepgalois/Context.h | 4 +- .../include/deepgalois/DistContext.h | 21 ++++++---- libdeepgalois/include/deepgalois/GraphTypes.h | 6 +-- libdeepgalois/include/deepgalois/Net.h | 42 +++++++++---------- libdeepgalois/include/deepgalois/Sampler.h | 4 -- .../include/deepgalois/layers/aggregator.h | 1 + .../include/deepgalois/layers/layer.h | 16 ++----- libdeepgalois/src/DistContext.cpp | 10 ++--- libdeepgalois/src/Net.cpp | 10 ++--- libdeepgalois/src/Sampler.cpp | 5 +-- libdeepgalois/src/layers/aggregator.cpp | 8 +--- libdeepgalois/src/layers/graph_conv_layer.cpp | 7 +--- lonestar/gnn/include/lonestargnn.h | 5 +-- 13 files changed, 59 insertions(+), 80 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 41751badcf..373d91d463 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -43,6 +43,9 @@ class Context { void alloc_norm_factor(); void alloc_subgraph_norm_factor(int subg_id); + +public: +// TODO separate below to public and private #ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; @@ -79,7 +82,6 @@ class Context { } #endif -public: Context(); //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 571a873e83..7f1c4fb1ea 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -12,9 +12,9 @@ namespace deepgalois { class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - galois::graphs::GluonSubstrate* syncSubstrate; + galois::graphs::GluonSubstrate* syncSubstrate; - Graph* graph_cpu; // the input graph, |V| = N + DGraph* graph_cpu; // the input graph, |V| = N std::vector subgraphs_cpu; label_t* h_labels; // labels for classification. Single-class label: Nx1, // multi-class label: NxE @@ -22,17 +22,23 @@ class DistContext { float_t* h_feats; // input features: N x D float_t* h_feats_subg; // input features for subgraph + // TODO needs to come from whole graph + float_t* norm_factors; // normalization constant based on graph structure + std::vector norm_factors_subg; // normalization constant for subgraph + public: DistContext(); ~DistContext(); + void saveDistGraph(DGraph* a) { graph_cpu = a; } + //! read labels of local nodes only - size_t read_labels(std::string dataset_str); + size_t read_labels(DGraph* dGraph, std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! 
read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, - size_t& begin, size_t& end, mask_t* masks, Graph* dGraph); + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); // TODO define these void createSubgraphs(int) {} @@ -40,16 +46,17 @@ class DistContext { void gen_subgraph_feats(size_t, const mask_t*) {} float_t* get_norm_factors_ptr() { return norm_factors; } - Graph* getGraphPointer() { return graph_cpu; } + // TODO shouldn't return 0 always + float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } + DGraph* getGraphPointer() { return graph_cpu; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg; } - float_t* get_norm_factors_subg_ptr() { return norm_factors_subg; } void initializeSyncSubstrate(); - galois::graphs::GluonSubstrate* getSyncSubstrate(); + galois::graphs::GluonSubstrate* getSyncSubstrate(); //! return label for some node //! NOTE: this is LID, not GID diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index ba241c53f5..3a93565724 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -10,7 +10,7 @@ #endif namespace deepgalois { -using index_t = edge_iterator; -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using edge_iterator = index_t; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7547252b86..0fb0dfefdb 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -15,7 +15,6 @@ #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" -#endif namespace deepgalois { @@ -40,6 +39,7 @@ class Net { size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) int num_epochs; // number of epochs + unsigned h1; // hidden layer size float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting @@ -92,6 +92,7 @@ class Net { : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host @@ -102,7 +103,7 @@ class Net { assert(n_conv > 0); // TODO use galois print - galois >> gPrint(header, "Configuration: num_threads ", num_threads, + galois::gPrint(header, "Configuration: num_threads ", num_threads, ", num_conv_layers ", num_conv_layers, ", num_epochs ", num_epochs, ", hidden1 ", hidden1, ", learning_rate ", learning_rate, ", dropout_rate ", dropout_rate, @@ -181,6 +182,7 @@ class Net { // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), // test_masks(NULL), context(NULL) {} + void init(); //! 
Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str); @@ -195,8 +197,8 @@ class Net { if (subgraph_sample_size) { context->allocateSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[num_samples * num_subgraphs]; - galois::gPrint(header, " Construct training vertex set induced graph...\n"; + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + galois::gPrint(header, " Construct training vertex set induced graph...\n"); sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } @@ -222,7 +224,7 @@ class Net { // tid = galois::substrate::ThreadPool::getTID(); sampler->subgraph_sample(subgraph_sample_size, *(context->getSubgraphPointer(sid)), - &subgraphs_masks[sid * num_samples], tid); + &subgraphs_masks[sid * globalSamples], tid); } //, galois::loopname("subgraph_gen")); #endif #endif @@ -253,12 +255,12 @@ class Net { } // update labels for subgraph context->gen_subgraph_labels(num_vertices_sg, - &subgraphs_masks[sg_id * num_samples]); + &subgraphs_masks[sg_id * globalSamples]); layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); // update features for subgraph context->gen_subgraph_feats(num_vertices_sg, - &subgraphs_masks[sg_id * num_samples]); + &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr( context->get_feats_subg_ptr()); // feed input data } @@ -343,7 +345,7 @@ class Net { if (subgraph_sample_size && type != "train") { // switch to the original graph for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_samples); + layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(context->getGraphPointer()); layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); @@ -380,7 +382,7 @@ class Net { // read masks of test set void read_test_masks(std::string dataset) { - test_masks = new mask_t[num_samples]; + test_masks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; @@ -396,14 +398,9 @@ class Net { } #endif } else { -#ifndef GALOIS_USE_DIST - globalTestCount = context->read_masks( - "test", globalSamples, globalTestBegin, globalTestEnd, test_masks); -#else globalTestCount = - context->read_masks("test", globalSamples, globalTestBegin, + distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); -#endif } #ifndef CPU_ONLY copy_test_masks_to_device(); @@ -443,8 +440,8 @@ class Net { void append_l2norm_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); @@ -454,8 +451,8 @@ class Net { void append_dense_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = num_samples; - in_dims[0] = num_samples; + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); @@ -465,7 +462,7 @@ class Net { void append_out_layer(size_t layer_id) { assert(layer_id > 0); // can not be the first layer std::vector in_dims(2), out_dims(2); - in_dims[0] = 
out_dims[0] = num_samples; + in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); if (is_single_class) @@ -481,7 +478,7 @@ class Net { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = num_samples; + in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, @@ -491,7 +488,6 @@ class Net { // update trainable weights after back-propagation void update_weights(optimizer* opt) { - normalize(); regularize(); for (size_t i = 0; i < num_layers; i++) { if (layers[i]->trainable()) { @@ -528,7 +524,7 @@ class Net { //! Save the context object to all layers of the network void set_contexts() { for (size_t i = 0; i < num_layers; i++) - layers[i]->set_context(context); + layers[i]->set_context(distContext); } //! set netphases for all layers in this network void set_netphases(net_phase phase) { diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 578bb6abf7..9c4ea06f12 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -1,5 +1,3 @@ -#ifndef GALOIS_USE_DIST - #pragma once #include "deepgalois/GraphTypes.h" @@ -71,5 +69,3 @@ class Sampler { }; } // namespace deepgalois - -#endif diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index cc6e22db00..997f006de8 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -5,6 +5,7 @@ #ifdef CPU_ONLY #include "deepgalois/GraphTypes.h" namespace deepgalois { +// TODO template arg void update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor); void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index a1c2ef630a..5ad33ae25a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -9,19 +9,16 @@ * Reused/revised under 3-BSD */ #include -#include "deepgalois/gtypes.h" -#ifndef GALOIS_USE_DIST +#include "deepgalois/GraphTypes.h" #include "deepgalois/Context.h" -#else + #include "deepgalois/DistContext.h" -#endif #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" -#ifdef GALOIS_USE_DIST + #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" -#endif namespace deepgalois { @@ -40,11 +37,7 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: -#ifndef GALOIS_USE_DIST - using ContextType = deepgalois::Context; -#else using ContextType = deepgalois::DistContext; -#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -179,17 +172,16 @@ class layer : public deepgalois::node { ContextType* context; label_t* labels; float_t* norm_consts; +// TODO #ifdef CPU_ONLY Graph* graph_cpu; #else GraphGPU* graph_gpu; #endif -#ifdef GALOIS_USE_DIST // Used for synchronization of weight gradients deepgalois::GluonGradients* gradientGraph; galois::graphs::GluonSubstrate* syncSub; -#endif }; //! 
Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 21236ef638..1b8fef711c 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { - Graph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -58,7 +58,7 @@ size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - Graph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); @@ -101,7 +101,7 @@ size_t DistContext::read_features(std::string dataset_str) { size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, - mask_t* masks, Graph* dGraph) { + mask_t* masks, DGraph* dGraph) { bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -146,12 +146,12 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { - DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( + DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } -galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { +galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 4ba6c23fe1..f7ff51bb2e 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -20,7 +20,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(); + num_classes = this->distContext->read_labels(graph, dataset_str); // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; @@ -49,18 +49,18 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } } } else { - globalTrainCount = this->distContext->read_masks( + globalTrainCount = this->distContext->read_masks(dataset_str, "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks( + globalValCount = this->distContext->read_masks(dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D - feature_dims[0] = this->distContext->read_features(); + feature_dims[0] = this->distContext->read_features(dataset_str); for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = hidden1; // hidden1 level embedding: 16 + feature_dims[i] = this->h1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index dbf54a7b4b..6a84a8de76 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -123,7 +123,6 @@ void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { , galois::loopname("gen_subgraph")); #endif -#endif } // helper function for graph saint implementation below @@ -183,7 +182,7 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { for (int i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); db_t v = IA3[i] = Sampler::node_train[rand_idx]; - st.iisert(v); + st.insert(v); IA0[i] = getDegree(Sampler::masked_graph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; @@ -376,7 +375,6 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, auto ne = offsets[nv]; // galois::gPrint("Generate subgraph: num_vertices=", nv, ", num_edges=", ne, // "\n"); -#ifndef GALOIS_USE_DIST reindexGraph.allocateFrom(nv, ne); reindexGraph.constructNodes(); VertexList old_ids(keptVertices.begin(), @@ -403,7 +401,6 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, , galois::loopname("construct_graph")); #endif -#endif } void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 9c3454d550..4e07ca96cf 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -2,20 +2,14 @@ #include "deepgalois/math_functions.hh" #include "galois/Galois.h" +// TODO template arg void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { // std::cout << "[update_all] graph size: " << n << "\n"; -#ifndef GALOIS_USE_DIST size_t n = g.size(); galois::do_all( galois::iterate(size_t(0), n), [&](const auto src) { -#else - auto& rangeObj = g.allNodesRange(); - galois::do_all( - galois::iterate(rangeObj), - [&](const auto src) { -#endif auto src_idx = src * len; // zero out the output data math::clear_cpu(len, &out[src_idx]); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d50f7bfb0a..97facbcd83 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -66,7 +66,7 @@ void graph_conv_layer::malloc_and_init() { size_t x = input_dims[0]; size_t y = input_dims[1]; 
size_t z = output_dims[1]; -#ifdef GALOIS_USE_DIST + // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); @@ -74,14 +74,9 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); -#endif -#ifdef GALOIS_USE_DIST // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); -#else - rand_init_matrix(y, z, W); -#endif // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index d0255b9368..21e73cb024 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -10,10 +10,9 @@ #include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include -#ifdef GALOIS_USE_DIST + #include "galois/DistGalois.h" #include "galois/runtime/Network.h" -#endif namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, @@ -109,4 +108,4 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("(NULL)", "Hostname", name); } -#include "deepgalois/net.h" +#include "deepgalois/Net.h" From 4e0d315974b1f94de107fcf77327b744547175e3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:48:46 -0500 Subject: [PATCH 276/660] distcontext back as a source file --- libdeepgalois/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 58309084b1..2f05527318 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -83,6 +83,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/math_functions.cpp src/optimizer.cpp src/Context.cpp + src/DistContext.cpp src/Sampler.cpp src/reader.cpp src/lgraph.cpp From 8fd535f261bf5a29b888a779e3f83b38e3d36c42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 May 2020 19:49:33 -0500 Subject: [PATCH 277/660] het cuda defs, signature change for dist read labels --- libdeepgalois/include/deepgalois/DistContext.h | 2 +- libdeepgalois/include/deepgalois/Net.h | 6 +++--- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 8 ++++---- libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 18 +++++++++--------- libdeepgalois/src/DistContext.cpp | 2 +- libdeepgalois/src/Net.cpp | 4 ++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 +- lonestar/gnn/gcn/gcn.cpp | 2 +- 11 files changed, 25 insertions(+), 25 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 7f1c4fb1ea..212a29a287 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -33,7 +33,7 @@ class DistContext { void saveDistGraph(DGraph* a) { graph_cpu = a; } //! read labels of local nodes only - size_t read_labels(DGraph* dGraph, std::string dataset_str); + size_t read_labels(std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! 
read masks of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 0fb0dfefdb..548b01e79a 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -215,7 +215,7 @@ class Net { Timer t_subgen; t_subgen.Start(); // generate subgraphs -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ #ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { // galois::do_all(galois::iterate(size_t(0), @@ -341,7 +341,7 @@ class Net { count = globalTestCount; masks = test_masks; } -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ if (subgraph_sample_size && type != "train") { // switch to the original graph for (size_t i = 0; i < num_layers; i++) @@ -402,7 +402,7 @@ class Net { distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); } -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); #endif } diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 997f006de8..142812c6ba 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,7 +2,7 @@ #include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! together (using norm_factor if specified) -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ #include "deepgalois/GraphTypes.h" namespace deepgalois { // TODO template arg diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 09d4233c27..216b7e1935 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -53,7 +53,7 @@ class graph_conv_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5ad33ae25a..3a33d54440 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -72,7 +72,7 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } @@ -97,7 +97,7 @@ class layer : public deepgalois::node { use_mask = false; if (masks != NULL) { use_mask = true; -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ masks_ = masks; #else d_masks_ = masks; @@ -135,7 +135,7 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
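// Editorial aside -- a hedged sketch, not part of this diff: update_weight()
// here forwards the accumulated weight gradient to the optimizer, whose CPU
// interface (see optimizer.h below) is update(const vec_t& dW, vec_t& W).
// For the plain gradient_descent optimizer with learning rate alpha and
// weight decay lambda, the step plausibly looks like the following (the real
// body lives in optimizer.cpp and may differ; names here are illustrative):
#include <cstddef>
#include <vector>
inline void sgd_step(const std::vector<float>& dW, std::vector<float>& W,
                     float alpha, float lambda) {
  for (std::size_t i = 0; i < W.size(); i++)
    W[i] -= alpha * (dW[i] + lambda * W[i]); // L2 weight decay folded in
}
// End of editorial aside.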
// bool parallel = (W.size() >= 512); @@ -173,7 +173,7 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ Graph* graph_cpu; #else GraphGPU* graph_gpu; diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 53382199f4..0c06a926cb 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -119,7 +119,7 @@ class LearningGraph { index_t* row_start_host_ptr() { return &rowptr_[0]; } index_t* edge_dst_host_ptr() { return &colidx_[0]; } -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index aa0dcbaab7..3a0139418e 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; #endif virtual void reset() {} // override to implement pre-learning action @@ -53,7 +53,7 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ template float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; @@ -70,7 +70,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -87,7 +87,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -103,7 +103,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -128,7 +128,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -146,7 +146,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -164,7 +164,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : 
alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -185,7 +185,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifndef CPU_ONLY +#ifdef __GALOIS_HET_CUDA__ void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 1b8fef711c..8576082c7b 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,7 +6,7 @@ namespace deepgalois { DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::read_labels(DGraph& dGraph, std::string dataset_str) { +size_t DistContext::read_labels(std::string dataset_str) { DGraph* dGraph = DistContext::graph_cpu; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index f7ff51bb2e..800d550048 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -20,7 +20,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(graph, dataset_str); + num_classes = this->distContext->read_labels(dataset_str); // std::cout << "Reading label masks ... "; this->distTrainMasks = new mask_t[this->distNumSamples]; @@ -76,7 +76,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { } #endif -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ void Net::init() { if (subgraph_sample_size) sampler = new deepgalois::Sampler(); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 97facbcd83..00ca3f30e6 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,7 +10,7 @@ float_t* _dataToSync = nullptr; //! 
sync long unsigned _syncVectorSize = 0; -#ifdef CPU_ONLY +#ifndef __GALOIS_HET_CUDA__ inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index 0a47913a0f..d9219438ae 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -15,7 +15,7 @@ int main(int argc, char** argv) { // Get a partitioned graph first std::vector dummyVec; - deepgalois::Graph* dGraph = + deepgalois::DGraph* dGraph = galois::graphs::constructSymmetricGraph(dummyVec); // initialize network + whole context on CPU From 425d4944d7c516b940a82329e3c9a535ceb2bb1f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 13:34:45 -0500 Subject: [PATCH 278/660] single process now runs (incorrectly probably + no sampling) --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/Net.h | 81 +++++++++++-------- libdeepgalois/src/Context.cpp | 4 +- libdeepgalois/src/DistContext.cpp | 27 +++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 15 ++-- 5 files changed, 90 insertions(+), 45 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 212a29a287..0ffb2e1b0c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -5,6 +5,7 @@ */ #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/types.h" +#include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" namespace deepgalois { @@ -23,7 +24,7 @@ class DistContext { float_t* h_feats_subg; // input features for subgraph // TODO needs to come from whole graph - float_t* norm_factors; // normalization constant based on graph structure + float_t* normFactors; // normalization constant based on graph structure std::vector norm_factors_subg; // normalization constant for subgraph public: @@ -45,7 +46,10 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - float_t* get_norm_factors_ptr() { return norm_factors; } + void constructNormFactor(deepgalois::Context* globalContext, bool is_subgraph, + int subg_id = 0); + + float_t* get_norm_factors_ptr() { return normFactors; } // TODO shouldn't return 0 always float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } DGraph* getGraphPointer() { return graph_cpu; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 548b01e79a..e50b081bd7 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -202,21 +202,23 @@ class Net { sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); } - std::cout << "\nStart training...\n"; + galois::gPrint(header, "Start training...\n"); Timer t_epoch; + // run epochs for (int ep = 0; ep < num_epochs; ep++) { t_epoch.Start(); +//////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { std::cout << "Generating " << num_subgraphs << " subgraphs "; Timer t_subgen; t_subgen.Start(); + // generate subgraphs #ifndef __GALOIS_HET_CUDA__ -#ifndef GALOIS_USE_DIST for (int sid = 0; sid < num_subgraphs; sid++) { // galois::do_all(galois::iterate(size_t(0), // size_t(num_subgraphs)),[&](const auto sid) { @@ -226,20 +228,18 @@ class Net { *(context->getSubgraphPointer(sid)), 
&subgraphs_masks[sid * globalSamples], tid); } //, galois::loopname("subgraph_gen")); -#endif #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } -#ifndef GALOIS_USE_DIST for (int i = 0; i < num_subgraphs; i++) { auto sg_ptr = context->getSubgraphPointer(i); sg_ptr->degree_counting(); // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " // num_e ", sg_ptr->sizeEdges(), "\n"); } -#endif // GALOIS_USE_DIST + num_subg_remain--; int sg_id = num_subg_remain; auto subgraph_ptr = context->getSubgraphPointer(sg_id); @@ -263,10 +263,11 @@ class Net { &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr( context->get_feats_subg_ptr()); // feed input data - } + } // end subgraph sample loop +//////////////////////////////////////////////////////////////////////////////// // training steps - std::cout << header << "Epoch " << std::setw(3) << ep << seperator; + galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -274,6 +275,8 @@ class Net { // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); + galois::gPrint(header, "Back prop\n"); + // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients @@ -285,11 +288,15 @@ class Net { // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << seperator; + + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, seperator); + t_epoch.Stop(); + double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; + if (need_validate && ep % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; @@ -304,20 +311,22 @@ class Net { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; } - } + } // epoch loop + double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << "\nAverage training time: " << avg_train_time - << " ms. Throughput: " << throughput << " epoch/s\n"; + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. inference or predict double evaluate(std::string type, acc_t& loss, acc_t& acc) { - // TODO may need to do something for the dist case Timer t_eval; t_eval.Start(); size_t begin = 0, end = 0, count = 0; mask_t* masks = NULL; + + // TODO global here good for dist case? 
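// Editorial aside -- a hedged sketch, not part of this diff: the branches
// below only pick which global range [begin, end) and which mask array this
// evaluation pass covers; scoring then counts just the vertices whose mask is
// set. A self-contained sketch of such a masked accuracy for single-class
// labels (argmax per row; every name here is hypothetical):
#include <algorithm>
#include <cstddef>
#include <cstdint>
inline double masked_accuracy_sketch(std::size_t begin, std::size_t end,
                                     std::size_t count,
                                     const std::uint8_t* masks,
                                     const float* preds,
                                     const std::uint8_t* labels,
                                     std::size_t num_classes) {
  std::size_t correct = 0;
  for (std::size_t v = begin; v < end; v++) {
    if (!masks[v])
      continue; // only vertices in the chosen train/val/test mask are scored
    const float* row = preds + v * num_classes;
    std::size_t guess = std::max_element(row, row + num_classes) - row;
    if (guess == labels[v])
      correct++;
  }
  return count ? double(correct) / double(count) : 0.0;
}
// End of editorial aside.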
if (type == "train") { begin = globalTrainBegin; end = globalTrainEnd; @@ -341,9 +350,10 @@ class Net { count = globalTestCount; masks = test_masks; } + #ifndef __GALOIS_HET_CUDA__ - if (subgraph_sample_size && - type != "train") { // switch to the original graph + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { @@ -362,6 +372,7 @@ class Net { masks = d_test_masks; } #endif + loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); label_t* labels; @@ -387,16 +398,11 @@ class Net { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; -#ifndef GALOIS_USE_DIST - for (size_t i = globalTestBegin; i < globalTestEnd; i++) - test_masks[i] = 1; -#else for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) { test_masks[dGraph->getLID(i)] = 1; } } -#endif } else { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, @@ -411,28 +417,40 @@ class Net { void construct_layers() { // append conv layers std::cout << "\nConstructing layers...\n"; - for (size_t i = 0; i < num_conv_layers - 1; i++) + for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false - if (has_l2norm) + + if (has_l2norm) { append_l2norm_layer(num_conv_layers); // l2_norm layer - if (has_dense) + } + + if (has_dense) { append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { layers[i]->add_edge(); } - for (size_t i = 1; i < num_layers; i++) + for (size_t i = 1; i < num_layers; i++) { connect(layers[i - 1], layers[i]); - for (size_t i = 0; i < num_layers; i++) + } + + for (size_t i = 0; i < num_layers; i++) { layers[i]->malloc_and_init(); - layers[0]->set_in_data(context->get_feats_ptr()); // feed input data + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - context->norm_factor_computing(0); + //context->norm_factor_computing(false); + distContext->constructNormFactor(context, false); for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); } @@ -499,14 +517,11 @@ class Net { //! forward propagation: [begin, end) is the range of samples used. //! 
calls "forward" on each layer and returns the loss of the final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { - // set mask for the last layer + // set mask for the last layer; globals layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); - // layer0: from N x D to N x 16 - // layer1: from N x 16 to N x E - // layer2: from N x E to N x E (normalize only) + for (size_t i = 0; i < num_layers; i++) { layers[i]->forward(); - // TODO need to sync model between layers here } // prediction error auto loss = layers[num_layers - 1]->get_prediction_loss(); diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index b44331fe1f..7c7bcdd61b 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -139,13 +139,13 @@ void Context::add_selfloop(Graph& og, Graph& g) { void Context::alloc_norm_factor() { Graph* g = getGraphPointer(); - if (norm_factors == NULL) + if (norm_factors == NULL) { #ifdef USE_MKL - // TODO why does MKL use size edges norm_factors = new float_t[g->sizeEdges()]; #else norm_factors = new float_t[g->size()]; #endif + } } void Context::alloc_subgraph_norm_factor(int subg_id) { diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 8576082c7b..66a030e036 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -151,6 +151,33 @@ void DistContext::initializeSyncSubstrate() { galois::runtime::getSystemNetworkInterface().Num, false); } +void DistContext::constructNormFactor(deepgalois::Context* globalContext, bool isSubgraph, + int subgraphID) { + // TODO IMPLEMENT THIS; get relevant info from the original context + globalContext->norm_factor_computing(isSubgraph, subgraphID); + + // TODO can check if already allocated instead of freeing every time + if (this->normFactors) { + free(this->normFactors); + } + +#ifdef USE_MKL + this->normFactors = new float_t[graph_cpu->sizeEdges()]; + galois::do_all(galois::iterate((size_t)0, graph_cpu->sizeEdges()), + [&] (unsigned i) { + normFactors[i] = 1; + } + ); +#else + this->normFactors = new float_t[graph_cpu->size()]; + galois::do_all(galois::iterate((size_t)0, graph_cpu->size()), + [&] (unsigned i) { + normFactors[i] = 1; + } + ); +#endif +} + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 00ca3f30e6..c117b55d27 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -96,17 +96,16 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - // std::cout << "layer: " << name_ << "\n"; - // std::cout << "x=" << x << ", y=" << y << ", z=" << z << "\n"; // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W - if (dropout_ && phase_ == net_phase::train) + if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); - else + } else { math::copy_cpu(x * y, in_data, in_temp); + } if (y > z) { math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, @@ -117,16 +116,16 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); } 
-#ifdef GALOIS_USE_DIST + // TODO sync of out_data required here + // TODO how to do this for the sampled case? deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( "AggSync"); -#endif + // run relu activation on output if specified - if (act_) - math::relu_cpu(x * z, out_data, out_data); + if (act_) math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ From 76915834828a71807a054e0256dea5709dbfc1c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 16:49:39 -0500 Subject: [PATCH 279/660] norm factor used correctly for dist graph, single process --- libdeepgalois/include/deepgalois/Context.h | 4 + .../include/deepgalois/DistContext.h | 13 +-- libdeepgalois/include/deepgalois/Net.h | 12 ++- libdeepgalois/src/Context.cpp | 15 ++++ libdeepgalois/src/DistContext.cpp | 88 ++++++++++++++----- libdeepgalois/src/Net.cpp | 35 ++++---- 6 files changed, 122 insertions(+), 45 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 373d91d463..41e3aac23b 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -119,6 +119,10 @@ class Context { reader.init(dataset); } + //! Checks if subgraph being used, sets currenet graph, then calls degreex + //! counting + Graph* getCurrentGraph(bool usingSubGraph, int subID=0); + void set_label_class(bool is_single = true) { is_single_class = is_single; } void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 0ffb2e1b0c..473242f05e 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -15,7 +15,7 @@ class DistContext { size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - DGraph* graph_cpu; // the input graph, |V| = N + DGraph* partitionedGraph; // the input graph, |V| = N std::vector subgraphs_cpu; label_t* h_labels; // labels for classification. Single-class label: Nx1, // multi-class label: NxE @@ -31,7 +31,7 @@ class DistContext { DistContext(); ~DistContext(); - void saveDistGraph(DGraph* a) { graph_cpu = a; } + void saveDistGraph(DGraph* a) { partitionedGraph = a; } //! read labels of local nodes only size_t read_labels(std::string dataset_str); @@ -46,13 +46,11 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - void constructNormFactor(deepgalois::Context* globalContext, bool is_subgraph, - int subg_id = 0); float_t* get_norm_factors_ptr() { return normFactors; } // TODO shouldn't return 0 always float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - DGraph* getGraphPointer() { return graph_cpu; } + DGraph* getGraphPointer() { return partitionedGraph; } Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg; } @@ -62,6 +60,11 @@ class DistContext { void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); + //! allocate the norm factor vector + void allocNormFactor(); + //! construct norm factor vector by using data from global graph + void constructNormFactor(deepgalois::Context* globalContext); + //! 
return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e50b081bd7..d19a54156f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -248,6 +248,7 @@ class Net { // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(num_vertices_sg); + // TODO dist context->norm_factor_computing(1, sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraph_ptr); @@ -275,8 +276,6 @@ class Net { // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); - galois::gPrint(header, "Back prop\n"); - // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients @@ -375,18 +374,23 @@ class Net { loss = fprop(begin, end, count, masks); float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable label_t* labels; if (type == "train" && subgraph_sample_size) { labels = context->get_labels_subg_ptr(); } else { + // note this grabs global labels; everything passed in should be global labels = context->get_labels_ptr(); } + if (is_single_class) { acc = masked_accuracy(begin, end, count, masks, predictions, labels); } else { acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, labels); } + t_eval.Stop(); return t_eval.Millisecs(); } @@ -448,7 +452,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure //context->norm_factor_computing(false); - distContext->constructNormFactor(context, false); + distContext->constructNormFactor(context); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); @@ -518,6 +522,8 @@ class Net { //! 
calls "forward" on each layer and returns the loss of the final layer acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { // set mask for the last layer; globals + // TODO this should be distirbuted sample begin->end not global; fix later + // seems to be unused in code right now anyways layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); for (size_t i = 0; i < num_layers; i++) { diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 7c7bcdd61b..8f0b8d07f5 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -158,6 +158,21 @@ void Context::alloc_subgraph_norm_factor(int subg_id) { norm_factors_subg.clear(); } +// get current graph, also gets degrees of g +Graph* Context::getCurrentGraph(bool usingSubGraph, int subID) { + Graph* g; + + // grab orig or subgraph pointer as necessary + if (!usingSubGraph) { + g = getGraphPointer(); + } else { + g = getSubgraphPointer(subID); + } + g->degree_counting(); + + return g; +} + void Context::norm_factor_computing(bool is_subgraph, int subg_id) { Graph* g; float_t* constants; diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 66a030e036..7d0356e189 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -7,7 +7,7 @@ DistContext::DistContext() {} DistContext::~DistContext() {} size_t DistContext::read_labels(std::string dataset_str) { - DGraph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -58,7 +58,7 @@ size_t DistContext::read_labels(std::string dataset_str) { } size_t DistContext::read_features(std::string dataset_str) { - DGraph* dGraph = DistContext::graph_cpu; + DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); @@ -147,37 +147,85 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( - *DistContext::graph_cpu, galois::runtime::getSystemNetworkInterface().ID, + *DistContext::partitionedGraph, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } -void DistContext::constructNormFactor(deepgalois::Context* globalContext, bool isSubgraph, - int subgraphID) { +void DistContext::allocNormFactor() { + if (!normFactors) { +#ifdef USE_MKL + normFactors = new float_t[partitionedGraph->sizeEdges()]; +#else + normFactors = new float_t[partitionedGraph->size()]; +#endif + } + if (!normFactors) { + GALOIS_DIE("norm factors failed to be allocated"); + } +} + +//void DistContext::allocSubNormFactor(int subID) { +// if (!normFactors) { +//#ifdef USE_MKL +// normFactors = new float_t[partitionedGraph->sizeEdges()]; +//#else +// normFactors = new float_t[partitionedGraph->size()]; +//#endif +// } +// if (!normFactors) { +// GALOIS_DIE("norm factors failed to be allocated"); +// } +//} + +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { // TODO IMPLEMENT THIS; get relevant info from the original context - globalContext->norm_factor_computing(isSubgraph, subgraphID); + // sets current subgraph + gets degrees + Graph* wholeGraph = globalContext->getCurrentGraph(false); - // TODO can check if already allocated instead of freeing 
every time - if (this->normFactors) { - free(this->normFactors); - } + allocNormFactor(); + + // this is for testing purposes + //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // [&] (unsigned i) { + // this->normFactors[i] = 0; + // } + //); #ifdef USE_MKL - this->normFactors = new float_t[graph_cpu->sizeEdges()]; - galois::do_all(galois::iterate((size_t)0, graph_cpu->sizeEdges()), + galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - normFactors[i] = 1; - } + float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); ); #else - this->normFactors = new float_t[graph_cpu->size()]; - galois::do_all(galois::iterate((size_t)0, graph_cpu->size()), - [&] (unsigned i) { - normFactors[i] = 1; - } - ); + galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + [&] (unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); #endif } +//void DistContext::constructNormFactorSub(deepgalois::Context* globalContext, bool isSubgraph, +// int subgraphID) { + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 800d550048..47b9bdc334 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,6 @@ namespace deepgalois { -#ifdef GALOIS_USE_DIST void Net::partitionInit(DGraph* graph, std::string dataset_str) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); @@ -74,7 +73,6 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { feature_dims[num_layers] = num_classes; // normalized output embedding: E layers.resize(num_layers); } -#endif #ifndef __GALOIS_HET_CUDA__ void Net::init() { @@ -95,21 +93,18 @@ void Net::regularize() { * * @param begin GLOBAL begin * @param end GLOBAL end + * @param masks: GLOBAL masks * @param count GLOBAL training count */ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { -#ifndef GALOIS_USE_DIST - galois::GAccumulator accuracy_all; -#else galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; - sampleCount.reset(); -#endif - accuracy_all.reset(); + sampleCount.reset(); + // TODO figure this out for distributed case galois::do_all( galois::iterate(begin, end), [&](const auto& i) { @@ -123,29 +118,34 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, accuracy_all += 1.0; } #else + // TODO dist subraph + // only look at owned nodes (i.e. 
masters); the prediction for these // should only by handled on the owner if (this->dGraph->isOwned(i)) { sampleCount += 1; uint32_t localID = this->dGraph->getLID(i); - if (masks[localID] == 1) { - // get prediction - auto pred = - math::argmax(num_classes, &preds[localID * num_classes]); - // check prediction - if ((label_t)pred == ground_truth[localID]) - accuracy_all += 1.0; + if (masks == NULL) { + GALOIS_DIE("subgraphs not implemented for dist yet"); + // subgraph here: TODO + } else { + if (masks[localID] == 1) { + // get prediction + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; + } } } #endif }, galois::loopname("getMaskedLoss")); -#ifdef GALOIS_USE_DIST count = sampleCount.reduce(); galois::gDebug("sample count is ", count); -#endif // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)count; @@ -154,6 +154,7 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, label_t* ground_truth) { + // TODO dist version return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } From 6881c67c15e363a5432cd7543d93fa94443badaa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 17:05:27 -0500 Subject: [PATCH 280/660] disabling sync for now; getting subgraph sampling back online first --- libdeepgalois/src/layers/graph_conv_layer.cpp | 22 ++++++++----------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index c117b55d27..7acf787bae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,10 +119,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? 
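The constants built by constructNormFactor earlier in this patch implement the standard GCN normalization: the node-indexed variant stores 1/sqrt(deg(v)) per vertex, while the edge-indexed (USE_MKL) variant stores 1/(sqrt(deg(i))*sqrt(deg(j))) per edge, i.e. the symmetric scaling D^-1/2 A D^-1/2. Degrees come from the whole graph, so a partition looks them up through global IDs. A small worked example with made-up degrees:

    // Hypothetical 3-vertex graph: deg(0)=1, deg(1)=2, deg(2)=1 (illustrative only).
    // Node-indexed factors, 1/sqrt(deg(v)):
    //   normFactors = { 1.0, 0.7071, 1.0 }
    // During aggregation, edge (0,1) is effectively scaled by
    //   normFactors[0] * normFactors[1] = 1.0 * 0.7071 ~ 1/(sqrt(1)*sqrt(2)),
    // which is exactly what the edge-indexed variant precomputes per edge.
    // In the distributed code the degree lookup goes through
    //   wholeGraph->get_degree(partitionedGraph->getGID(v)),
    // as in the do_all loops shown above.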
- deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync( - "AggSync"); + //deepgalois::_syncVectorSize = z; + //deepgalois::_dataToSync = out_data; + //layer::context->getSyncSubstrate()->sync( + // "AggSync"); // run relu activation on output if specified if (act_) math::relu_cpu(x * z, out_data, out_data); @@ -163,21 +163,17 @@ void graph_conv_layer::back_propagation(const float_t* in_data, 0.0, &layer::weight_grad[0]); } -#ifdef GALOIS_USE_DIST // sync agg - deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync( - "AggSyncBack"); -#endif + //deepgalois::_syncVectorSize = z; + //deepgalois::_dataToSync = out_temp; + //layer::context->getSyncSubstrate()->sync( + // "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); -#ifdef GALOIS_USE_DIST - layer::syncSub->sync("GradientSync"); + //layer::syncSub->sync("GradientSync"); // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); -#endif } acc_t graph_conv_layer::get_weight_decay_loss() { From 0726cdbe25d928e8264aa10c1f7c65c7eb6dbabd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 9 May 2020 23:16:14 -0500 Subject: [PATCH 281/660] sampling, fixes to sampling, distgraph lgraph creation, various other things --- .../include/deepgalois/DistContext.h | 57 ++++-- libdeepgalois/include/deepgalois/Net.h | 98 ++++++----- libdeepgalois/include/deepgalois/Sampler.h | 132 ++++++++++---- libdeepgalois/include/deepgalois/lgraph.h | 2 + libdeepgalois/src/Context.cpp | 2 +- libdeepgalois/src/DistContext.cpp | 115 +++++++++++- libdeepgalois/src/Net.cpp | 9 +- libdeepgalois/src/Sampler.cpp | 164 +++++++----------- 8 files changed, 377 insertions(+), 202 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 473242f05e..ffaf430792 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -15,23 +15,45 @@ class DistContext { size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; + Graph* lGraph; // laerning graph version DGraph* partitionedGraph; // the input graph, |V| = N - std::vector subgraphs_cpu; + std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class label: Nx1, // multi-class label: NxE - label_t* h_labels_subg; // labels for subgraph + std::vector h_labels_subg; // labels for subgraph float_t* h_feats; // input features: N x D - float_t* h_feats_subg; // input features for subgraph + std::vector h_feats_subg; // input features for subgraph - // TODO needs to come from whole graph + // change regular one to a vector as well float_t* normFactors; // normalization constant based on graph structure - std::vector norm_factors_subg; // normalization constant for subgraph + std::vector normFactorsSub; // normalization constant for subgraph public: DistContext(); ~DistContext(); - void saveDistGraph(DGraph* a) { partitionedGraph = a; } + void saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy") + ); + } //! read labels of local nodes only size_t read_labels(std::string dataset_str); @@ -46,24 +68,30 @@ class DistContext { void gen_subgraph_labels(size_t, const mask_t*) {} void gen_subgraph_feats(size_t, const mask_t*) {} - - float_t* get_norm_factors_ptr() { return normFactors; } - // TODO shouldn't return 0 always - float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } DGraph* getGraphPointer() { return partitionedGraph; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; + Graph* getLGraphPointer() { return lGraph; } + + Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); galois::graphs::GluonSubstrate* getSyncSubstrate(); //! allocate the norm factor vector void allocNormFactor(); + void allocNormFactorSub(int subID); //! construct norm factor vector by using data from global graph void constructNormFactor(deepgalois::Context* globalContext); + void constructNormFactorSub(int subgraphID); + + void constructSubgraphLabels(size_t m, const mask_t* masks); + void constructSubgraphFeatures(size_t m, const mask_t* masks); + + float_t* get_norm_factors_ptr() { return normFactors; } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } //! return label for some node //! NOTE: this is LID, not GID @@ -71,6 +99,9 @@ class DistContext { //! returns pointer to the features of each local node float_t* get_in_ptr(); + + //! 
allocate memory for subgraphs (don't actually build them) + void allocateSubgraphs(int num_subgraphs); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d19a54156f..9f49f8f847 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -55,7 +55,7 @@ class Net { size_t globalTestCount; int val_interval; int num_subgraphs; - int num_vertices_sg; + unsigned subgraphNumVertices; bool is_selfloop; mask_t* globalTrainMasks; // masks for training @@ -68,7 +68,7 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -167,6 +167,8 @@ class Net { // set the subgraph boolean if sample size is greater than 0 context->set_use_subgraph(subgraph_sample_size > 0); + + this->sampler = new Sampler(); } //! Default net constructor @@ -196,10 +198,11 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { - context->allocateSubgraphs(num_subgraphs); + distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; galois::gPrint(header, " Construct training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer()); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), + distContext->getGraphPointer()); } galois::gPrint(header, "Start training...\n"); @@ -207,68 +210,73 @@ class Net { Timer t_epoch; // run epochs - for (int ep = 0; ep < num_epochs; ep++) { + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { t_epoch.Start(); +//////////////////////////////////////////////////////////////////////////////// +// Sampling //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - std::cout << "Generating " << num_subgraphs << " subgraphs "; + galois::gPrint(header, "Generating ", num_subgraphs, " subgraph(s)\n"); + // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - // galois::do_all(galois::iterate(size_t(0), - // size_t(num_subgraphs)),[&](const auto sid) { - unsigned tid = 0; - // tid = galois::substrate::ThreadPool::getTID(); - sampler->subgraph_sample(subgraph_sample_size, - *(context->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], tid); - } //, galois::loopname("subgraph_gen")); + sampler->sampleSubgraph(subgraph_sample_size, + *(distContext->getSubgraphPointer(sid)), + &subgraphs_masks[sid * globalSamples], curEpoch); + } #endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; } + // count their degrees for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = context->getSubgraphPointer(i); + auto sg_ptr = distContext->getSubgraphPointer(i); sg_ptr->degree_counting(); // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " // num_e ", sg_ptr->sizeEdges(), "\n"); } + // choose a subgraph to use num_subg_remain--; int sg_id = num_subg_remain; - auto subgraph_ptr = context->getSubgraphPointer(sg_id); - 
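The saveDistGraph change earlier in this patch copies the partitioned LC_CSR graph into a LearningGraph with the same two-pass CSR idiom used throughout the sampler and subgraph code: size the arrays, fix each vertex's end-of-edge offset from a prefix sum of degrees, then fill destinations. A minimal, self-contained sketch of that idiom (toy sizes, not taken from the patch; it assumes the usual deepgalois typedefs Graph and index_t from types.h/lgraph.h are in scope):

    // Build a 3-vertex, 4-edge CSR graph by hand.
    deepgalois::Graph g;
    g.allocateFrom(3, 4);                      // reserve rowptr/colidx for |V|=3, |E|=4
    g.constructNodes();
    const index_t offsets[] = {0, 2, 3, 4};    // prefix sum of degrees 2, 1, 1
    for (size_t v = 0; v < 3; v++)
      g.fixEndEdge(v, offsets[v + 1]);         // rowptr[v+1]
    g.constructEdge(0, 1);  g.constructEdge(1, 2);   // edges of vertex 0
    g.constructEdge(2, 0);                           // edge of vertex 1
    g.constructEdge(3, 1);                           // edge of vertex 2
    g.degree_counting();                       // recompute cached degrees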
num_vertices_sg = subgraph_ptr->size(); - // galois::gPrint("Subgraph num_vertices: ", num_vertices_sg, ", - // num_edges: ", subgraph_ptr->sizeEdges(), "\n"); - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(num_vertices_sg); - // TODO dist - context->norm_factor_computing(1, sg_id); + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", + // num_edges: ", subgraphPointer->sizeEdges(), "\n"); + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraph_ptr); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_subg_ptr()); + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); } + // update labels for subgraph - context->gen_subgraph_labels(num_vertices_sg, - &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr(context->get_labels_subg_ptr()); + distContext->constructSubgraphLabels(this->subgraphNumVertices, + &subgraphs_masks[sg_id * globalSamples]); + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph - context->gen_subgraph_feats(num_vertices_sg, - &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr( - context->get_feats_subg_ptr()); // feed input data + distContext->constructSubgraphFeatures(this->subgraphNumVertices, + &subgraphs_masks[sg_id * globalSamples]); + layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), ep, seperator); + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -296,7 +304,7 @@ class Net { double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate && ep % val_interval == 0) { + if (need_validate && curEpoch % val_interval == 0) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); @@ -335,8 +343,8 @@ class Net { // update masks for subgraph masks = NULL; begin = 0; - end = num_vertices_sg; - count = num_vertices_sg; + end = this->subgraphNumVertices; + count = this->subgraphNumVertices; } } else if (type == "val") { begin = globalValBegin; @@ -356,11 +364,11 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(context->getGraphPointer()); - layers[i]->set_norm_consts_ptr(context->get_norm_factors_ptr()); + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); } - layers[num_layers - 1]->set_labels_ptr(context->get_labels_ptr()); - layers[0]->set_feats_ptr(context->get_feats_ptr()); // feed input data + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } #else if (type == "train") { @@ -378,10 +386,10 @@ class 
Net { // labels will be subgraph labels if applicable label_t* labels; if (type == "train" && subgraph_sample_size) { - labels = context->get_labels_subg_ptr(); + labels = distContext->get_labels_subg_ptr(); } else { // note this grabs global labels; everything passed in should be global - labels = context->get_labels_ptr(); + labels = distContext->get_labels_ptr(); } if (is_single_class) { @@ -487,11 +495,13 @@ class Net { in_dims[0] = out_dims[0] = distNumSamples; in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); + if (is_single_class) layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); else layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - layers[layer_id]->set_labels_ptr(context->get_labels_ptr()); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); } //! Add a convolution layer to the network @@ -505,7 +515,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); - layers[layer_id]->set_graph_ptr(context->getGraphPointer()); + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); } // update trainable weights after back-propagation diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 9c4ea06f12..d29c537ab9 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -4,38 +4,12 @@ namespace deepgalois { #define ETA 1.5 // length factor of DB in sampling #define SAMPLE_CLIP 3000 // clip degree in sampling -#define DEFAULT_SIZE_FRONTIER 3000 +#define DEFAULT_SIZE_FRONTIER 1000 #define DEFAULT_SIZE_SUBG 9000 class Sampler { public: typedef int db_t; - Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} - ~Sampler() {} - - //! sample a subgraph sg of size n from graph g - //! sg is overwritten/is output - void subgraph_sample(size_t n, Graph& sg, mask_t* masks, unsigned tid = 0); - - //! API function for user-defined selection strategy - // TODO how to expose this? - virtual void select_vertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); - virtual void select_vertices(size_t n, int m, VertexSet& vertex_set, - unsigned tid); - - // galois::runtime::iterable > - // neighbor_sampler(Graph &g, VertexID v); - - edge_iterator sampled_edge_begin(Graph& g, VertexID v) { - return g.edge_begin(v); - } - - edge_iterator sampled_edge_end(Graph& g, VertexID v) { return g.edge_end(v); } - - //! Given a mask, construct the graph with only those vertices ans ave as the - //! masked graph in this class for the sampler. - void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g); protected: int m_; @@ -45,27 +19,111 @@ class Sampler { int avg_deg; //! average degree cut off to a clip int subg_deg; - //! list of vertices active in the graph being maintained (masked_graph) - // VertexList vertices_; + + //VertexList vertices_; + //mask_t* masks_; + //! List of training nodes; sampling set - std::vector node_train; - mask_t* masks_; + std::vector trainingNodes; + //! masked original graph; typically to the training set - Graph* masked_graph; - Graph* graph; + Graph* globalMaskedGraph; + Graph* globalGraph; + DGraph* partGraph; //! Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); + //! Given a graph, return a graph with edges to unmasked vertices removed in //! 
mg - void getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& mg); - void get_masked_degrees(size_t n, mask_t* masks, Graph* g, - std::vector& degrees); + template + void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub.allocateFrom(n, ne); + sub.constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, n), + [&](const auto src) { + sub.fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) + sub.constructEdge(idx++, dst, 0); + } + } + } + , + galois::loopname("gen_subgraph")); + } + + +//! determine degree of each vertex in a masked graph (given by masks and g) +template +void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { +#else + for (size_t src = 0; src < n; src++) { +#endif + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + //galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + } +#ifdef PARALLEL_GEN + , + galois::loopname("update_degrees")); +#endif +} + //! Set masks bitset with IDs in the vertices VertexSet - void getMasks(size_t n, VertexSet vertices, mask_t* masks); + void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); + + VertexSet convertToLID(VertexSet& gidSet); + +public: + Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} + ~Sampler() {} + + //! sample a subgraph sg of size n from graph g + //! sg is overwritten/is output + void sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed = 0); + + //! API function for user-defined selection strategy + // TODO how to expose this? + virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set); + virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, + unsigned seed); + + // galois::runtime::iterable > + // neighbor_sampler(Graph &g, VertexID v); + + //! Given a mask, construct the graph with only those vertices ans ave as the + //! masked graph in this class for the sampler. 
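Taken together, the members declared above form the pipeline that sampleSubgraph drives: pick a vertex set by frontier sampling over global IDs, map it onto the local partition, mask the partitioned graph down to those vertices, and reindex the survivors into a compact subgraph. A simplified sketch of that flow (condensed from the Sampler.cpp implementation later in this patch):

    // Condensed flow of Sampler::sampleSubgraph(n, sg, masks, seed).
    VertexSet sampledGIDs;
    selectVertices(n, m_, sampledGIDs, seed);            // frontier sampling on global IDs
    VertexSet sampledLIDs = convertToLID(sampledGIDs);   // keep vertices present on this host
    createMasks(partGraph->size(), sampledLIDs, masks);  // 0/1 mask over local vertices
    Graph maskedSG;
    getMaskedGraph(partGraph->size(), masks, partGraph, maskedSG); // induced edges only
    reindexSubgraph(sampledLIDs, maskedSG, sg);          // compact IDs 0..|sample|-1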
+ void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 0c06a926cb..40ca6c5a18 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -68,6 +68,7 @@ class LearningGraph { degrees_.resize(num_vertices_); rowptr_[0] = 0; } + void constructEdge(index_t eid, index_t dst, edata_t edata = 0) { assert(dst < num_vertices_); assert(eid < num_edges_); @@ -75,6 +76,7 @@ class LearningGraph { if (edge_data_) edge_data_[eid] = edata; } + void add_selfloop() { auto old_colidx_ = colidx_; colidx_.resize(num_vertices_ + num_edges_); diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 8f0b8d07f5..17b9872f74 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -93,6 +93,7 @@ size_t Context::read_graph(bool selfloop) { //} else galois::graphs::readGraph(*graph_cpu, filename); } else { graph_cpu->readGraph(dataset); + galois::gPrint("graph read size ", graph_cpu->size()); } // TODO dist version of self loop } else { @@ -155,7 +156,6 @@ void Context::alloc_subgraph_norm_factor(int subg_id) { #else norm_factors_subg.resize(g->size()); #endif - norm_factors_subg.clear(); } // get current graph, also gets degrees of g diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7d0356e189..7899a180e2 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -164,6 +164,16 @@ void DistContext::allocNormFactor() { } } +void DistContext::allocNormFactorSub(int subID) { +#ifdef USE_MKL + normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); +#else + normFactorsSub.resize(partitionedSubgraphs[subID]->size()); +#endif + // TODO clean out? +} + + //void DistContext::allocSubNormFactor(int subID) { // if (!normFactors) { //#ifdef USE_MKL @@ -223,11 +233,112 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #endif } -//void DistContext::constructNormFactorSub(deepgalois::Context* globalContext, bool isSubgraph, -// int subgraphID) { +void DistContext::constructNormFactorSub(int subgraphID) { + // right now norm factor based on subgraph + // TODO fix this + + allocNormFactorSub(subgraphID); + + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); + + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist setting wrong +#ifdef USE_MKL + galois::do_all(galois::iterate((size_t)0, graphToUse->size()), + [&] (unsigned i) { + //float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); e++) { + const auto j = graphToUse->getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } + }, + galois::loopname("NormCountingEdge")); + ); +#else + galois::do_all(galois::iterate((size_t)0, graphToUse.size()), + [&] (unsigned v) { + //auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); +#endif +} +//! generate labels for the subgraph, m is subgraph size, mask +//! tells which vertices to use +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + // TODO multiclass + + // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; + //if (DistContext::is_single_class) { + //} else { + // DistContext::h_labels_subg.resize(m * Context::num_classes); + //} + + DistContext::h_labels_subg.resize(m); + + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + //if (Context::is_single_class) { + //} else { + // std::copy(Context::h_labels + i * Context::num_classes, + // Context::h_labels + (i + 1) * Context::num_classes, + // &Context::h_labels_subg[count * Context::num_classes]); + //} + DistContext::h_labels_subg[count] = h_labels[i]; + count++; + } + } + GALOIS_ASSERT(count == m); +} + +//! generate input features for the subgraph, m is subgraph size, +//! 
masks tells which vertices to use +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + count++; + } + } + GALOIS_ASSERT(count == m); +} + + + + + + + galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; +void DistContext::allocateSubgraphs(int num_subgraphs) { + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + } +} + } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 47b9bdc334..3500911c74 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -127,13 +127,16 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, uint32_t localID = this->dGraph->getLID(i); if (masks == NULL) { - GALOIS_DIE("subgraphs not implemented for dist yet"); + //GALOIS_DIE("subgraphs not implemented for dist yet"); // subgraph here: TODO + auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + // check prediction + if ((label_t)pred == ground_truth[localID]) + accuracy_all += 1.0; } else { if (masks[localID] == 1) { // get prediction - auto pred = - math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 6a84a8de76..3dfbcf8c86 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -12,48 +12,52 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; - this->masks_ = masks; // save original graph - Sampler::graph = g; + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + // allocate the object for the new masked graph - Sampler::masked_graph = new Graph(); + Sampler::globalMaskedGraph = new Graph(); std::vector degrees(g->size(), 0); // get degrees of nodes that will be in new graph - this->get_masked_degrees(g->size(), masks, g, degrees); + this->getMaskedDegrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); size_t ne = offsets[g->size()]; - // save ids (on original graph) of training nodes to vector + // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { if (masks[i] == 1) - Sampler::node_train.push_back(i); + Sampler::trainingNodes.push_back(i); } - Sampler::masked_graph->allocateFrom(g->size(), ne); - Sampler::masked_graph->constructNodes(); + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks galois::do_all( galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::masked_graph->fixEndEdge(src, offsets[src + 1]); + 
Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) - Sampler::masked_graph->constructEdge(idx++, dst, 0); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } } }, galois::loopname("gen_subgraph")); - Sampler::masked_graph->degree_counting(); - Sampler::avg_deg = masked_graph->sizeEdges() / masked_graph->size(); + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; + // TODO masked part graph as well to save time later + // size_t idx = 0; // vertices_.resize(count); // for (size_t i = begin; i < end; i++) { @@ -62,69 +66,6 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g) { //} } -//! determine degree of each vertex in a masked graph (given by masks and g) -void Sampler::get_masked_degrees(size_t n, mask_t* masks, Graph* g, - std::vector& degrees) { - assert(degrees.size() == n); -#ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) - degrees[src]++; - } - } - } -#ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); -#endif -} - -//! returns a graph in the variable sub: it is g with the mask applied -void Sampler::getMaskedGraph(size_t n, mask_t* masks, Graph* g, Graph& sub) { - std::vector degrees(n, 0); - this->get_masked_degrees(n, masks, g, degrees); - // auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", - // ne, "\n"); - // - - // note this constructs the full graph's nodes; just trims edges - sub.allocateFrom(n, ne); - sub.constructNodes(); - -#ifdef PARALLEL_GEN - galois::do_all( - galois::iterate((size_t)0, n), - [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - sub.fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) - sub.constructEdge(idx++, dst, 0); - } - } - } -#ifdef PARALLEL_GEN - , - galois::loopname("gen_subgraph")); -#endif -} - // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size) { @@ -154,7 +95,10 @@ void print_vertex_set(VertexSet vertex_set) { // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp -void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { +void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { + if (n < (size_t)m) { + m = n; + } unsigned myseed = seed; // unsigned myseed = tid; @@ -174,16 +118,16 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { IA3.resize(m); // galois::gPrint("seed ", myseed, " m ", m, "\n"); - // galois::gPrint("node_train size: ", node_train.size(), "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); // printf("( "); - // for 
(size_t i = 0; i < 10; i++) std::cout << node_train[i] << " "; + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; // printf(")\n"); for (int i = 0; i < m; i++) { - auto rand_idx = rand_r(&myseed) % Sampler::node_train.size(); - db_t v = IA3[i] = Sampler::node_train[rand_idx]; + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; st.insert(v); - IA0[i] = getDegree(Sampler::masked_graph, v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); IA0[i] = (IA0[i] > SAMPLE_CLIP) ? SAMPLE_CLIP : IA0[i]; IA1[i] = 1; IA2[i] = 0; @@ -216,17 +160,17 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { } choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); db_t v = DB0[choose]; - auto degree = getDegree(Sampler::masked_graph, v); + auto degree = getDegree(Sampler::globalMaskedGraph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::masked_graph->getEdgeDst( - Sampler::masked_graph->edge_begin(v) + neigh_v); + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; for (auto i = choose; i < choose - DB1[choose]; i++) DB0[i] = db_t(-1); - newsize = getDegree(Sampler::masked_graph, neigh_v); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; } else newsize = 0; @@ -301,8 +245,8 @@ void Sampler::select_vertices(size_t n, int m, VertexSet& st, unsigned seed) { // n: number of vertices in the subgraph; // m: number of vertices in the frontier. // our implementation of GraphSAINT sampling -void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set) { +void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, + VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); @@ -343,7 +287,7 @@ void Sampler::select_vertices(size_t nv, size_t n, int m, Graph* g, */ } -void Sampler::getMasks(size_t n, VertexSet vertices, mask_t* masks) { +void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); for (auto v : vertices) @@ -365,7 +309,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size - VertexList new_ids = this->reindexVertices(graph->size(), keptVertices); + VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); std::vector degrees(nv, 0); // degrees of vertices in the subgraph for (auto v : keptVertices) { degrees[new_ids[v]] = getDegree(&origGraph, v); @@ -403,23 +347,39 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, #endif } -void Sampler::subgraph_sample(size_t n, Graph& sg, mask_t* masks, - unsigned tid) { +VertexSet Sampler::convertToLID(VertexSet& gidSet) { + VertexSet existingLIDs; + + for (auto i : gidSet) { + if (partGraph->isLocal(i)) { + existingLIDs.insert(partGraph->getLID(i)); + } + } + + return existingLIDs; +} + +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, + unsigned seed) { VertexSet 
sampledSet; // n = 9000 by default - // this->select_vertices(count_, n, m_, masked_graph, vertices_, sampledSet); - + // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); // do the sampling of vertices from training set + using masked graph - this->select_vertices(n, m_, sampledSet, tid); // m = 1000 by default + this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default + + // sampledSet is a list of *global* ids in the graph + // create new vertex set with LIDs for partitioned graph + VertexSet sampledLIDs = this->convertToLID(sampledSet); - // create the masks on the masked_graph - getMasks(Sampler::graph->size(), sampledSet, masks); + // create the masks + createMasks(Sampler::partGraph->size(), sampledLIDs, masks); - Graph masked_sg; - this->getMaskedGraph( - Sampler::graph->size(), masks, Sampler::masked_graph, - masked_sg); // remove edges whose destination is not masked - this->reindexSubgraph(sampledSet, masked_sg, sg); + // this graph will contain sampled vertices and induced subgraph for it + Graph maskedSG; + // TODO use partMaskedGraph once constructed later + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + maskedSG); // remove edges whose destination is not masked + this->reindexSubgraph(sampledLIDs, maskedSG, sg); } } // namespace deepgalois From 066a0b64b2c21b00b0a7afd536ca2891ec73e664 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 14:32:46 -0500 Subject: [PATCH 282/660] subgraph norm factor generation: assing to correct var also added a bunch of commented out print debugs --- libdeepgalois/include/deepgalois/Net.h | 9 +++++- libdeepgalois/include/deepgalois/Sampler.h | 4 ++- libdeepgalois/src/DistContext.cpp | 36 ++++++++++++++-------- libdeepgalois/src/Sampler.cpp | 16 ++++++++-- 4 files changed, 47 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 9f49f8f847..cb559f4a4d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -200,7 +200,7 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, " Construct training vertex set induced graph...\n"); + galois::gPrint(header, "Constructing training vertex set induced graph...\n"); sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), distContext->getGraphPointer()); } @@ -272,6 +272,13 @@ class Net { distContext->constructSubgraphFeatures(this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data + + //Graph* testing = distContext->getSubgraphPointer(sg_id); + //for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index d29c537ab9..f736fa6a8f 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -57,8 +57,10 @@ class Sampler { auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) 
+ if (masks[dst] == 1) { + //galois::gPrint(src, " ", dst, "\n"); sub.constructEdge(idx++, dst, 0); + } } } } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7899a180e2..2e23d967fe 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -68,9 +68,17 @@ size_t DistContext::read_features(std::string dataset_str) { in.open(filename, std::ios::in); size_t m; // m = number of global vertices - // header read in >> m >> feat_len >> std::ws; + +// std::string file_dims = path + dataset_str + "-dims.txt"; +// std::ifstream ifs; +// ifs.open(file_dims, std::ios::in); +// ifs >> m >> feat_len >> std::ws; +// ifs.close(); +// + + galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); // use local size, not global size h_feats = new float_t[dGraph->size() * feat_len]; @@ -87,9 +95,9 @@ size_t DistContext::read_features(std::string dataset_str) { edge_stream >> v; // actual feature edge_stream >> w; - h_feats[dGraph->getLID(u) * feat_len + v] = w; } + //galois::gPrint(u, "\n"); } in.close(); @@ -235,7 +243,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { void DistContext::constructNormFactorSub(int subgraphID) { // right now norm factor based on subgraph - // TODO fix this + // TODO fix this for dist execution allocNormFactorSub(subgraphID); @@ -254,9 +262,9 @@ void DistContext::constructNormFactorSub(int subgraphID) { float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; + this->normFactorsSub[e] = 0.0; } else { - this->normFactors[e] = 1.0 / (c_i * c_j); + this->normFactorsSub[e] = 1.0 / (c_i * c_j); } }, galois::loopname("NormCountingEdge")); @@ -268,10 +276,11 @@ void DistContext::constructNormFactorSub(int subgraphID) { auto degree = graphToUse.get_degree(v); float_t temp = std::sqrt(float_t(degree)); if (temp == 0.0) { - this->normFactors[v] = 0.0; + this->normFactorsSub[v] = 0.0; } else { - this->normFactors[v] = 1.0 / temp; + this->normFactorsSub[v] = 1.0 / temp; } + galois::gPrint(this->normFactorsSub[v], "\n"); }, galois::loopname("NormCountingNode")); #endif @@ -300,6 +309,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { // &Context::h_labels_subg[count * Context::num_classes]); //} DistContext::h_labels_subg[count] = h_labels[i]; + //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); count++; } } @@ -317,6 +327,12 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { std::copy(DistContext::h_feats + i * DistContext::feat_len, DistContext::h_feats + (i + 1) * DistContext::feat_len, &DistContext::h_feats_subg[count * DistContext::feat_len]); + //for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], " "); + // } + //} + //galois::gPrint("\n"); count++; } } @@ -324,12 +340,6 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { } - - - - - - galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; }; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 3dfbcf8c86..d57bf85537 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -45,8 +45,10 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap auto idx = offsets[src]; for (auto e = g->edge_begin(src); e != 
g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) + if (masks[dst] == 1) { + //galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } } } }, @@ -359,8 +361,7 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, - unsigned seed) { +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) { VertexSet sampledSet; // n = 9000 by default // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); @@ -371,6 +372,13 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, // create new vertex set with LIDs for partitioned graph VertexSet sampledLIDs = this->convertToLID(sampledSet); + //VertexSet sampledLIDs; + //galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); + //galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), "\n"); + //for (auto i : this->trainingNodes) { + // sampledLIDs.insert(i); + //} + // create the masks createMasks(Sampler::partGraph->size(), sampledLIDs, masks); @@ -380,6 +388,8 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); // remove edges whose destination is not masked this->reindexSubgraph(sampledLIDs, maskedSG, sg); + + //galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } } // namespace deepgalois From b770e71e15a481cbad8ac252fdeebaa790ea9001 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 16:43:35 -0500 Subject: [PATCH 283/660] dist: feature reading using binary --- libdeepgalois/src/DistContext.cpp | 61 +++++++++++++++---------------- 1 file changed, 30 insertions(+), 31 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2e23d967fe..7e27ea0d92 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -64,42 +64,41 @@ size_t DistContext::read_features(std::string dataset_str) { std::string filename = path + dataset_str + ".ft"; std::ifstream in; - std::string line; - - in.open(filename, std::ios::in); - size_t m; // m = number of global vertices - // header read - in >> m >> feat_len >> std::ws; - -// std::string file_dims = path + dataset_str + "-dims.txt"; -// std::ifstream ifs; -// ifs.open(file_dims, std::ios::in); -// ifs >> m >> feat_len >> std::ws; -// ifs.close(); -// + size_t m; // m = number of vertices + // dimension read + std::string file_dims = path + dataset_str + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> m >> this->feat_len >> std::ws; + ifs.close(); galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); - // use local size, not global size + + // TODO read in without using 2 in-memory buffers + // full read feats to load into h_feats + float_t* fullFeats = new float_t[m * feat_len]; + // actual stored feats h_feats = new float_t[dGraph->size() * feat_len]; - // loop through all features - while (std::getline(in, line)) { - std::istringstream edge_stream(line); - unsigned u, v; - float_t w; - // vertex to set feature for - edge_stream >> u; - // only set if local - if (dGraph->isLocal(u)) { - // feature index - edge_stream >> v; - // actual feature - edge_stream >> w; - h_feats[dGraph->getLID(u) * feat_len + v] = w; + // read in full feats + filename = path + dataset_str + "-feats.bin"; + in.open(filename, std::ios::binary | std::ios::in); + 
in.read((char*)fullFeats, sizeof(float_t) * m * feat_len); + in.close(); + + // get the local ids we want + size_t count = 0; + for (size_t i = 0; i < m; i++) { + if (dGraph->isLocal(i)) { + //h_feats[count * feat_len] = fullFeats[i]; + std::copy(fullFeats + i * DistContext::feat_len, + fullFeats + (i + 1) * DistContext::feat_len, + &this->h_feats[count * DistContext::feat_len]); + count++; } - //galois::gPrint(u, "\n"); } - in.close(); + GALOIS_ASSERT(count == dGraph->size()); + free(fullFeats); galois::gPrint("[", myID, "] Done with features, feature length: ", feat_len, "\n"); @@ -280,7 +279,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[v] = 1.0 / temp; } - galois::gPrint(this->normFactorsSub[v], "\n"); + //galois::gPrint(this->normFactorsSub[v], "\n"); }, galois::loopname("NormCountingNode")); #endif From d3ae95ec8888691b96027adfe540a58a584b591b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:22:08 -0500 Subject: [PATCH 284/660] multiclass reading reimplemented in distcontext --- .../include/deepgalois/DistContext.h | 11 ++- libdeepgalois/include/deepgalois/Net.h | 5 +- libdeepgalois/src/DistContext.cpp | 93 +++++++++---------- libdeepgalois/src/Net.cpp | 4 +- lonestar/gnn/gcn/gcn.cpp | 2 +- 5 files changed, 56 insertions(+), 59 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index ffaf430792..afd441b6e1 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -25,17 +25,20 @@ class DistContext { std::vector h_feats_subg; // input features for subgraph // change regular one to a vector as well - float_t* normFactors; // normalization constant based on graph structure + std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph + bool usingSingleClass; public: - DistContext(); + // TODO better constructor + DistContext() : usingSingleClass(true) {}; ~DistContext(); void saveDistGraph(DGraph* a) { partitionedGraph = a; // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory this->lGraph = new Graph(); this->lGraph->allocateFrom(a->size(), a->sizeEdges()); this->lGraph->constructNodes(); @@ -56,7 +59,7 @@ class DistContext { } //! read labels of local nodes only - size_t read_labels(std::string dataset_str); + size_t read_labels(bool isSingleClassLabel, std::string dataset_str); //! read features of local nodes only size_t read_features(std::string dataset_str); //! read masks of local nodes only @@ -90,7 +93,7 @@ class DistContext { void constructSubgraphLabels(size_t m, const mask_t* masks); void constructSubgraphFeatures(size_t m, const mask_t* masks); - float_t* get_norm_factors_ptr() { return normFactors; } + float_t* get_norm_factors_ptr() { return normFactors.data(); } float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } //! 
return label for some node diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index cb559f4a4d..8bc27df33f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -123,9 +123,6 @@ class Net { context->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = context->read_graph(selfloop); - context->set_label_class(is_single_class); - // read ground truth labels - num_classes = context->read_labels(); // get training and validation sets: this is to create the training // subgraph in the sampler @@ -186,7 +183,7 @@ class Net { void init(); //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str); + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 7e27ea0d92..d354301eb5 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,11 +3,11 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() {} DistContext::~DistContext() {} -size_t DistContext::read_labels(std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; + this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); @@ -17,11 +17,18 @@ size_t DistContext::read_labels(std::string dataset_str) { in.open(filename, std::ios::in); size_t m; // read file header - in >> m >> num_classes >> std::ws; + in >> m >> this->num_classes >> std::ws; assert(m == dGraph->globalSize()); + // size of labels should be # local nodes - h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + if (isSingleClassLabel) { + galois::gPrint("[", myID, "] One hot labels...\n"); + this->h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 + } else { + galois::gPrint("[", myID, "] Multi-class labels...\n"); + this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E + } uint32_t foundVertices = 0; unsigned v = 0; @@ -32,14 +39,21 @@ size_t DistContext::read_labels(std::string dataset_str) { std::istringstream label_stream(line); unsigned x; // for each class - for (size_t idx = 0; idx < num_classes; ++idx) { + for (size_t idx = 0; idx < this->num_classes; ++idx) { // check if that class is labeled label_stream >> x; - if (x != 0) { - // set local id - h_labels[dGraph->getLID(v)] = idx; + + // diff between single and multi class + if (isSingleClassLabel) { + if (x != 0) { + // set local id + this->h_labels[dGraph->getLID(v)] = idx; + foundVertices++; + break; + } + } else { + this->h_labels[dGraph->getLID(v) * this->num_classes + idx] = x; foundVertices++; - break; } } } @@ -159,42 +173,26 @@ void DistContext::initializeSyncSubstrate() { } void DistContext::allocNormFactor() { - if (!normFactors) { #ifdef USE_MKL - normFactors = new float_t[partitionedGraph->sizeEdges()]; + this->normFactors.resize(partitionedGraph->sizeEdges()); #else - normFactors = new float_t[partitionedGraph->size()]; + 
this->normFactors.resize(partitionedGraph->size()); #endif - } - if (!normFactors) { - GALOIS_DIE("norm factors failed to be allocated"); - } + // TODO clean out? } void DistContext::allocNormFactorSub(int subID) { #ifdef USE_MKL - normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); + this->normFactorsSub.resize(partitionedSubgraphs[subID]->sizeEdges()); #else - normFactorsSub.resize(partitionedSubgraphs[subID]->size()); + this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); #endif // TODO clean out? } -//void DistContext::allocSubNormFactor(int subID) { -// if (!normFactors) { -//#ifdef USE_MKL -// normFactors = new float_t[partitionedGraph->sizeEdges()]; -//#else -// normFactors = new float_t[partitionedGraph->size()]; -//#endif -// } -// if (!normFactors) { -// GALOIS_DIE("norm factors failed to be allocated"); -// } -//} - void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + galois::gPrint("Norm factor construction\n"); // TODO IMPLEMENT THIS; get relevant info from the original context // sets current subgraph + gets degrees Graph* wholeGraph = globalContext->getCurrentGraph(false); @@ -238,9 +236,11 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { }, galois::loopname("NormCountingNode")); #endif + galois::gPrint("Norm factor construction done\n"); } void DistContext::constructNormFactorSub(int subgraphID) { + galois::gPrint("Sub norm factor construction\n"); // right now norm factor based on subgraph // TODO fix this for dist execution @@ -283,31 +283,28 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif + galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! 
tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { - // TODO multiclass - - // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - //if (DistContext::is_single_class) { - //} else { - // DistContext::h_labels_subg.resize(m * Context::num_classes); - //} - - DistContext::h_labels_subg.resize(m); + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } size_t count = 0; // see which labels to copy over for this subgraph for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { - //if (Context::is_single_class) { - //} else { - // std::copy(Context::h_labels + i * Context::num_classes, - // Context::h_labels + (i + 1) * Context::num_classes, - // &Context::h_labels_subg[count * Context::num_classes]); - //} - DistContext::h_labels_subg[count] = h_labels[i]; + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy(DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); count++; } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 3500911c74..ce23b2b51d 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,7 @@ namespace deepgalois { -void Net::partitionInit(DGraph* graph, std::string dataset_str) { +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); @@ -19,7 +19,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str) { // or on master node only this->distContext->initializeSyncSubstrate(); - num_classes = this->distContext->read_labels(dataset_str); + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); // std::cout << "Reading label masks ... 
"; this->distTrainMasks = new mask_t[this->distNumSamples]; diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index d9219438ae..fabd27667f 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -27,7 +27,7 @@ int main(int argc, char** argv) { neighbor_sample_sz, subgraph_sample_sz, val_interval); // initialize distributed context - network.partitionInit(dGraph, dataset); + network.partitionInit(dGraph, dataset, is_single_class); // construct layers from distributed context network.construct_layers(); From ad6424209a3efd3527742a1a4e1b7888eff1a9e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:52:30 -0500 Subject: [PATCH 285/660] bunch of cleanup: context is now topo only, removed unused functions --- libdeepgalois/include/deepgalois/Context.h | 60 +------- .../include/deepgalois/DistContext.h | 5 - libdeepgalois/include/deepgalois/Net.h | 19 +-- libdeepgalois/include/deepgalois/Sampler.h | 1 + libdeepgalois/src/Context.cpp | 141 +----------------- libdeepgalois/src/DistContext.cpp | 9 +- libdeepgalois/src/Sampler.cpp | 41 ++--- 7 files changed, 45 insertions(+), 231 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 41e3aac23b..7faebd7c83 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -18,31 +18,14 @@ namespace deepgalois { class Context { std::string dataset; bool is_device; // is this on device or host - size_t n; // number of samples: N - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - bool is_single_class; // single-class (one-hot) or multi-class label bool is_selfloop_added; // whether selfloop is added to the input graph - bool use_subgraph; // whether to use subgraph - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D - // label_t *h_labels_subg; // labels for subgraph - // float_t* h_feats_subg; // input features for subgraph + label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device - float_t* norm_factors; // normalization constant based on graph structure - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector norm_factors_subg; // normalization constant for subgraph - // float_t* norm_factors_subg; // normalization constant for subgraph - Reader reader; - - void alloc_norm_factor(); - void alloc_subgraph_norm_factor(int subg_id); + Reader reader; public: // TODO separate below to public and private @@ -52,11 +35,6 @@ class Context { void add_selfloop(Graph& og, Graph& g); //! returns pointer to the graph Graph* getGraphPointer() { return graph_cpu; } - Graph* getSubgraphPointer(int id) { return subgraphs_cpu[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return &h_feats_subg[0]; } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return &h_labels_subg[0]; } #else static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -85,35 +63,18 @@ class Context { Context(); //! 
initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) - : is_device(use_gpu), n(0), num_classes(0), feat_len(0), - is_single_class(true), is_selfloop_added(false), use_subgraph(false), - h_labels(NULL), h_feats(NULL), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL), norm_factors(NULL) {} + : is_device(use_gpu), + is_selfloop_added(false), d_labels(NULL), d_labels_subg(NULL), + d_feats(NULL), d_feats_subg(NULL) {} ~Context(); size_t read_graph(bool selfloop); - size_t read_labels() { - num_classes = reader.read_labels(is_single_class, h_labels); - return num_classes; - } - size_t read_features() { - feat_len = reader.read_features(h_feats); - return feat_len; - } + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } - label_t get_label(size_t i) { - return h_labels[i]; - } // single-class (one-hot) label - - // label_t get_label(size_t i, size_t j) { return labels[i*num_classes+j]; } - // // multi-class label - float_t* get_norm_factors_ptr() { return norm_factors; } - float_t* get_norm_factors_subg_ptr() { return &norm_factors_subg[0]; } - void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); @@ -121,16 +82,9 @@ class Context { //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting - Graph* getCurrentGraph(bool usingSubGraph, int subID=0); + Graph* getFullGraph(); - void set_label_class(bool is_single = true) { is_single_class = is_single; } - void set_use_subgraph(bool use_subg) { use_subgraph = use_subg; } void copy_data_to_device(); // copy labels and input features - void norm_factor_computing(bool is_subgraph, int subg_id = 0); - void gen_subgraph_labels(size_t m, const mask_t* masks); - void gen_subgraph_feats(size_t m, const mask_t* masks); - //! Allocate subgraphs (but don't actually do sampling yet) - void allocateSubgraphs(int num_subgraphs); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index afd441b6e1..be7dd7be45 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -66,11 +66,6 @@ class DistContext { size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); - // TODO define these - void createSubgraphs(int) {} - void gen_subgraph_labels(size_t, const mask_t*) {} - void gen_subgraph_feats(size_t, const mask_t*) {} - DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 8bc27df33f..c10c262a02 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -76,7 +76,7 @@ class Net { // TODO optimize single host case //! context holds all of the graph data - deepgalois::Context* context; + deepgalois::Context* graphTopologyContext; //! 
dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; @@ -119,10 +119,10 @@ class Net { feature_dims.resize(num_layers + 1); // initialze global graph context - context = new deepgalois::Context(); - context->set_dataset(dataset_str); + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes - globalSamples = context->read_graph(selfloop); + globalSamples = graphTopologyContext->read_graph(selfloop); // get training and validation sets: this is to create the training // subgraph in the sampler @@ -147,9 +147,9 @@ class Net { globalValMasks[i] = 1; } else { globalTrainCount = - context->read_masks("train", globalSamples, globalTrainBegin, + graphTopologyContext->read_masks("train", globalSamples, globalTrainBegin, globalTrainEnd, globalTrainMasks); - globalValCount = context->read_masks("val", globalSamples, globalValBegin, + globalValCount = graphTopologyContext->read_masks("val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } @@ -162,9 +162,6 @@ class Net { // features are read in distcontext, not this context (this context only // used for sampling) - // set the subgraph boolean if sample size is greater than 0 - context->set_use_subgraph(subgraph_sample_size > 0); - this->sampler = new Sampler(); } @@ -198,7 +195,7 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; galois::gPrint(header, "Constructing training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, context->getGraphPointer(), + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); } @@ -464,7 +461,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure //context->norm_factor_computing(false); - distContext->constructNormFactor(context); + distContext->constructNormFactor(graphTopologyContext); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); set_contexts(); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index f736fa6a8f..1a9fabd9ec 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -103,6 +103,7 @@ void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, size_t size); + //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); public: diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp index 17b9872f74..c9bbe9e706 100644 --- a/libdeepgalois/src/Context.cpp +++ b/libdeepgalois/src/Context.cpp @@ -10,67 +10,7 @@ namespace deepgalois { Context::Context() : Context(false) {} -Context::~Context() { - if (h_labels) - delete[] h_labels; - if (h_feats) - delete[] h_feats; - if (norm_factors) - delete[] norm_factors; - // if (h_feats_subg) delete[] h_feats_subg; - // if (h_labels_subg) delete[] h_labels_subg; - // if (norm_factors_subg) delete[] norm_factors_subg; -} - -void Context::allocateSubgraphs(int num_subgraphs) { - subgraphs_cpu.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) - subgraphs_cpu[i] = new Graph(); -} - -//! generate labels for the subgraph, m is subgraph size, mask -//! 
tells which vertices to use -void Context::gen_subgraph_labels(size_t m, const mask_t* masks) { - // if (h_labels_subg == NULL) h_labels_subg = new label_t[m]; - if (Context::is_single_class) { - Context::h_labels_subg.resize(m); - } else { - Context::h_labels_subg.resize(m * Context::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - if (Context::is_single_class) { - Context::h_labels_subg[count] = h_labels[i]; - } else { - std::copy(Context::h_labels + i * Context::num_classes, - Context::h_labels + (i + 1) * Context::num_classes, - &Context::h_labels_subg[count * Context::num_classes]); - } - count++; - } - } - assert(count == m); -} - -//! generate input features for the subgraph, m is subgraph size, -//! masks tells which vertices to use -void Context::gen_subgraph_feats(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - Context::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < n; i++) { - if (masks[i] == 1) { - std::copy(Context::h_feats + i * Context::feat_len, - Context::h_feats + (i + 1) * Context::feat_len, - &Context::h_feats_subg[count * Context::feat_len]); - count++; - } - } - assert(count == m); -} +Context::~Context() {} size_t Context::read_graph(bool selfloop) { std::string filename = path + dataset + ".csgr"; @@ -138,86 +78,11 @@ void Context::add_selfloop(Graph& og, Graph& g) { //} } -void Context::alloc_norm_factor() { - Graph* g = getGraphPointer(); - if (norm_factors == NULL) { -#ifdef USE_MKL - norm_factors = new float_t[g->sizeEdges()]; -#else - norm_factors = new float_t[g->size()]; -#endif - } -} - -void Context::alloc_subgraph_norm_factor(int subg_id) { - Graph* g = getSubgraphPointer(subg_id); -#ifdef USE_MKL - norm_factors_subg.resize(g->sizeEdges()); -#else - norm_factors_subg.resize(g->size()); -#endif -} - // get current graph, also gets degrees of g -Graph* Context::getCurrentGraph(bool usingSubGraph, int subID) { - Graph* g; - - // grab orig or subgraph pointer as necessary - if (!usingSubGraph) { - g = getGraphPointer(); - } else { - g = getSubgraphPointer(subID); - } +Graph* Context::getFullGraph() { + Graph* g = getGraphPointer(); g->degree_counting(); - return g; } -void Context::norm_factor_computing(bool is_subgraph, int subg_id) { - Graph* g; - float_t* constants; - - // grab orig or subgraph pointer as necessary - if (!is_subgraph) { - g = getGraphPointer(); - alloc_norm_factor(); - constants = norm_factors; - } else { - g = getSubgraphPointer(subg_id); - alloc_subgraph_norm_factor(subg_id); - constants = get_norm_factors_subg_ptr(); - } - - auto g_size = g->size(); - g->degree_counting(); -#ifdef USE_MKL - galois::do_all( - galois::iterate((size_t)0, g_size), - [&](auto i) { - float_t c_i = std::sqrt(float_t(g->get_degree(i))); - for (auto e = g->edge_begin(i); e != g->edge_end(i); e++) { - const auto j = g->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(g->get_degree(j))); - if (c_i == 0.0 || c_j == 0.0) - constants[e] = 0.0; - else - constants[e] = 1.0 / (c_i * c_j); - } - }, - galois::loopname("NormCountingEdge")); -#else - galois::do_all( - galois::iterate((size_t)0, g_size), - [&](auto v) { - auto degree = g->get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) - constants[v] = 0.0; - else - constants[v] = 1.0 / temp; - }, - galois::loopname("NormCountingVertex")); -#endif -} - } // namespace deepgalois diff --git 
a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index d354301eb5..ea98e26007 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -5,6 +5,7 @@ namespace deepgalois { DistContext::~DistContext() {} +// TODO move to reader class size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; @@ -71,6 +72,7 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str return num_classes; } +// TODO move to reader class size_t DistContext::read_features(std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -120,6 +122,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } +// TODO move to reader class size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { @@ -193,12 +196,10 @@ void DistContext::allocNormFactorSub(int subID) { void DistContext::constructNormFactor(deepgalois::Context* globalContext) { galois::gPrint("Norm factor construction\n"); - // TODO IMPLEMENT THIS; get relevant info from the original context - // sets current subgraph + gets degrees - Graph* wholeGraph = globalContext->getCurrentGraph(false); + // using original graph to get ids + Graph* wholeGraph = globalContext->getFullGraph(); allocNormFactor(); - // this is for testing purposes //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), // [&] (unsigned i) { diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index d57bf85537..466cee6584 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -6,9 +6,23 @@ #define PARALLEL_GEN namespace deepgalois { + +//! debug function: prints out sets of vertices +void print_vertex_set(VertexSet vertex_set) { + unsigned counter = 0; + unsigned n = vertex_set.size(); + galois::gPrint("( "); + for (int i : vertex_set) { + counter++; + if (counter > 16 && counter < n - 16) + continue; + galois::gPrint(i, " "); + } + galois::gPrint(")\n"); +} + +//! helper function to get degree of some vertex given some graph inline unsigned getDegree(Graph* g, index_t v) { - // return g->get_degree(v); - // return std::distance(g->edge_begin(v), g->edge_end(v)); return g->edge_end(v) - g->edge_begin(v); } @@ -58,7 +72,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? SAMPLE_CLIP : avg_deg; - // TODO masked part graph as well to save time later + // TODO masked part graph as well to save time later; right now constructing + // from full part graph // size_t idx = 0; // vertices_.resize(count); @@ -81,19 +96,7 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } -//! 
debug function: prints out sets of vertices -void print_vertex_set(VertexSet vertex_set) { - unsigned counter = 0; - unsigned n = vertex_set.size(); - galois::gPrint("( "); - for (int i : vertex_set) { - counter++; - if (counter > 16 && counter < n - 16) - continue; - galois::gPrint(i, " "); - } - galois::gPrint(")\n"); -} + // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp @@ -292,8 +295,7 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) - masks[v] = 1; + for (auto v : vertices) masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -351,13 +353,12 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, VertexSet Sampler::convertToLID(VertexSet& gidSet) { VertexSet existingLIDs; - + // find local selected vertices, convert to lid for (auto i : gidSet) { if (partGraph->isLocal(i)) { existingLIDs.insert(partGraph->getLID(i)); } } - return existingLIDs; } From a326fb6600fc000500a10cc8aaa6fdab9cf4cb2b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 18:56:30 -0500 Subject: [PATCH 286/660] ran clang-format 10 on deepgalois --- libdeepgalois/include/deepgalois/Context.h | 5 +- .../include/deepgalois/DistContext.h | 20 +- libdeepgalois/include/deepgalois/Net.h | 95 ++++---- libdeepgalois/include/deepgalois/Sampler.h | 59 +++-- libdeepgalois/src/DistContext.cpp | 207 ++++++++++-------- libdeepgalois/src/Net.cpp | 25 ++- libdeepgalois/src/Sampler.cpp | 33 +-- libdeepgalois/src/layers/aggregator.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 17 +- libdeepgalois/src/sampler.cu | 87 +++++--- 10 files changed, 297 insertions(+), 253 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 7faebd7c83..341270201a 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -63,9 +63,8 @@ class Context { Context(); //! initializer for gpu; goes ahead and sets a few things Context(bool use_gpu) - : is_device(use_gpu), - is_selfloop_added(false), d_labels(NULL), d_labels_subg(NULL), - d_feats(NULL), d_feats_subg(NULL) {} + : is_device(use_gpu), is_selfloop_added(false), d_labels(NULL), + d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL) {} ~Context(); size_t read_graph(bool selfloop); diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index be7dd7be45..14b2ae18b7 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -11,27 +11,28 @@ namespace deepgalois { class DistContext { - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D galois::graphs::GluonSubstrate* syncSubstrate; - Graph* lGraph; // laerning graph version + Graph* lGraph; // laerning graph version DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. Single-class label: Nx1, - // multi-class label: NxE + label_t* h_labels; // labels for classification. 
Single-class label: Nx1, + // multi-class label: NxE std::vector h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D + float_t* h_feats; // input features: N x D std::vector h_feats_subg; // input features for subgraph // change regular one to a vector as well - std::vector normFactors; // normalization constant based on graph structure + std::vector + normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph bool usingSingleClass; public: // TODO better constructor - DistContext() : usingSingleClass(true) {}; + DistContext() : usingSingleClass(true){}; ~DistContext(); void saveDistGraph(DGraph* a) { @@ -54,8 +55,7 @@ class DistContext { this->lGraph->constructEdge(idx++, dst, 0); } }, - galois::loopname("lgraphcopy") - ); + galois::loopname("lgraphcopy")); } //! read labels of local nodes only diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index c10c262a02..04f51f317b 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -33,13 +33,13 @@ class Net { unsigned neighbor_sample_size; // neighbor sampling unsigned subgraph_sample_size; // subgraph sampling int num_threads; // number of threads - size_t globalSamples; // number of samples: N + size_t globalSamples; // number of samples: N size_t distNumSamples; // number of samples: N size_t num_classes; // number of vertex classes: E size_t num_conv_layers; // number of convolutional layers size_t num_layers; // total number of layers (conv + output) int num_epochs; // number of epochs - unsigned h1; // hidden layer size + unsigned h1; // hidden layer size float learning_rate; // learning rate float dropout_rate; // dropout rate float weight_decay; // weighti decay for over-fitting @@ -92,8 +92,7 @@ class Net { : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), - learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host this->myID = galois::runtime::getSystemNetworkInterface().ID; @@ -104,10 +103,10 @@ class Net { // TODO use galois print galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -146,11 +145,11 @@ class Net { for (size_t i = globalValBegin; i < globalValEnd; i++) globalValMasks[i] = 1; } else { - globalTrainCount = - graphTopologyContext->read_masks("train", globalSamples, globalTrainBegin, - globalTrainEnd, globalTrainMasks); - globalValCount = graphTopologyContext->read_masks("val", globalSamples, globalValBegin, - globalValEnd, globalValMasks); + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = 
graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } // make sure sampel size isn't greater than what we have to train with @@ -180,7 +179,8 @@ class Net { void init(); //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -194,8 +194,10 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, "Constructing training vertex set induced graph...\n"); - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), + galois::gPrint(header, + "Constructing training vertex set induced graph...\n"); + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, + graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); } @@ -207,12 +209,13 @@ class Net { for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { t_epoch.Start(); -//////////////////////////////////////////////////////////////////////////////// -// Sampling -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint(header, "Generating ", num_subgraphs, " subgraph(s)\n"); + galois::gPrint(header, "Generating ", num_subgraphs, + " subgraph(s)\n"); // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -220,9 +223,9 @@ class Net { // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - sampler->sampleSubgraph(subgraph_sample_size, - *(distContext->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], curEpoch); + sampler->sampleSubgraph( + subgraph_sample_size, *(distContext->getSubgraphPointer(sid)), + &subgraphs_masks[sid * globalSamples], curEpoch); } #endif num_subg_remain = num_subgraphs; @@ -239,9 +242,9 @@ class Net { // choose a subgraph to use num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraphPointer = distContext->getSubgraphPointer(sg_id); - this->subgraphNumVertices = subgraphPointer->size(); + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", // num_edges: ", subgraphPointer->sizeEdges(), "\n"); @@ -254,27 +257,31 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels(this->subgraphNumVertices, - &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); + distContext->constructSubgraphLabels( + this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + 
layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures(this->subgraphNumVertices, - &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data - - //Graph* testing = distContext->getSubgraphPointer(sg_id); - //for (size_t i = 0; i < testing->size(); i++) { - // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) { + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + // Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); // } //} } // end subgraph sample loop -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// // training steps galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); @@ -417,9 +424,9 @@ class Net { } } } else { - globalTestCount = - distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestCount = distContext->read_masks( + dataset, std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, test_masks, dGraph); } #ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); @@ -431,7 +438,7 @@ class Net { // append conv layers std::cout << "\nConstructing layers...\n"; for (size_t i = 0; i < num_conv_layers - 1; i++) { - append_conv_layer(i, true); // conv layers, act=true + append_conv_layer(i, true); // conv layers, act=true } append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false @@ -444,7 +451,7 @@ class Net { append_dense_layer(num_layers - 2); // dense layer } - append_out_layer(num_layers - 1); // output layer + append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients for (size_t i = 0; i < num_layers; i++) { @@ -460,7 +467,7 @@ class Net { layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data // precompute the normalization constant based on graph structure - //context->norm_factor_computing(false); + // context->norm_factor_computing(false); distContext->constructNormFactor(graphTopologyContext); for (size_t i = 0; i < num_conv_layers; i++) layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 1a9fabd9ec..72d5425817 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -20,8 +20,8 @@ class Sampler { //! average degree cut off to a clip int subg_deg; - //VertexList vertices_; - //mask_t* masks_; + // VertexList vertices_; + // mask_t* masks_; //! 
List of training nodes; sampling set std::vector trainingNodes; @@ -43,12 +43,13 @@ class Sampler { // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; - //galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", ne, "\n"); - + // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", + // ne, "\n"); + // note this constructs the full graph's nodes; just trims edges sub.allocateFrom(n, ne); sub.constructNodes(); - + galois::do_all( galois::iterate((size_t)0, n), [&](const auto src) { @@ -58,44 +59,42 @@ class Sampler { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) { - //galois::gPrint(src, " ", dst, "\n"); + // galois::gPrint(src, " ", dst, "\n"); sub.constructEdge(idx++, dst, 0); } } } - } - , + }, galois::loopname("gen_subgraph")); } - -//! determine degree of each vertex in a masked graph (given by masks and g) -template -void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, - std::vector& degrees) { - assert(degrees.size() == n); + //! determine degree of each vertex in a masked graph (given by masks and g) + template + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { #else - for (size_t src = 0; src < n; src++) { + for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - //galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } } } } - } #ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); + , + galois::loopname("update_degrees")); #endif -} + } //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); @@ -117,9 +116,9 @@ void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, //! API function for user-defined selection strategy // TODO how to expose this? 
virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); + VertexList vertices, VertexSet& vertex_set); virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, - unsigned seed); + unsigned seed); // galois::runtime::iterable > // neighbor_sampler(Graph &g, VertexID v); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index ea98e26007..1df20fb96b 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -6,10 +6,11 @@ namespace deepgalois { DistContext::~DistContext() {} // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { - DGraph* dGraph = DistContext::partitionedGraph; +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { + DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading labels from disk...\n"); std::string filename = path + dataset_str + "-labels.txt"; @@ -24,11 +25,14 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str // size of labels should be # local nodes if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); - this->h_labels = new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + this->h_labels = + new label_t[dGraph->size()]; // single-class (one-hot) label for + // each vertex: N x 1 } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E + this->h_labels = new label_t[dGraph->size() * + this->num_classes]; // multi-class label for + // each vertex: N x E } uint32_t foundVertices = 0; @@ -75,7 +79,7 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str // TODO move to reader class size_t DistContext::read_features(std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reading features from disk...\n"); std::string filename = path + dataset_str + ".ft"; @@ -106,7 +110,7 @@ size_t DistContext::read_features(std::string dataset_str) { size_t count = 0; for (size_t i = 0; i < m; i++) { if (dGraph->isLocal(i)) { - //h_feats[count * feat_len] = fullFeats[i]; + // h_feats[count * feat_len] = fullFeats[i]; std::copy(fullFeats + i * DistContext::feat_len, fullFeats + (i + 1) * DistContext::feat_len, &this->h_feats[count * DistContext::feat_len]); @@ -171,7 +175,8 @@ float_t* DistContext::get_in_ptr() { return &h_feats[0]; } void DistContext::initializeSyncSubstrate() { DistContext::syncSubstrate = new galois::graphs::GluonSubstrate( - *DistContext::partitionedGraph, galois::runtime::getSystemNetworkInterface().ID, + *DistContext::partitionedGraph, + galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); } @@ -193,7 +198,6 @@ void DistContext::allocNormFactorSub(int subID) { // TODO clean out? 
} - void DistContext::constructNormFactor(deepgalois::Context* globalContext) { galois::gPrint("Norm factor construction\n"); // using original graph to get ids @@ -201,7 +205,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { allocNormFactor(); // this is for testing purposes - //galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), + // galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), // [&] (unsigned i) { // this->normFactors[i] = 0; // } @@ -210,54 +214,61 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = + std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - for (auto e = partitionedGraph->edge_begin(i); e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; - } else { - this->normFactors[e] = 1.0 / (c_i * c_j); - } + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[e] = 0.0; + } else { + this->normFactors[e] = 1.0 / (c_i * c_j); + } }, galois::loopname("NormCountingEdge")); ); #else - galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), - [&] (unsigned v) { - auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) { - this->normFactors[v] = 0.0; - } else { - this->normFactors[v] = 1.0 / temp; - } - }, - galois::loopname("NormCountingNode")); + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned v) { + auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactors[v] = 0.0; + } else { + this->normFactors[v] = 1.0 / temp; + } + }, + galois::loopname("NormCountingNode")); #endif galois::gPrint("Norm factor construction done\n"); } void DistContext::constructNormFactorSub(int subgraphID) { - galois::gPrint("Sub norm factor construction\n"); - // right now norm factor based on subgraph - // TODO fix this for dist execution + galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution - allocNormFactorSub(subgraphID); + allocNormFactorSub(subgraphID); - Graph& graphToUse = *partitionedSubgraphs[subgraphID]; - graphToUse.degree_counting(); + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); - // TODO using partitioned subgraph rather than whoel graph; i.e. dist setting wrong + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist + // setting wrong #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, graphToUse->size()), [&] (unsigned i) { - //float_t c_i = std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); e++) { + for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); + e++) { const auto j = graphToUse->getEdgeDst(e); float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); @@ -266,86 +277,90 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[e] = 1.0 / (c_i * c_j); } - }, + }, galois::loopname("NormCountingEdge")); ); #else - galois::do_all(galois::iterate((size_t)0, graphToUse.size()), - [&] (unsigned v) { - //auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); - auto degree = graphToUse.get_degree(v); - float_t temp = std::sqrt(float_t(degree)); - if (temp == 0.0) { - this->normFactorsSub[v] = 0.0; - } else { - this->normFactorsSub[v] = 1.0 / temp; - } - //galois::gPrint(this->normFactorsSub[v], "\n"); - }, - galois::loopname("NormCountingNode")); + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned v) { + // auto degree = wholeGraph->get_degree(partitionedGraph->getGID(v)); + auto degree = graphToUse.get_degree(v); + float_t temp = std::sqrt(float_t(degree)); + if (temp == 0.0) { + this->normFactorsSub[v] = 0.0; + } else { + this->normFactorsSub[v] = 1.0 / temp; + } + // galois::gPrint(this->normFactorsSub[v], "\n"); + }, + galois::loopname("NormCountingNode")); #endif galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { - if (DistContext::usingSingleClass) { - DistContext::h_labels_subg.resize(m); - } else { - DistContext::h_labels_subg.resize(m * DistContext::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { if (DistContext::usingSingleClass) { - DistContext::h_labels_subg[count] = h_labels[i]; + DistContext::h_labels_subg.resize(m); } else { - std::copy(DistContext::h_labels + i * DistContext::num_classes, - DistContext::h_labels + (i + 1) * DistContext::num_classes, - &DistContext::h_labels_subg[count * DistContext::num_classes]); + DistContext::h_labels_subg.resize(m * DistContext::num_classes); } - //galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg[count] = h_labels[i]; + } else { + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); + } + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], + // "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } //! generate input features for the subgraph, m is subgraph size, //! 
masks tells which vertices to use void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - DistContext::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); - //for (unsigned a = 0; a < DistContext::feat_len; a++) { - // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { - // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], " "); - // } - //} - //galois::gPrint("\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + size_t count = 0; + // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { - return DistContext::syncSubstrate; + return DistContext::syncSubstrate; }; void DistContext::allocateSubgraphs(int num_subgraphs) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - } + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + } } } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index ce23b2b51d..fbb6323891 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -9,7 +9,8 @@ namespace deepgalois { -void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { +void Net::partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel) { this->dGraph = graph; this->distContext = new deepgalois::DistContext(); this->distContext->saveDistGraph(dGraph); @@ -48,25 +49,25 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla } } } else { - globalTrainCount = this->distContext->read_masks(dataset_str, - "train", this->distNumSamples, globalTrainBegin, globalTrainEnd, - this->distTrainMasks, this->dGraph); - globalValCount = this->distContext->read_masks(dataset_str, - "val", this->distNumSamples, globalValBegin, globalValEnd, + globalTrainCount = this->distContext->read_masks( + dataset_str, "train", this->distNumSamples, globalTrainBegin, + globalTrainEnd, this->distTrainMasks, this->dGraph); + globalValCount = this->distContext->read_masks( + dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D feature_dims[0] = this->distContext->read_features(dataset_str); for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = this->h1; // hidden1 level embedding: 16 + feature_dims[i] = this->h1; // hidden1 level embedding: 16 feature_dims[num_conv_layers] = 
num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E feature_dims[num_conv_layers + 1] = num_classes; } if (this->has_dense) { - // MLP embedding: E + // MLP embedding: E feature_dims[num_layers - 1] = num_classes; } @@ -127,16 +128,18 @@ acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, uint32_t localID = this->dGraph->getLID(i); if (masks == NULL) { - //GALOIS_DIE("subgraphs not implemented for dist yet"); + // GALOIS_DIE("subgraphs not implemented for dist yet"); // subgraph here: TODO - auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; } else { if (masks[localID] == 1) { // get prediction - auto pred = math::argmax(num_classes, &preds[localID * num_classes]); + auto pred = + math::argmax(num_classes, &preds[localID * num_classes]); // check prediction if ((label_t)pred == ground_truth[localID]) accuracy_all += 1.0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 466cee6584..966caaedf3 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -26,7 +26,8 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, + DGraph* dg) { this->count_ = count; // save original graph Sampler::globalGraph = g; @@ -60,7 +61,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); if (masks[dst] == 1) { - //galois::gPrint(src, " ", dst, "\n"); + // galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } } @@ -69,7 +70,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); - Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; // TODO masked part graph as well to save time later; right now constructing @@ -96,8 +97,6 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, DB2.resize(size); } - - // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { @@ -295,7 +294,8 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; + for (auto v : vertices) + masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -362,21 +362,23 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) { +void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, + unsigned seed) { VertexSet sampledSet; // n = 9000 by default - // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, sampledSet); - // do the sampling of vertices from training set + using masked graph + // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, + // sampledSet); do the sampling of vertices from training set + using masked + // graph this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default // sampledSet is a list of *global* ids in the graph // create new vertex set with LIDs for partitioned graph VertexSet sampledLIDs = this->convertToLID(sampledSet); - //VertexSet sampledLIDs; - //galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); - //galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), "\n"); - //for (auto i : this->trainingNodes) { + // VertexSet sampledLIDs; + // galois::gPrint("part graph num edges is ", partGraph->sizeEdges(), "\n"); + // galois::gPrint("global mask num edges is ", globalMaskedGraph->sizeEdges(), + // "\n"); for (auto i : this->trainingNodes) { // sampledLIDs.insert(i); //} @@ -386,11 +388,12 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed) // this graph will contain sampled vertices and induced subgraph for it Graph maskedSG; // TODO use partMaskedGraph once constructed later - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + this->getMaskedGraph( + Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); // remove edges whose destination is not masked this->reindexSubgraph(sampledLIDs, maskedSG, sg); - //galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); + // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } } // namespace deepgalois diff --git a/libdeepgalois/src/layers/aggregator.cpp b/libdeepgalois/src/layers/aggregator.cpp index 4e07ca96cf..ce9d709dbf 100644 --- a/libdeepgalois/src/layers/aggregator.cpp +++ b/libdeepgalois/src/layers/aggregator.cpp @@ -5,7 +5,7 @@ // TODO template arg void deepgalois::update_all(size_t len, Graph& g, const float_t* in, float_t* out, bool norm, float_t* norm_factor) { -// std::cout << "[update_all] graph size: " << n << "\n"; + // std::cout << "[update_all] graph size: " << n << "\n"; size_t n = g.size(); galois::do_all( galois::iterate(size_t(0), n), diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 7acf787bae..5881b617cc 100644 --- 
a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -119,13 +119,14 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? - //deepgalois::_syncVectorSize = z; - //deepgalois::_dataToSync = out_data; - //layer::context->getSyncSubstrate()->sync( + // deepgalois::_syncVectorSize = z; + // deepgalois::_dataToSync = out_data; + // layer::context->getSyncSubstrate()->sync( // "AggSync"); // run relu activation on output if specified - if (act_) math::relu_cpu(x * z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ @@ -164,15 +165,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // sync agg - //deepgalois::_syncVectorSize = z; - //deepgalois::_dataToSync = out_temp; - //layer::context->getSyncSubstrate()->sync( + // deepgalois::_syncVectorSize = z; + // deepgalois::_dataToSync = out_temp; + // layer::context->getSyncSubstrate()->sync( // "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - //layer::syncSub->sync("GradientSync"); + // layer::syncSub->sync("GradientSync"); // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/sampler.cu index cecfa6c9e0..6fb452db4c 100644 --- a/libdeepgalois/src/sampler.cu +++ b/libdeepgalois/src/sampler.cu @@ -11,29 +11,34 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { } // compute the degrees of a masked graph -// n is the size of the original graph -__global__ void get_masked_degrees(index_t n, mask_t *masks, GraphGPU g, index_t* degrees) { +// n is the size of the original graph +__global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, + index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) degrees[src] ++; + if (masks[dst] == 1) + degrees[src]++; } } } } -// Given a graph, remove any edge which has end-point masked, and generate the subgraph -// n is the size of the original graph and the subgraph -// offset was computed by using prefix-sum of the masked degrees -__global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, const index_t* offsets, GraphGPU g, GraphGPU subg) { +// Given a graph, remove any edge which has end-point masked, and generate the +// subgraph n is the size of the original graph and the subgraph offset was +// computed by using prefix-sum of the masked degrees +__global__ void generate_masked_graph_kernel(index_t n, const mask_t* masks, + const index_t* offsets, GraphGPU g, + GraphGPU subg) { CUDA_KERNEL_LOOP(src, n) { - subg.fixEndEdge(src, offsets[src+1]); + subg.fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); - if (masks[dst] == 1) subg.constructEdge(idx++, dst); + if (masks[dst] == 1) + subg.constructEdge(idx++, dst); } } } @@ -41,20 +46,25 @@ __global__ void generate_masked_graph_kernel(index_t n, const mask_t *masks, con // compute the degrees of the subgraph induced by the vertex set // n is the size of the vertex set -// new_ids array maps vertex ID in the original graph to the vertex ID in the subgraph 
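// The sampling kernels in this file share one pattern: count the degrees that
// survive (the mask or the sampled vertex set), prefix-sum them into row offsets,
// then gather the remaining edges into the compacted CSR subgraph.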
-__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, GraphGPU g, index_t* degrees) { +// new_ids array maps vertex ID in the original graph to the vertex ID in the +// subgraph +__global__ void get_new_degrees(index_t n, index_t* vertices, index_t* new_ids, + GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(i, n) { - auto v = vertices[i]; + auto v = vertices[i]; degrees[new_ids[v]] = g.getOutDegree(v); } } -// Given a masked graph, remove the masked vertices, reindex the rest vertices, and generate the subgraph -// offset was computed by using prefix-sum of the new degrees -// n is the size of the old_ids and the sbugraph -__global__ void generate_graph_kernel(index_t n, const index_t* offsets, const index_t* old_ids, const index_t* new_ids, GraphGPU g, GraphGPU subg) { +// Given a masked graph, remove the masked vertices, reindex the rest vertices, +// and generate the subgraph offset was computed by using prefix-sum of the new +// degrees n is the size of the old_ids and the sbugraph +__global__ void generate_graph_kernel(index_t n, const index_t* offsets, + const index_t* old_ids, + const index_t* new_ids, GraphGPU g, + GraphGPU subg) { CUDA_KERNEL_LOOP(i, n) { - subg.fixEndEdge(i, offsets[i+1]); + subg.fixEndEdge(i, offsets[i + 1]); index_t j = 0; auto src = old_ids[i]; for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { @@ -66,15 +76,15 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, const i } } -void Sampler::update_masks(size_t n, index_t* vertices, mask_t *masks) { +void Sampler::update_masks(size_t n, index_t* vertices, mask_t* masks) { set_masks<<>>(n, vertices, masks); } -void Sampler::indexing(size_t n, index_t* vertices, index_t *new_indices) { +void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { index_t vid = 0; for (index_t i = 0; i < n; i++) { - auto v = vertices[i]; - new_indices[v] = vid ++; + auto v = vertices[i]; + new_indices[v] = vid++; } } @@ -87,7 +97,8 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, GraphGPU *subg) { +void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, + GraphGPU* subg) { index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); get_masked_degrees<<>>(n, masks, g, degrees); @@ -102,29 +113,35 @@ void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU *g, Graph } // use a random walk to select vertex subset -void Sampler::select_vertices(size_t n, int m, VertexSet &st) { -} +void Sampler::select_vertices(size_t n, int m, VertexSet& st) {} // n: size of the original graph // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, GraphGPU *g, GraphGPU *sub) { +void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, + GraphGPU* g, GraphGPU* sub) { // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); - index_t *d_vertex_list; - cudaMalloc((void **) &d_vertex_list, nv*sizeof(index_t)); - CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv*sizeof(index_t), cudaMemcpyHostToDevice)); + index_t* d_vertex_list; + cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), + cudaMemcpyHostToDevice)); index_t n = graph->size(); - update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set - GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph(n, masks, g, &masked_sg); // remove edges whose destination is not masked + update_masks(n, d_vertex_list, + masks); // set masks for vertices in the vertex_set + GraphGPU + masked_sg; // size is the same as original graph, but masked dst removed + generate_masked_graph( + n, masks, g, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph - index_t *d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - cudaMalloc((void **) &d_new_ids, n*sizeof(index_t)); + index_t* d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex + // ID โˆˆ [0, nv) + cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); auto new_ids = reindexing_vertices(nv, vertex_set); - CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n*sizeof(index_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), + cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; @@ -142,4 +159,4 @@ void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, sub); } -} +} // namespace deepgalois From 57dacca83622e189c39a58a9869293fe9644e5b6 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 11 May 2020 21:47:21 -0500 Subject: [PATCH 287/660] fix some errors --- libdeepgalois/CMakeLists.txt | 16 ++--- libdeepgalois/include/deepgalois/GraphTypes.h | 12 +++- libdeepgalois/include/deepgalois/Net.h | 69 ++++++++++++------- .../include/deepgalois/layers/layer.h | 18 +++-- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 - libdeepgalois/src/layers/leaky_relu_layer.cpp | 2 - libdeepgalois/src/layers/relu_layer.cpp | 2 - .../src/layers/sigmoid_loss_layer.cpp | 2 - .../src/layers/softmax_loss_layer.cpp | 2 - libgpu/include/graph_gpu.h | 3 +- lonestar/gnn/gcn/gcn.cpp | 53 +------------- lonestar/gnn/include/engine.h | 18 +++-- 12 files changed, 85 insertions(+), 114 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 2f05527318..d591c4927f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -25,10 +25,9 @@ include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -if(NOT ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DCPU_ONLY") -else() +if(ENABLE_HETERO_GALOIS) # hetero path + 
set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D__GALOIS_HET_CUDA__") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -71,7 +70,10 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(NOT ENABLE_HETERO_GALOIS) +if(ENABLE_HETERO_GALOIS) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") + set(sources src/reader.cpp) +else() set(sources src/layers/softmax_loss_layer.cpp src/layers/sigmoid_loss_layer.cpp @@ -91,11 +93,7 @@ if(NOT ENABLE_HETERO_GALOIS) src/node.cpp src/Net.cpp ) -else() - # dummy sources set for dg_cpu for HETERO build - # TODO fix this - set(sources src/reader.cpp) -endif(NOT ENABLE_HETERO_GALOIS) +endif(ENABLE_HETERO_GALOIS) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 3a93565724..c542f42b89 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -1,16 +1,22 @@ #pragma once #include "deepgalois/types.h" -#include "galois/Galois.h" -#include "galois/graphs/NewGeneric.h" #include "deepgalois/lgraph.h" #ifdef __GALOIS_HET_CUDA__ -// TODO reintroduce GPU as necessary here +#include "graph_gpu.h" +#else +#include "galois/Galois.h" +#include "galois/graphs/NewGeneric.h" #endif namespace deepgalois { using edge_iterator = index_t; +#ifdef __GALOIS_HET_CUDA__ +using Graph = CSRGraph; +using GraphGPU = CSRGraph; +#else using DGraph = galois::graphs::DistGraph; using Graph = LearningGraph; +#endif } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 04f51f317b..58433c7c1c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -10,11 +10,13 @@ #include "deepgalois/layers/sigmoid_loss_layer.h" #include "deepgalois/optimizer.h" #include "deepgalois/utils.h" -#include "deepgalois/Sampler.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#ifndef __GALOIS_HET_CUDA__ +#include "deepgalois/Sampler.h" #include "deepgalois/DistContext.h" +#endif namespace deepgalois { @@ -23,7 +25,11 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#ifdef __GALOIS_HET_CUDA__ + unsigned myID = 0; +#else + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; +#endif std::string header = "[" + std::to_string(myID) + "] "; std::string seperator = "\n"; @@ -77,36 +83,38 @@ class Net { //! context holds all of the graph data deepgalois::Context* graphTopologyContext; + +#ifndef __GALOIS_HET_CUDA__ //! 
dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; - DGraph* dGraph; - Sampler* sampler; +#endif public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, - int val_itv) + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, int val_itv) : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host +#ifndef __GALOIS_HET_CUDA__ this->myID = galois::runtime::getSystemNetworkInterface().ID; +#endif this->header = "[" + std::to_string(myID) + "] "; - this->seperator = "\n"; + this->seperator = " "; assert(n_conv > 0); // TODO use galois print - galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + std::cout << header << "Configuration: num_threads " << num_threads + << ", num_conv_layers " << num_conv_layers << ", num_epochs " + << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " + << learning_rate << ", dropout_rate " << dropout_rate + << ", weight_decay " << weight_decay << "\n"; this->num_layers = num_conv_layers + 1; // additional layers to add @@ -152,6 +160,7 @@ class Net { "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } +#ifndef __GALOIS_HET_CUDA__ // make sure sampel size isn't greater than what we have to train with if (subgraph_sample_size > globalTrainCount) { GALOIS_DIE("subgraph size can not be larger than the size of training " @@ -162,6 +171,7 @@ class Net { // used for sampling) this->sampler = new Sampler(); +#endif } //! Default net constructor @@ -178,10 +188,12 @@ class Net { // test_masks(NULL), context(NULL) {} void init(); + +#ifndef __GALOIS_HET_CUDA__ //! 
Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); - +#endif size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } @@ -194,14 +206,15 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; - galois::gPrint(header, - "Constructing training vertex set induced graph...\n"); + std::cout << header << "Constructing training vertex set induced graph...\n"; +#ifndef __GALOIS_HET_CUDA__ sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, graphTopologyContext->getGraphPointer(), distContext->getGraphPointer()); +#endif } - galois::gPrint(header, "Start training...\n"); + std::cout << header << "Start training...\n"; Timer t_epoch; @@ -214,8 +227,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - galois::gPrint(header, "Generating ", num_subgraphs, - " subgraph(s)\n"); + std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -284,7 +296,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, seperator); + std::cout << header << "Epoch " << std::setw(3) << curEpoch << seperator; set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -304,8 +316,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, seperator); + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << seperator; t_epoch.Stop(); @@ -330,8 +342,8 @@ class Net { double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + std::cout << header << "Average training time per epoch: " << avg_train_time + << " ms. Throughput: " << throughput << " epoch/s\n"; } // evaluate, i.e. 
inference or predict @@ -419,14 +431,21 @@ class Net { globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) { - if (dGraph->isLocal(i)) { +#ifndef __GALOIS_HET_CUDA__ + if (dGraph->isLocal(i)) test_masks[dGraph->getLID(i)] = 1; - } +#else + // TODO: Read for GPU +#endif } } else { globalTestCount = distContext->read_masks( dataset, std::string("test"), globalSamples, globalTestBegin, +#ifdef __GALOIS_HET_CUDA__ + globalTestEnd, test_masks, NULL); +#else globalTestEnd, test_masks, dGraph); +#endif } #ifdef __GALOIS_HET_CUDA__ copy_test_masks_to_device(); diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 3a33d54440..45b7bcc8bd 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -11,14 +11,15 @@ #include #include "deepgalois/GraphTypes.h" #include "deepgalois/Context.h" - -#include "deepgalois/DistContext.h" #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" +#ifndef __GALOIS_HET_CUDA__ +#include "deepgalois/DistContext.h" #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" +#endif namespace deepgalois { @@ -37,7 +38,11 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: +#ifdef __GALOIS_HET_CUDA__ + using ContextType = deepgalois::Context; +#else using ContextType = deepgalois::DistContext; +#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -173,15 +178,14 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; -#else +#ifdef __GALOIS_HET_CUDA__ GraphGPU* graph_gpu; -#endif - +#else + Graph* graph_cpu; // Used for synchronization of weight gradients deepgalois::GluonGradients* gradientGraph; galois::graphs::GluonSubstrate* syncSub; +#endif }; //! Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 5881b617cc..d7c29d1cfa 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -10,7 +10,6 @@ float_t* _dataToSync = nullptr; //! sync long unsigned _syncVectorSize = 0; -#ifndef __GALOIS_HET_CUDA__ inline void graph_conv_layer::rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, unsigned seed) { auto init_range = sqrt(6.0 / (dim_x + dim_y)); @@ -181,5 +180,4 @@ acc_t graph_conv_layer::get_weight_decay_loss() { return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); } -#endif // end if CPU_ONLY } // namespace deepgalois diff --git a/libdeepgalois/src/layers/leaky_relu_layer.cpp b/libdeepgalois/src/layers/leaky_relu_layer.cpp index dd4357739f..a230de1090 100644 --- a/libdeepgalois/src/layers/leaky_relu_layer.cpp +++ b/libdeepgalois/src/layers/leaky_relu_layer.cpp @@ -12,7 +12,6 @@ leaky_relu_layer::leaky_relu_layer(unsigned level, float_t eps, dims_t in_dims, name_ = layer_type() + "_" + std::to_string(level); } -#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = ๐‘ฆ[๐‘™โˆ’1] > 0 ? 
๐‘ฆ[๐‘™โˆ’1]) : ๐‘ฆ[๐‘™โˆ’1] * ฮต void leaky_relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -25,6 +24,5 @@ void leaky_relu_layer::back_propagation(const float_t*, const float_t* out_data, float_t* out_grad, float_t* in_grad) { math::d_leaky_relu_cpu(n, epsilon_, out_grad, out_data, in_grad); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/relu_layer.cpp b/libdeepgalois/src/layers/relu_layer.cpp index 03cd0f4652..0576bea642 100644 --- a/libdeepgalois/src/layers/relu_layer.cpp +++ b/libdeepgalois/src/layers/relu_layer.cpp @@ -3,7 +3,6 @@ namespace deepgalois { -#ifdef CPU_ONLY // ๐‘ฆ[๐‘™] = max(0, ๐‘ฆ[๐‘™โˆ’1]) void relu_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -18,6 +17,5 @@ void relu_layer::back_propagation(const float_t*, const float_t* out_data, size_t n = input_dims[0] * input_dims[1]; math::d_relu_cpu(n, out_grad, out_data, in_grad); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index d20f2a769b..3dcb312f08 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -4,7 +4,6 @@ namespace deepgalois { -#ifdef CPU_ONLY sigmoid_loss_layer::sigmoid_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -93,6 +92,5 @@ acc_t sigmoid_loss_layer::get_prediction_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index f3eb3ee969..940fbeb798 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -4,7 +4,6 @@ namespace deepgalois { -#ifdef CPU_ONLY softmax_loss_layer::softmax_loss_layer(unsigned level, std::vector in_dims, std::vector out_dims) @@ -98,6 +97,5 @@ acc_t softmax_loss_layer::get_prediction_loss() { assert(valid_sample_count.reduce() == count_); return total_loss.reduce() / (acc_t)count_; } -#endif } // namespace deepgalois diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 449e38a7b5..4c480bd8fa 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -159,6 +159,7 @@ struct CSRGraph { assert(src <= nnodes); return row_start[src+1]; }; + CUDA_HOSTDEV index_type *row_start_host_ptr() { return row_start; } CUDA_HOSTDEV index_type *row_start_ptr() { return row_start; } CUDA_HOSTDEV const index_type *row_start_ptr() const { return row_start; } CUDA_HOSTDEV index_type *edge_dst_ptr() { return edge_dst; } @@ -172,7 +173,7 @@ struct CSRGraph { assert(dst < nnodes); assert(eid < nedges); edge_dst[eid] = dst; - //if (edge_data) edge_data[eid] = edata; + if (edge_data) edge_data[eid] = edata; } void malloc_index_device(index_type n, index_type *ptr); void set_index(index_type pos, index_type value, index_type *ptr); diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index fabd27667f..c33e7d5574 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -1,61 +1,10 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" -#ifdef GALOIS_USE_DIST -#include "DistributedGraphLoader.h" -#endif const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; -int main(int argc, char** argv) { - 
galois::DistMemSys G; - LonestarGnnStart(argc, argv, name, desc, url); +#include "engine.h" - // Get a partitioned graph first - std::vector dummyVec; - deepgalois::DGraph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - - // initialize network + whole context on CPU - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user - deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, - learning_rate, dropout_rate, weight_decay, - add_selfloop, is_single_class, add_l2norm, add_dense, - neighbor_sample_sz, subgraph_sample_sz, val_interval); - - // initialize distributed context - network.partitionInit(dGraph, dataset, is_single_class); - - // construct layers from distributed context - network.construct_layers(); - network.print_layers_info(); - deepgalois::ResourceManager rm; // tracks peak memory usage - - // the optimizer used to update parameters, - // see optimizer.h for more details - // optimizer *opt = new gradient_descent(); - // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adam(); - galois::StatTimer Ttrain("TrainAndVal"); - Ttrain.start(); - network.train(opt, do_validate); // do training using training samples - Ttrain.stop(); - - if (do_test) { - // test using test samples - galois::gPrint("\n"); - network.read_test_masks(dataset); - galois::StatTimer Ttest("Test"); - Ttest.start(); - acc_t test_loss = 0.0, test_acc = 0.0; - double test_time = network.evaluate("test", test_loss, test_acc); - galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, - " test_time = ", test_time, "\n"); - Ttest.stop(); - } - galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); - return 0; -} diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 7d0691de0f..4820d5c7fc 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -6,19 +6,23 @@ int main(int argc, char** argv) { galois::DistMemSys G; LonestarGnnStart(argc, argv, name, desc, url); - // the neural network to train: loads the entire graph on CPU + // Get a partitioned graph first + std::vector dummyVec; + deepgalois::DGraph* dGraph = + galois::graphs::constructSymmetricGraph(dummyVec); + + // initialize network + whole context on CPU + // read network, features, ground truth, initialize metadata + // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, add_selfloop, is_single_class, add_l2norm, add_dense, neighbor_sample_sz, subgraph_sample_sz, val_interval); - std::vector dummyVec; - deepgalois::Graph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); - network.dist_init(dGraph, dataset); + // initialize distributed context + network.partitionInit(dGraph, dataset, is_single_class); - // read network, features, ground truth, initialize metadata - // default setting for now; can be customized by the user + // construct layers from distributed context network.construct_layers(); network.print_layers_info(); deepgalois::ResourceManager rm; // tracks peak memory usage From e37a4f6b7e98d6ef07742e0aaffc2ea15e0ad515 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 May 2020 23:03:16 -0500 Subject: [PATCH 288/660] fix mkl build --- libdeepgalois/src/DistContext.cpp | 35 +++++++++++++++---------------- 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/src/DistContext.cpp 
b/libdeepgalois/src/DistContext.cpp index 1df20fb96b..a9c604befc 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -214,23 +214,23 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { #ifdef USE_MKL galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), [&] (unsigned i) { - float_t c_i = - std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = + std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - for (auto e = partitionedGraph->edge_begin(i); - e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt( - float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[e] = 0.0; - } else { - this->normFactors[e] = 1.0 / (c_i * c_j); + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } } }, galois::loopname("NormCountingEdge")); - ); #else galois::do_all( galois::iterate((size_t)0, partitionedGraph->size()), @@ -261,15 +261,15 @@ void DistContext::constructNormFactorSub(int subgraphID) { // TODO using partitioned subgraph rather than whoel graph; i.e. dist // setting wrong #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, graphToUse->size()), + galois::do_all(galois::iterate((size_t)0, graphToUse.size()), [&] (unsigned i) { // float_t c_i = // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - for (auto e = graphToUse->edge_begin(i); e != graphToUse->edge_end(i); + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); e++) { - const auto j = graphToUse->getEdgeDst(e); + const auto j = graphToUse.getEdgeDst(e); float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); if (c_i == 0.0 || c_j == 0.0) { @@ -277,9 +277,8 @@ void DistContext::constructNormFactorSub(int subgraphID) { } else { this->normFactorsSub[e] = 1.0 / (c_i * c_j); } - }, - galois::loopname("NormCountingEdge")); - ); + } + }, galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, graphToUse.size()), From fd25e3e71bb5332a75a3077e118f6144ed1af877 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 09:41:34 -0500 Subject: [PATCH 289/660] fix more --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Context.h | 12 ++--- .../include/deepgalois/DistContext.h | 45 +++++-------------- libdeepgalois/include/deepgalois/GraphTypes.h | 1 + libdeepgalois/include/deepgalois/Net.h | 11 ++--- .../include/deepgalois/layers/layer.h | 6 +-- libdeepgalois/src/DistContext.cpp | 23 ++++++++++ libdeepgalois/src/{net.cu => Net.cu} | 11 +++-- 8 files changed, 51 insertions(+), 60 deletions(-) rename libdeepgalois/src/{net.cu => Net.cu} (95%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index d591c4927f..aa4850c8c4 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -60,7 +60,7 @@ if(ENABLE_HETERO_GALOIS) src/context.cu src/lgraph.cu src/node.cu - src/net.cu + src/Net.cu ) cuda_add_library(dg_gpu ${CUDA_SOURCES}) target_link_libraries(dg_gpu galois_gpu -lcudart -lcublas 
-lcusparse -lcurand) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 341270201a..0be03a1972 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -39,8 +39,7 @@ class Context { static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t - curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU GraphGPU graph_gpu; // the input graph, |V| = N std::vector subgraphs_gpu; @@ -50,14 +49,11 @@ class Context { float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { - return cusparse_matdescr_; - } - inline static curandGenerator_t curand_generator() { - return curand_generator_; - } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } #endif Context(); diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 14b2ae18b7..9a3496a9c9 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -3,7 +3,9 @@ /** * Based on common.hpp file of the Caffe deep learning library. */ +#ifndef __GALOIS_HET_CUDA__ #include "galois/graphs/GluonSubstrate.h" +#endif #include "deepgalois/types.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" @@ -13,20 +15,18 @@ namespace deepgalois { class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - galois::graphs::GluonSubstrate* syncSubstrate; - Graph* lGraph; // laerning graph version +#ifndef __GALOIS_HET_CUDA__ + galois::graphs::GluonSubstrate* syncSubstrate; +#endif DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class label: Nx1, // multi-class label: NxE - std::vector h_labels_subg; // labels for subgraph - float_t* h_feats; // input features: N x D - std::vector h_feats_subg; // input features for subgraph - - // change regular one to a vector as well - std::vector - normFactors; // normalization constant based on graph structure + float_t* h_feats; // input features: N x D + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph bool usingSingleClass; @@ -35,29 +35,6 @@ class DistContext { DistContext() : usingSingleClass(true){}; ~DistContext(); - void saveDistGraph(DGraph* a) { - partitionedGraph = a; - - // construct lgraph from underlying lc csr graph - // TODO fix this so i don't have more than 1 copy of graph in memory - this->lGraph = new Graph(); - this->lGraph->allocateFrom(a->size(), a->sizeEdges()); - this->lGraph->constructNodes(); - - galois::do_all( - galois::iterate((size_t)0, a->size()), - [&](const auto src) { - this->lGraph->fixEndEdge(src, *a->edge_end(src)); - index_t idx = *(a->edge_begin(src)); - - for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { - const auto dst = a->getEdgeDst(e); - this->lGraph->constructEdge(idx++, dst, 0); - } - }, - galois::loopname("lgraphcopy")); - } - //! read labels of local nodes only size_t read_labels(bool isSingleClassLabel, std::string dataset_str); //! read features of local nodes only @@ -68,7 +45,6 @@ class DistContext { DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } - Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; float_t* get_feats_ptr() { return h_feats; } float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } @@ -76,7 +52,10 @@ class DistContext { label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); +#ifndef __GALOIS_HET_CUDA__ + void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); +#endif //! allocate the norm factor vector void allocNormFactor(); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index c542f42b89..6829194e26 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -13,6 +13,7 @@ namespace deepgalois { using edge_iterator = index_t; #ifdef __GALOIS_HET_CUDA__ +using DGraph = CSRGraph; using Graph = CSRGraph; using GraphGPU = CSRGraph; #else diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 58433c7c1c..53ffd54960 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -12,10 +12,10 @@ #include "deepgalois/utils.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#include "deepgalois/DistContext.h" #ifndef __GALOIS_HET_CUDA__ #include "deepgalois/Sampler.h" -#include "deepgalois/DistContext.h" #endif namespace deepgalois { @@ -84,10 +84,11 @@ class Net { //! context holds all of the graph data deepgalois::Context* graphTopologyContext; -#ifndef __GALOIS_HET_CUDA__ //! dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; DGraph* dGraph; + +#ifndef __GALOIS_HET_CUDA__ Sampler* sampler; #endif @@ -189,14 +190,10 @@ class Net { void init(); -#ifndef __GALOIS_HET_CUDA__ //! 
Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, - bool isSingleClassLabel); -#endif + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } - void regularize(); // add weight decay void train(optimizer* opt, bool need_validate) { diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 45b7bcc8bd..91b57c7041 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -13,9 +13,9 @@ #include "deepgalois/Context.h" #include "deepgalois/optimizer.h" #include "deepgalois/layers/node.h" +#include "deepgalois/DistContext.h" #ifndef __GALOIS_HET_CUDA__ -#include "deepgalois/DistContext.h" #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" @@ -38,11 +38,7 @@ namespace deepgalois { **/ class layer : public deepgalois::node { public: -#ifdef __GALOIS_HET_CUDA__ - using ContextType = deepgalois::Context; -#else using ContextType = deepgalois::DistContext; -#endif layer(unsigned level, std::vector in_dims, std::vector out_dims) diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index a9c604befc..528ba700cc 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -5,6 +5,29 @@ namespace deepgalois { DistContext::~DistContext() {} +void DistContext::saveDistGraph(DGraph* a) { + partitionedGraph = a; + + // construct lgraph from underlying lc csr graph + // TODO fix this so i don't have more than 1 copy of graph in memory + this->lGraph = new Graph(); + this->lGraph->allocateFrom(a->size(), a->sizeEdges()); + this->lGraph->constructNodes(); + + galois::do_all( + galois::iterate((size_t)0, a->size()), + [&](const auto src) { + this->lGraph->fixEndEdge(src, *a->edge_end(src)); + index_t idx = *(a->edge_begin(src)); + + for (auto e = a->edge_begin(src); e != a->edge_end(src); e++) { + const auto dst = a->getEdgeDst(e); + this->lGraph->constructEdge(idx++, dst, 0); + } + }, + galois::loopname("lgraphcopy")); +} + // TODO move to reader class size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { diff --git a/libdeepgalois/src/net.cu b/libdeepgalois/src/Net.cu similarity index 95% rename from libdeepgalois/src/net.cu rename to libdeepgalois/src/Net.cu index f1bbe97c94..647e8e0738 100644 --- a/libdeepgalois/src/net.cu +++ b/libdeepgalois/src/Net.cu @@ -148,14 +148,13 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { void Net::init() { - copy_masks_device(num_samples, train_masks, d_train_masks); - copy_masks_device(num_samples, val_masks, d_val_masks); - context - ->copy_data_to_device(); // copy labels and input features to the device + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + distContext->copy_data_to_device(); // copy labels and input features to the device } void Net::copy_test_masks_to_device() { - copy_masks_device(num_samples, test_masks, d_test_masks); + copy_masks_device(globalSamples, test_masks, d_test_masks); } // add weight decay @@ -166,7 +165,7 @@ void Net::regularize() { layers[layer_id]->get_grads_device_ptr()); } -void 
Net::normalize() {} +//void Net::normalize() {} acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, float_t* preds, From 5acfdf9a62d95c812ee66273714c4448fc1b328b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 11:01:57 -0500 Subject: [PATCH 290/660] fix DistContext --- libdeepgalois/CMakeLists.txt | 4 +- .../include/deepgalois/DistContext.h | 36 ++++++-- libdeepgalois/include/deepgalois/GraphTypes.h | 3 + libdeepgalois/include/deepgalois/Net.h | 4 + libdeepgalois/include/deepgalois/reader.h | 4 +- libdeepgalois/src/DistContext.cpp | 1 + .../src/{context.cu => DistContext.cu} | 92 ++++++++----------- libdeepgalois/src/math_functions.cu | 2 +- libdeepgalois/src/reader.cpp | 2 +- lonestar/gnn/include/engine.h | 54 ++++++++++- lonestar/gnn/include/lonestargnn.h | 70 ++------------ 11 files changed, 143 insertions(+), 129 deletions(-) rename libdeepgalois/src/{context.cu => DistContext.cu} (67%) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index aa4850c8c4..7548664a9d 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -27,7 +27,7 @@ link_directories(${CMAKE_SOURCE_DIR}/libgalois) if(ENABLE_HETERO_GALOIS) # hetero path - set(CUDA_NVCC_FLAGS "${CUDA_NVCC_FLAGS} -D__GALOIS_HET_CUDA__") + set(CUDA_NVCC_FLAGS "-D__GALOIS_HET_CUDA__ ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -57,7 +57,7 @@ if(ENABLE_HETERO_GALOIS) src/layers/aggregator.cu src/math_functions.cu src/optimizer.cu - src/context.cu + src/DistContext.cu src/lgraph.cu src/node.cu src/Net.cu diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 9a3496a9c9..300bd216cc 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -13,10 +13,18 @@ namespace deepgalois { class DistContext { - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - Graph* lGraph; // laerning graph version -#ifndef __GALOIS_HET_CUDA__ + bool is_device; // is this on device or host + bool is_selfloop_added; // whether selfloop is added to the input graph + bool usingSingleClass; + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // laerning graph version +#ifdef __GALOIS_HET_CUDA__ + label_t* d_labels; // labels on device + label_t* d_labels_subg; // labels for subgraph on device + float_t* d_feats; // input features on device + float_t* d_feats_subg; // input features for subgraph on device +#else galois::graphs::GluonSubstrate* syncSubstrate; #endif DGraph* partitionedGraph; // the input graph, |V| = N @@ -28,17 +36,21 @@ class DistContext { std::vector h_feats_subg; // input features for subgraph std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph - bool usingSingleClass; public: // TODO better constructor - DistContext() : usingSingleClass(true){}; + DistContext(); + DistContext(bool isDevice) : is_device(isDevice) {} ~DistContext(); + size_t read_graph(std::string dataset_str, bool selfloop = false); + //! read labels of local nodes only size_t read_labels(bool isSingleClassLabel, std::string dataset_str); + //! read features of local nodes only size_t read_features(std::string dataset_str); + //! 
read masks of local nodes only size_t read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph); @@ -52,7 +64,17 @@ class DistContext { label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); -#ifndef __GALOIS_HET_CUDA__ +#ifdef __GALOIS_HET_CUDA__ + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE + static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + inline static cublasHandle_t cublas_handle() { return cublas_handle_; } + inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } + inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } + inline static curandGenerator_t curand_generator() { return curand_generator_; } +#else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); #endif diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 6829194e26..4e39a820f9 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -4,7 +4,10 @@ #include "deepgalois/lgraph.h" #ifdef __GALOIS_HET_CUDA__ +#define USE_CSRGRAPH +#ifdef USE_CSRGRAPH #include "graph_gpu.h" +#endif #else #include "galois/Galois.h" #include "galois/graphs/NewGeneric.h" diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 53ffd54960..2264db6690 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -131,6 +131,10 @@ class Net { graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = graphTopologyContext->read_graph(selfloop); +#ifdef __GALOIS_HET_CUDA__ + this->distContext = new deepgalois::DistContext(); + this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); +#endif // get training and validation sets: this is to create the training // subgraph in the sampler diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 1bcda0b4b7..55890d79ae 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,5 +1,5 @@ #pragma once -#include "deepgalois/GraphTypes.h" +#include "deepgalois/lgraph.h" namespace deepgalois { @@ -16,7 +16,7 @@ class Reader { size_t read_features(float_t*& feats, std::string filetype = "bin"); size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks); - void readGraphFromGRFile(Graph* g); + void readGraphFromGRFile(LearningGraph* g); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 528ba700cc..2d6cb5de85 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,6 +3,7 @@ #include "deepgalois/configs.h" namespace deepgalois { +DistContext::DistContext() : usingSingleClass(true) {} DistContext::~DistContext() {} void DistContext::saveDistGraph(DGraph* a) { diff --git a/libdeepgalois/src/context.cu b/libdeepgalois/src/DistContext.cu similarity index 67% rename from libdeepgalois/src/context.cu rename to libdeepgalois/src/DistContext.cu index 05a1b0cd8f..26c5c56d90 100644 --- 
a/libdeepgalois/src/context.cu +++ b/libdeepgalois/src/DistContext.cu @@ -2,7 +2,7 @@ #include #include #include -#include "deepgalois/context.h" +#include "deepgalois/DistContext.h" #include "deepgalois/math_functions.hh" #include "deepgalois/configs.h" @@ -27,26 +27,21 @@ int64_t cluster_seedgen(void) { namespace deepgalois { // computing normalization factor for each vertex -__global__ void norm_factor_computing_node(int n, GraphGPU graph, - float_t* norm_fac) { +__global__ void norm_factor_computing_node(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(i, n) { float_t temp = sqrt(float_t(graph.getOutDegree(i))); - if (temp == 0.0) - norm_fac[i] = 0.0; - else - norm_fac[i] = 1.0 / temp; + if (temp == 0.0) norm_fac[i] = 0.0; + else norm_fac[i] = 1.0 / temp; } } // TODO: make sure self-loop added for each vertex // computing normalization factor for each edge -__global__ void norm_factor_computing_edge(int n, GraphGPU graph, - float_t* norm_fac) { +__global__ void norm_factor_computing_edge(int n, GraphGPU graph, float_t* norm_fac) { CUDA_KERNEL_LOOP(src, n) { assert(src < n); float_t d_src = float_t(graph.getOutDegree(src)); - assert(d_src != - 0.0); // should never be zero since self-loop added for each vertex + assert(d_src != 0.0); // should never be zero since self-loop added for each vertex d_src = 1.0 / sqrt(d_src); auto start = graph.edge_begin(src); index_t end = graph.edge_end(src); @@ -63,12 +58,12 @@ __global__ void norm_factor_computing_edge(int n, GraphGPU graph, } } -cublasHandle_t Context::cublas_handle_ = 0; -cusparseHandle_t Context::cusparse_handle_ = 0; -cusparseMatDescr_t Context::cusparse_matdescr_ = 0; -curandGenerator_t Context::curand_generator_ = 0; +cublasHandle_t DistContext::cublas_handle_ = 0; +cusparseHandle_t DistContext::cusparse_handle_ = 0; +cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; +curandGenerator_t DistContext::curand_generator_ = 0; -Context::Context() : Context(true) { +DistContext::DistContext() : DistContext(true) { CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -82,7 +77,7 @@ Context::Context() : Context(true) { curandSetPseudoRandomGeneratorSeed(curand_generator_, cluster_seedgen())); } -Context::~Context() { +DistContext::~DistContext() { if (cublas_handle_) CUBLAS_CHECK(cublasDestroy(cublas_handle_)); if (cusparse_handle_) @@ -95,38 +90,37 @@ Context::~Context() { CUDA_CHECK(cudaFree(d_labels)); if (d_feats) CUDA_CHECK(cudaFree(d_feats)); - if (norm_factors) - CUDA_CHECK(cudaFree(norm_factors)); } -void Context::allocateSubgraphs(int n_sg) {} +void DistContext::allocateSubgraphs(int n_sg) {} -void Context::gen_subgraph_labels(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} -void Context::gen_subgraph_feats(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) {} -void Context::norm_factor_computing(bool is_subgraph, int subg_id) { +void DistContext::constructNormFactor(deepgalois::Context* globalContext) { + auto n = partitionedGraph->size(); std::cout << "Pre-computing normalization factor (n=" << n << ") ... 
"; if (!is_selfloop_added) { std::cout << "Set -sl=1 to add selfloop\n"; exit(0); } #ifdef USE_CUSPARSE - int nnz = graph_gpu.sizeEdges(); - CUDA_CHECK(cudaMalloc((void**)&norm_factors, nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, norm_factors); + int nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&normFactors[0], nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, &normFactors[0]); norm_factor_computing_edge<<>>( - n, graph_gpu, norm_factors); + n, *partitionedGraph, &normFactors[0]); #else - CUDA_CHECK(cudaMalloc((void**)&norm_factors, n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&(&normFactors[0]), n * sizeof(float_t))); norm_factor_computing_node<<>>( - n, graph_gpu, norm_factors); + n, *partitionedGraph, &normFactors[0]); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } /* -void Context::SetDevice(const int device_id) { +void DistContext::SetDevice(const int device_id) { int current_device; CUDA_CHECK(cudaGetDevice(¤t_device)); if (current_device == device_id) return; @@ -141,7 +135,8 @@ CURAND_RNG_PSEUDO_DEFAULT)); cluster_seedgen())); } */ -size_t Context::read_graph(bool selfloop) { +size_t DistContext::read_graph(std::string dataset, bool selfloop) { + partitionedGraph = new DGraph(); #ifdef USE_CSRGRAPH std::string filename = path + dataset + ".csgr"; GraphGPU g; @@ -150,41 +145,30 @@ size_t Context::read_graph(bool selfloop) { g.add_selfloop(); is_selfloop_added = selfloop; } - g.copy_to_gpu(graph_gpu); + g.copy_to_gpu(*partitionedGraph); #else - graph_gpu.readGraph(dataset); + partitionedGraph->readGraph(dataset); if (selfloop) { - graph_gpu.add_selfloop(); + partitionedGraph->add_selfloop(); is_selfloop_added = selfloop; } - graph_gpu.copy_to_gpu(); + partitionedGraph->copy_to_gpu(); #endif - n = graph_gpu.size(); - return n; + return partitionedGraph->size(); } -void Context::copy_data_to_device() { - if (is_single_class) { +void DistContext::copy_data_to_device() { + auto n = partitionedGraph->size(); + if (usingSingleClass) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); } else { - CUDA_CHECK( - cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); - CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMalloc((void**)&d_labels, n * num_classes * sizeof(label_t))); + CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * num_classes * sizeof(label_t), cudaMemcpyHostToDevice)); } CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); - CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); // print_device_vector(10, d_feats, "d_feats"); } -// void Context::copy_data_to_device() { -// float_malloc_device(n, d_labels); -// float_copy_device(n, h_labels, d_labels); -// float_malloc_device(n*feat_len, d_feats); -// float_copy_device(n*feat_len, &h_feats[0], d_feats); -//} - } // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 8b5ab8100f..80e4f6d394 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -1,5 +1,5 @@ #include "deepgalois/math_functions.hh" 
-#include "deepgalois/context.h" +#include "deepgalois/DistContext.h" #include "gg.h" #include "ggcuda.h" #include "cub/cub.cuh" diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 29f729f3a4..c8de34e448 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -166,7 +166,7 @@ void Reader::progressPrint(unsigned max, unsigned i) { } } -void Reader::readGraphFromGRFile(Graph* g) { +void Reader::readGraphFromGRFile(LearningGraph* g) { std::string filename = path + dataset_str + ".csgr"; std::ifstream ifs; ifs.open(filename); diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 4820d5c7fc..21be590817 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,15 +1,65 @@ #ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" +#include "galois/DistGalois.h" +#include "galois/runtime/Network.h" #endif +#include "deepgalois/Net.h" + +//! initialize lonestargnn benchmark +void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + galois::runtime::setStatFile(statFile); + +#ifndef __GALOIS_HET_CUDA__ + numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU +#endif + +#ifdef GALOIS_USE_DIST + auto& net = galois::runtime::getSystemNetworkInterface(); + if (net.ID == 0) { +#endif + LonestarGnnPrintVersion(llvm::outs()); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); +#ifdef GALOIS_USE_DIST + } +#endif + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("(NULL)", "Hostname", name); +} int main(int argc, char** argv) { +#ifdef GALOIS_USE_DIST galois::DistMemSys G; +#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first std::vector dummyVec; - deepgalois::DGraph* dGraph = - galois::graphs::constructSymmetricGraph(dummyVec); + deepgalois::DGraph* dGraph = NULL; +#ifdef GALOIS_USE_DIST + dGraph = galois::graphs::constructSymmetricGraph(dummyVec); +#endif // initialize network + whole context on CPU // read network, features, ground truth, initialize metadata diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index 21e73cb024..639396c6b5 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -5,15 +5,12 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include "galois/Version.h" -#include "galois/Reduction.h" -#include "galois/ParallelSTL.h" -#include "galois/runtime/Profile.h" +//#include "galois/Reduction.h" +//#include "galois/ParallelSTL.h" +//#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" #include -#include "galois/DistGalois.h" -#include "galois/runtime/Network.h" - namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, cll::desc(""), cll::Required); 
// 'cora', 'citeseer', 'pubmed' @@ -50,62 +47,15 @@ extern llvm::cl::opt numThreads; extern llvm::cl::opt statFile; //! standard global options to the benchmarks -llvm::cl::opt - skipVerify("noverify", - llvm::cl::desc("Skip verification step (default value false)"), - llvm::cl::init(false)); -llvm::cl::opt - numThreads("t", llvm::cl::desc("Number of threads (default value 1)"), - llvm::cl::init(1)); -llvm::cl::opt statFile( - "statFile", - llvm::cl::desc("ouput file to print stats to (default value empty)"), - llvm::cl::init("")); +llvm::cl::opt skipVerify("noverify", + llvm::cl::desc("Skip verification step (default value false)"), llvm::cl::init(false)); +llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default value 1)"), llvm::cl::init(1)); +llvm::cl::opt statFile("statFile", + llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" - << galois::getRevision() << ")\n"; + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() + << " (" << galois::getRevision() << ")\n"; out.flush(); } -//! initialize lonestargnn benchmark -void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, - const char* url) { - llvm::cl::SetVersionPrinter(LonestarGnnPrintVersion); - llvm::cl::ParseCommandLineOptions(argc, argv); - numThreads = galois::setActiveThreads(numThreads); - galois::runtime::setStatFile(statFile); - -#ifdef GALOIS_USE_DIST - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.ID == 0) { -#endif - LonestarGnnPrintVersion(llvm::outs()); - std::cout << "Copyright (C) " << galois::getCopyrightYear() - << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; - if (desc) - std::cout << desc << "\n"; - if (url) - std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" - << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) - cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); -#ifdef GALOIS_USE_DIST - } -#endif - - char name[256]; - gethostname(name, 256); - galois::runtime::reportParam("(NULL)", "Hostname", name); -} - -#include "deepgalois/Net.h" From 9d824604b31f1e1f583024717cf120c91f0a9990 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 13:20:28 -0500 Subject: [PATCH 291/660] remove Context.cpp --- libdeepgalois/CMakeLists.txt | 1 - libdeepgalois/include/deepgalois/Context.h | 82 +++++------------ .../include/deepgalois/DistContext.h | 29 ++++-- libdeepgalois/include/deepgalois/GraphTypes.h | 1 + libdeepgalois/include/deepgalois/Net.h | 1 + libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/src/Context.cpp | 88 ------------------- libdeepgalois/src/DistContext.cu | 8 ++ libdeepgalois/src/lgraph.cpp | 10 +-- libdeepgalois/src/reader.cpp | 32 +++++++ 10 files changed, 92 insertions(+), 162 deletions(-) delete mode 100644 libdeepgalois/src/Context.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 7548664a9d..064f24d0d7 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -84,7 +84,6 @@ else() src/layers/aggregator.cpp src/math_functions.cpp src/optimizer.cpp - src/Context.cpp src/DistContext.cpp src/Sampler.cpp src/reader.cpp diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 0be03a1972..6200540847 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -1,85 +1,49 @@ #pragma once -/** - * Based on common.hpp file of the Caffe deep learning library. - */ - #include #include #include "deepgalois/types.h" #include "deepgalois/reader.h" +#include "deepgalois/configs.h" #include "deepgalois/GraphTypes.h" -#ifdef __GALOIS_HET_CUDA__ -#include "deepgalois/cutils.h" -#endif - namespace deepgalois { class Context { - std::string dataset; bool is_device; // is this on device or host bool is_selfloop_added; // whether selfloop is added to the input graph - - label_t* d_labels; // labels on device - label_t* d_labels_subg; // labels for subgraph on device - float_t* d_feats; // input features on device - float_t* d_feats_subg; // input features for subgraph on device - + std::string dataset; Reader reader; public: -// TODO separate below to public and private -#ifndef __GALOIS_HET_CUDA__ - Graph* graph_cpu; // the input graph, |V| = N - std::vector subgraphs_cpu; - void add_selfloop(Graph& og, Graph& g); - //! 
returns pointer to the graph - Graph* getGraphPointer() { return graph_cpu; } -#else - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE - static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU - - GraphGPU graph_gpu; // the input graph, |V| = N - std::vector subgraphs_gpu; - GraphGPU* getGraphPointer() { return &graph_gpu; } - GraphGPU* getSubgraphPointer(int id) { return subgraphs_gpu[id]; }; - float_t* get_feats_ptr() { return d_feats; } - float_t* get_feats_subg_ptr() { return d_feats_subg; } - label_t* get_labels_ptr() { return d_labels; } - label_t* get_labels_subg_ptr() { return d_labels_subg; } - - inline static cublasHandle_t cublas_handle() { return cublas_handle_; } - inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t curand_generator() { return curand_generator_; } -#endif - - Context(); + GraphCPU* graph_cpu; // the input graph, |V| = N + GraphCPU* getGraphPointer() { return graph_cpu; } + Context() : Context(false) {} //! initializer for gpu; goes ahead and sets a few things - Context(bool use_gpu) - : is_device(use_gpu), is_selfloop_added(false), d_labels(NULL), - d_labels_subg(NULL), d_feats(NULL), d_feats_subg(NULL) {} - ~Context(); - - size_t read_graph(bool selfloop); - - size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, - mask_t* masks) { - return reader.read_masks(mask_type, n, begin, end, masks); - } - + Context(bool use_gpu) : is_device(use_gpu), is_selfloop_added(false) {} + ~Context() {} void set_dataset(std::string dataset_str) { dataset = dataset_str; reader.init(dataset); } + size_t read_masks(std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); + } + size_t read_graph(bool selfloop) { + graph_cpu = new GraphCPU(); + graph_cpu->readGraph(dataset, selfloop); + is_selfloop_added = selfloop; + std::cout << "num_vertices " << graph_cpu->size() + << " num_edges " << graph_cpu->sizeEdges() << "\n"; + return graph_cpu->size(); + } //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting - Graph* getFullGraph(); - - void copy_data_to_device(); // copy labels and input features + GraphCPU* getFullGraph() { + graph_cpu->degree_counting(); // TODO: why is it here? should be in read_graph + return graph_cpu; + } }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 300bd216cc..745c298608 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -1,14 +1,15 @@ #ifndef __DG_DIST_CONTEXT__ #define __DG_DIST_CONTEXT__ -/** - * Based on common.hpp file of the Caffe deep learning library. 
- */ -#ifndef __GALOIS_HET_CUDA__ +#ifdef __GALOIS_HET_CUDA__ +#include "deepgalois/cutils.h" +#else #include "galois/graphs/GluonSubstrate.h" #endif + #include "deepgalois/types.h" #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" +#include "deepgalois/reader.h" namespace deepgalois { @@ -16,6 +17,7 @@ class DistContext { bool is_device; // is this on device or host bool is_selfloop_added; // whether selfloop is added to the input graph bool usingSingleClass; + std::string dataset; size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D Graph* lGraph; // laerning graph version @@ -37,6 +39,8 @@ class DistContext { std::vector normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph + Reader reader; + public: // TODO better constructor DistContext(); @@ -58,13 +62,13 @@ class DistContext { DGraph* getGraphPointer() { return partitionedGraph; } Graph* getLGraphPointer() { return lGraph; } Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; - float_t* get_feats_ptr() { return h_feats; } - float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } - label_t* get_labels_ptr() { return h_labels; } - label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } void initializeSyncSubstrate(); #ifdef __GALOIS_HET_CUDA__ + float_t* get_feats_ptr() { return d_feats; } + float_t* get_feats_subg_ptr() { return d_feats_subg; } + label_t* get_labels_ptr() { return d_labels; } + label_t* get_labels_subg_ptr() { return d_labels_subg; } void copy_data_to_device(); // copy labels and input features static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -77,8 +81,17 @@ class DistContext { #else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); + float_t* get_feats_ptr() { return h_feats; } + float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + label_t* get_labels_ptr() { return h_labels; } + label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } #endif + void set_dataset(std::string dataset_str) { + dataset = dataset_str; + reader.init(dataset); + } + //! 
allocate the norm factor vector void allocNormFactor(); void allocNormFactorSub(int subID); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 4e39a820f9..2ee3f86b93 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -15,6 +15,7 @@ namespace deepgalois { using edge_iterator = index_t; +using GraphCPU = LearningGraph; #ifdef __GALOIS_HET_CUDA__ using DGraph = CSRGraph; using Graph = CSRGraph; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 2264db6690..f6e8516c5e 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -133,6 +133,7 @@ class Net { globalSamples = graphTopologyContext->read_graph(selfloop); #ifdef __GALOIS_HET_CUDA__ this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); #endif diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 40ca6c5a18..e9a185bfac 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -56,7 +56,7 @@ class LearningGraph { void degree_counting(); void constructNodes() {} - void readGraph(std::string dataset); + void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } void allocateFrom(index_t nv, index_t ne) { // printf("Allocating num_vertices %d num_edgesi %d\n", num_vertices_, diff --git a/libdeepgalois/src/Context.cpp b/libdeepgalois/src/Context.cpp deleted file mode 100644 index c9bbe9e706..0000000000 --- a/libdeepgalois/src/Context.cpp +++ /dev/null @@ -1,88 +0,0 @@ -/** - * Based on common.hpp file of the Caffe deep learning library. 
- */ -#include "deepgalois/Context.h" -#include "deepgalois/utils.h" -#include "deepgalois/configs.h" -#include "galois/Galois.h" - -namespace deepgalois { - -Context::Context() : Context(false) {} - -Context::~Context() {} - -size_t Context::read_graph(bool selfloop) { - std::string filename = path + dataset + ".csgr"; - std::string filetype = "gr"; - galois::StatTimer Tread("GraphReadingTime"); - Tread.start(); - if (filetype == "bin") { - graph_cpu->readGraph(dataset); - } else if (filetype == "gr") { - graph_cpu = new Graph(); - std::string filename = path + dataset + ".csgr"; - printf("Reading .gr file: %s\n", filename.c_str()); - if (selfloop) { - galois::gWarn("SELF LOOPS NOT SUPPORTED AT THIS TIME"); - Graph graph_temp; - // galois::graphs::readGraph(graph_temp, filename); - graph_temp.readGraph(dataset); - add_selfloop(graph_temp, *graph_cpu); - is_selfloop_added = selfloop; - //} else galois::graphs::readGraph(*graph_cpu, filename); - } else { - graph_cpu->readGraph(dataset); - galois::gPrint("graph read size ", graph_cpu->size()); - } - // TODO dist version of self loop - } else { - GALOIS_DIE("unknown file format for readgraph"); - } - Tread.stop(); - - auto g = getGraphPointer(); - galois::gPrint("num_vertices ", g->size(), " num_edges ", g->sizeEdges(), - "\n"); - return g->size(); -} - -void Context::add_selfloop(Graph& og, Graph& g) { - // TODO not actually implemented yet - g.allocateFrom(og.size(), og.size() + og.sizeEdges()); - g.constructNodes(); - // for (size_t src = 0; src < og.size(); src++) { - // //g.getData(src) = 1; - // auto begin = og.edge_begin(src); - // auto end = og.edge_end(src); - // g.fixEndEdge(src, end+src+1); - // bool self_inserted = false; - // if (begin == end) { - // new_edge_dst[begin+i] = i; - // continue; - // } - // for (auto e = begin; e != end; e++) { - // auto dst = og.getEdgeDst(e); - // if (!self_inserted) { - // if (dst > src) { - // g.constructEdge(e+src, src, 0); - // g.constructEdge(e+src+1, dst, 0); - // self_inserted = true; - // } else if (e+1 == end) { - // g.constructEdge(e+src+1, src, 0); - // g.constructEdge(e+src, dst, 0); - // self_inserted = true; - // } else g.constructEdge(e+src, dst, 0); - // } else g.constructEdge(e+src+1, dst, 0); - // } - //} -} - -// get current graph, also gets degrees of g -Graph* Context::getFullGraph() { - Graph* g = getGraphPointer(); - g->degree_counting(); - return g; -} - -} // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 26c5c56d90..c43d0020f0 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -92,6 +92,10 @@ DistContext::~DistContext() { CUDA_CHECK(cudaFree(d_feats)); } +size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + return reader.read_masks(mask_type, n, begin, end, masks); +} + void DistContext::allocateSubgraphs(int n_sg) {} void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} @@ -119,6 +123,10 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; } + +void DistContext::constructNormFactorSub(int subgraphID) { +} + /* void DistContext::SetDevice(const int device_id) { int current_device; diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index 572f4e5662..c0c39b4023 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -20,7 +20,8 @@ uint64_t 
LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } -void LearningGraph::readGraph(std::string dataset) { +void LearningGraph::readGraph(std::string dataset, bool selfloop) { + if (selfloop) std::cout << "selfloop not yet implemented\n"; deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } @@ -28,10 +29,9 @@ void LearningGraph::readGraph(std::string dataset) { void LearningGraph::degree_counting() { // if (degrees_ != NULL) return; // degrees_ = new index_t[num_vertices_]; - galois::do_all( - galois::iterate(size_t(0), size_t(num_vertices_)), - [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, - galois::loopname("DegreeCounting")); + galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index c8de34e448..961b852ded 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -255,4 +255,36 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } +/* +void add_selfloop(Graph& og, Graph& g) { + g.allocateFrom(og.size(), og.size() + og.sizeEdges()); + g.constructNodes(); + for (size_t src = 0; src < og.size(); src++) { + //g.getData(src) = 1; + auto begin = og.edge_begin(src); + auto end = og.edge_end(src); + g.fixEndEdge(src, end+src+1); + bool self_inserted = false; + if (begin == end) { + new_edge_dst[begin+i] = i; + continue; + } + for (auto e = begin; e != end; e++) { + auto dst = og.getEdgeDst(e); + if (!self_inserted) { + if (dst > src) { + g.constructEdge(e+src, src, 0); + g.constructEdge(e+src+1, dst, 0); + self_inserted = true; + } else if (e+1 == end) { + g.constructEdge(e+src+1, src, 0); + g.constructEdge(e+src, dst, 0); + self_inserted = true; + } else g.constructEdge(e+src, dst, 0); + } else g.constructEdge(e+src+1, dst, 0); + } + } +} +//*/ + } // namespace deepgalois From 7d6cc3bef777c8395d6f8a01927a3816db1c296b Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 14:16:40 -0500 Subject: [PATCH 292/660] fix gpu compilation --- libdeepgalois/include/deepgalois/Net.h | 21 +++++---------- libdeepgalois/src/DistContext.cpp | 13 ++++----- libdeepgalois/src/DistContext.cu | 11 +++++++- libdeepgalois/src/Net.cpp | 7 +---- libdeepgalois/src/Net.cu | 27 +++++++++++++++++++ libdeepgalois/src/lgraph.cu | 2 +- libdeepgalois/src/math_functions.cu | 37 +++++++++++++------------- lonestar/gnn/include/engine.h | 12 +++++++++ lonestar/gnn/include/lonestargnn.h | 15 ----------- 9 files changed, 81 insertions(+), 64 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index f6e8516c5e..405e4c5cb2 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -110,7 +110,7 @@ class Net { assert(n_conv > 0); - // TODO use galois print + // TODO use galois print: need avoid including Galois.h for GPU std::cout << header << "Configuration: num_threads " << num_threads << ", num_conv_layers " << num_conv_layers << ", num_epochs " << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " @@ -131,11 +131,6 @@ class Net { graphTopologyContext->set_dataset(dataset_str); // read *entire* graph, get num nodes globalSamples = graphTopologyContext->read_graph(selfloop); -#ifdef __GALOIS_HET_CUDA__ - this->distContext = new 
deepgalois::DistContext(); - this->distContext->set_dataset(dataset_str); - this->distNumSamples = this->distContext->read_graph(dataset_str, selfloop); -#endif // get training and validation sets: this is to create the training // subgraph in the sampler @@ -166,18 +161,16 @@ class Net { "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); } -#ifndef __GALOIS_HET_CUDA__ // make sure sampel size isn't greater than what we have to train with - if (subgraph_sample_size > globalTrainCount) { - GALOIS_DIE("subgraph size can not be larger than the size of training " - "set\n"); - } + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = this->h1; // features are read in distcontext, not this context (this context only // used for sampling) - - this->sampler = new Sampler(); -#endif + init(); } //! Default net constructor diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 2d6cb5de85..e3c5efb038 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -30,8 +30,7 @@ void DistContext::saveDistGraph(DGraph* a) { } // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, - std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -49,14 +48,12 @@ size_t DistContext::read_labels(bool isSingleClassLabel, // size of labels should be # local nodes if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); - this->h_labels = - new label_t[dGraph->size()]; // single-class (one-hot) label for - // each vertex: N x 1 + // single-class (one-hot) label for each vertex: N x 1 + this->h_labels = new label_t[dGraph->size()]; } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * - this->num_classes]; // multi-class label for - // each vertex: N x E + this->h_labels = new label_t[dGraph->size() * this->num_classes]; + // multi-class label for each vertex: N x E } uint32_t foundVertices = 0; diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index c43d0020f0..0c7f89d2db 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -92,7 +92,16 @@ DistContext::~DistContext() { CUDA_CHECK(cudaFree(d_feats)); } -size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { +size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { + return reader.read_labels(isSingleClass, h_labels); +} + +size_t DistContext::read_features(std::string dataset_str) { + return reader.read_features(h_feats); +} + +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, + size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { return reader.read_masks(mask_type, n, begin, end, masks); } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index fbb6323891..809642dfd8 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -59,8 +59,7 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, // input feature dimension: D feature_dims[0] = this->distContext->read_features(dataset_str); - for (size_t i = 1; i < num_conv_layers; 
i++) - feature_dims[i] = this->h1; // hidden1 level embedding: 16 + feature_dims[num_conv_layers] = num_classes; // output embedding: E if (this->has_l2norm) { // l2 normalized embedding: E @@ -70,12 +69,9 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, // MLP embedding: E feature_dims[num_layers - 1] = num_classes; } - feature_dims[num_layers] = num_classes; // normalized output embedding: E - layers.resize(num_layers); } -#ifndef __GALOIS_HET_CUDA__ void Net::init() { if (subgraph_sample_size) sampler = new deepgalois::Sampler(); @@ -164,6 +160,5 @@ acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, ground_truth, preds); } -#endif } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 647e8e0738..b170afa35d 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -150,7 +150,34 @@ namespace deepgalois { void Net::init() { copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); copy_masks_device(globalSamples, globalValMasks, d_val_masks); +} + +void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + this->distContext = new deepgalois::DistContext(); + this->distContext->set_dataset(dataset_str); + + // read the graph into CPU memory and copy it to GPU memory + this->distNumSamples = this->distContext->read_graph(dataset_str, is_selfloop); + + // read labels into CPU memory + num_classes = this->distContext->read_labels(isSingleClassLabel, dataset_str); + + // read features into CPU memory + feature_dims[0] = this->distContext->read_features(dataset_str); + + // copy labels and features from CPU memory to GPU memory distContext->copy_data_to_device(); // copy labels and input features to the device + + feature_dims[num_conv_layers] = num_classes; // output embedding: E + if (this->has_l2norm) { + // l2 normalized embedding: E + feature_dims[num_conv_layers + 1] = num_classes; + } + if (this->has_dense) { + // MLP embedding: E + feature_dims[num_layers - 1] = num_classes; + } + feature_dims[num_layers] = num_classes; // normalized output embedding: E } void Net::copy_test_masks_to_device() { diff --git a/libdeepgalois/src/lgraph.cu b/libdeepgalois/src/lgraph.cu index 679a4b6d8a..9e1f2ab29e 100644 --- a/libdeepgalois/src/lgraph.cu +++ b/libdeepgalois/src/lgraph.cu @@ -5,7 +5,7 @@ namespace deepgalois { -void LearningGraph::readGraph(std::string dataset) { +void LearningGraph::readGraph(std::string dataset, bool selfloop) { deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 80e4f6d394..1ea5662d91 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -33,12 +33,12 @@ bool isnan_gpu(int n, const float_t* array) { void gpu_rng_uniform(size_t n, float_t* r) { CURAND_CHECK( - curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); } void rng_uniform_gpu(size_t n, const float_t a, const float_t b, float_t* r) { CURAND_CHECK( - curandGenerateUniform(deepgalois::Context::curand_generator(), r, n)); + curandGenerateUniform(deepgalois::DistContext::curand_generator(), r, n)); const float range = b - a; if (range != float_t(1)) scal_gpu(n, range, r); @@ -48,7 +48,7 @@ void rng_uniform_gpu(size_t n, const float_t a, const float_t b, 
float_t* r) { void gpu_rng_gaussian(const int n, const float_t mu, const float_t sigma, float_t* r) { - CURAND_CHECK(curandGenerateNormal(deepgalois::Context::curand_generator(), r, + CURAND_CHECK(curandGenerateNormal(deepgalois::DistContext::curand_generator(), r, n, mu, sigma)); } @@ -203,7 +203,7 @@ void sgemm_gpu(const CBLAS_TRANSPOSE TransA, const CBLAS_TRANSPOSE TransB, (TransA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; cublasOperation_t cuTransB = (TransB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBLAS_CHECK(cublasSgemm(deepgalois::Context::cublas_handle(), cuTransB, + CUBLAS_CHECK(cublasSgemm(deepgalois::DistContext::cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb, A, lda, &beta, C, N)); } @@ -228,14 +228,14 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << // ", nnz=" << nnz << "\n"; CUSPARSE_CHECK(cusparseScsrmm2( - deepgalois::Context::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, + deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, - deepgalois::Context::cusparse_matdescr(), A_nonzeros, A_idx_ptr, + deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } @@ -253,21 +253,21 @@ A_nonzeros, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F)); CUSPARSE_ORDER_COL)); cusparseDnMatDescr_t C_descr; CUSPARSE_CHECK(cusparseCreateDnMat(&C_descr, M, N, M, C, CUDA_R_32F, CUSPARSE_ORDER_COL)); size_t bufferSize; - CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseSpMM_bufferSize(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, (void*)&alpha, A_descr, B_descr, (void*)&beta, C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, &bufferSize)); cudaDeviceSynchronize(); void* buffer = NULL; if (bufferSize > 0) CUDA_CHECK(cudaMalloc(&buffer, bufferSize)); - CUSPARSE_CHECK(cusparseSpMM(deepgalois::Context::cusparse_handle(), + CUSPARSE_CHECK(cusparseSpMM(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, (const void*)&alpha, A_descr, B_descr, (const void*)&beta, C_descr, CUDA_R_32F, CUSPARSE_COOMM_ALG1, buffer)); cudaDeviceSynchronize(); //transpose C const float one = 1.0; const float zero = 0.0; - CUBLAS_CHECK(cublasSgeam(deepgalois::Context::cublas_handle(), CUBLAS_OP_T, + CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } //*/ @@ -276,29 +276,29 @@ void gemv_gpu(const CBLAS_TRANSPOSE TransA, const int M, const int N, const float beta, float* y) { cublasOperation_t cuTransA = (TransA == CblasNoTrans) ? 
CUBLAS_OP_T : CUBLAS_OP_N; - CUBLAS_CHECK(cublasSgemv(deepgalois::Context::cublas_handle(), cuTransA, N, M, + CUBLAS_CHECK(cublasSgemv(deepgalois::DistContext::cublas_handle(), cuTransA, N, M, &alpha, A, N, x, 1, &beta, y, 1)); } void scal_gpu(const int N, const float alpha, float* X) { CUBLAS_CHECK( - cublasSscal(deepgalois::Context::cublas_handle(), N, &alpha, X, 1)); + cublasSscal(deepgalois::DistContext::cublas_handle(), N, &alpha, X, 1)); } void dot_gpu(const int n, const float* x, const float* y, float* out) { CUBLAS_CHECK( - cublasSdot(deepgalois::Context::cublas_handle(), n, x, 1, y, 1, out)); + cublasSdot(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1, out)); } void asum_gpu(const int n, const float* x, float* y) { - CUBLAS_CHECK(cublasSasum(deepgalois::Context::cublas_handle(), n, x, 1, y)); + CUBLAS_CHECK(cublasSasum(deepgalois::DistContext::cublas_handle(), n, x, 1, y)); } void scale_gpu(const int n, const float alpha, const float* x, float* y) { CUBLAS_CHECK( - cublasScopy(deepgalois::Context::cublas_handle(), n, x, 1, y, 1)); + cublasScopy(deepgalois::DistContext::cublas_handle(), n, x, 1, y, 1)); CUBLAS_CHECK( - cublasSscal(deepgalois::Context::cublas_handle(), n, &alpha, y, 1)); + cublasSscal(deepgalois::DistContext::cublas_handle(), n, &alpha, y, 1)); } __global__ void set_kernel(const int n, const float_t alpha, float_t* y) { @@ -341,7 +341,7 @@ __global__ void axpy_kernel(const int n, const float_t a, const float_t* x, void axpy_gpu(const int n, const float_t a, const float_t* x, float_t* y) { // axpy_kernel<<>>(n, a, x, y); CUBLAS_CHECK( - cublasSaxpy(deepgalois::Context::cublas_handle(), n, &a, x, 1, y, 1)); + cublasSaxpy(deepgalois::DistContext::cublas_handle(), n, &a, x, 1, y, 1)); CudaTest("solving axpy kernel failed"); } @@ -354,8 +354,7 @@ __global__ void l2_norm_kernel(const int n, const float_t* a, float_t* sum) { acc_t l2_norm_gpu(int n, const float_t* x) { float_t sum = 0.0; - CUBLAS_CHECK( - cublasSnrm2(deepgalois::Context::cublas_handle(), n, x, 1, &sum)); + CUBLAS_CHECK(cublasSnrm2(deepgalois::DistContext::cublas_handle(), n, x, 1, &sum)); // float_t *d_sum; // CUDA_CHECK(cudaMalloc((void**)&d_sum, sizeof(float_t)); // CUDA_CHECK(cudaMemcpy(d_sum, &sum, sizeof(acc_t), cudaMemcpyHostToDevice)); diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 21be590817..727cd52f6b 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,10 +1,22 @@ +// Execution engine +#include +#include #ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" #endif +#include "galois/Galois.h" +#include "galois/Version.h" +#include "galois/Timer.h" #include "deepgalois/Net.h" +static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() + << " (" << galois::getRevision() << ")\n"; + out.flush(); +} + //! 
initialize lonestargnn benchmark void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, const char* url) { diff --git a/lonestar/gnn/include/lonestargnn.h b/lonestar/gnn/include/lonestargnn.h index 639396c6b5..8b18e80ae0 100644 --- a/lonestar/gnn/include/lonestargnn.h +++ b/lonestar/gnn/include/lonestargnn.h @@ -1,15 +1,6 @@ #pragma once -#include -#include -#include "galois/Timer.h" -#include "galois/Galois.h" -#include "galois/Version.h" -//#include "galois/Reduction.h" -//#include "galois/ParallelSTL.h" -//#include "galois/runtime/Profile.h" #include "llvm/Support/CommandLine.h" -#include namespace cll = llvm::cl; static cll::opt dataset(cll::Positional, @@ -53,9 +44,3 @@ llvm::cl::optnumThreads("t", llvm::cl::desc("Number of threads (default val llvm::cl::opt statFile("statFile", llvm::cl::desc("ouput file to print stats to (default value empty)"), llvm::cl::init("")); -static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() - << " (" << galois::getRevision() << ")\n"; - out.flush(); -} - From d053ce9d0c0bff60ba479f64da439720d425a6b8 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 15:39:51 -0500 Subject: [PATCH 293/660] gpu fixed --- .../include/deepgalois/DistContext.h | 11 +++-- libdeepgalois/include/deepgalois/Net.h | 42 ++++--------------- libdeepgalois/src/DistContext.cu | 20 +++++---- libdeepgalois/src/Net.cpp | 17 ++++++++ libdeepgalois/src/Net.cu | 15 +++++++ libdeepgalois/src/layers/aggregator.cu | 7 ++-- libdeepgalois/src/math_functions.cu | 15 +++---- lonestar/gnn/include/engine.h | 2 + 8 files changed, 69 insertions(+), 60 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 745c298608..ff28bb607c 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -20,12 +20,14 @@ class DistContext { std::string dataset; size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D - Graph* lGraph; // laerning graph version + Graph* lGraph; // learning graph version #ifdef __GALOIS_HET_CUDA__ label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device float_t* d_feats_subg; // input features for subgraph on device + float_t* d_normFactors; + float_t* d_normFactorsSub; #else galois::graphs::GluonSubstrate* syncSubstrate; #endif @@ -69,6 +71,8 @@ class DistContext { float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } label_t* get_labels_subg_ptr() { return d_labels_subg; } + float_t* get_norm_factors_ptr() { return d_normFactors; } + float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } void copy_data_to_device(); // copy labels and input features static cublasHandle_t cublas_handle_; // used to call cuBLAS static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE @@ -85,6 +89,8 @@ class DistContext { float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } label_t* get_labels_ptr() { return h_labels; } label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + float_t* get_norm_factors_ptr() { return normFactors.data(); } + float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } #endif void set_dataset(std::string dataset_str) { @@ -102,9 +108,6 @@ class DistContext { void constructSubgraphLabels(size_t m, const mask_t* masks); void 
constructSubgraphFeatures(size_t m, const mask_t* masks); - float_t* get_norm_factors_ptr() { return normFactors.data(); } - float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } - //! return label for some node //! NOTE: this is LID, not GID label_t get_label(size_t i) { return h_labels[i]; } diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 405e4c5cb2..f8e601d0fa 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -271,14 +271,12 @@ class Net { // update labels for subgraph distContext->constructSubgraphLabels( this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); - layers[num_layers - 1]->set_labels_ptr( - distContext->get_labels_subg_ptr()); + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph distContext->constructSubgraphFeatures( this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); - layers[0]->set_feats_ptr( - distContext->get_feats_subg_ptr()); // feed input data + layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); // for (size_t i = 0; i < testing->size(); i++) { @@ -419,33 +417,7 @@ class Net { } // read masks of test set - void read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; - if (dataset == "reddit") { - globalTestBegin = 177262; - globalTestCount = 55703; - globalTestEnd = globalTestBegin + globalTestCount; - for (size_t i = globalTestBegin; i < globalTestEnd; i++) { -#ifndef __GALOIS_HET_CUDA__ - if (dGraph->isLocal(i)) - test_masks[dGraph->getLID(i)] = 1; -#else - // TODO: Read for GPU -#endif - } - } else { - globalTestCount = distContext->read_masks( - dataset, std::string("test"), globalSamples, globalTestBegin, -#ifdef __GALOIS_HET_CUDA__ - globalTestEnd, test_masks, NULL); -#else - globalTestEnd, test_masks, dGraph); -#endif - } -#ifdef __GALOIS_HET_CUDA__ - copy_test_masks_to_device(); -#endif - } + void read_test_masks(std::string dataset); void copy_test_masks_to_device(); void construct_layers() { @@ -454,17 +426,14 @@ class Net { for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } - append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false if (has_l2norm) { append_l2norm_layer(num_conv_layers); // l2_norm layer } - if (has_dense) { append_dense_layer(num_layers - 2); // dense layer } - append_out_layer(num_layers - 1); // output layer // allocate memory for intermediate features and gradients @@ -474,7 +443,6 @@ class Net { for (size_t i = 1; i < num_layers; i++) { connect(layers[i - 1], layers[i]); } - for (size_t i = 0; i < num_layers; i++) { layers[i]->malloc_and_init(); } @@ -537,7 +505,11 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); +#ifdef __GALOIS_HET_CUDA__ + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif } // update trainable weights after back-propagation diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 0c7f89d2db..91d39bb9a4 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -93,11 +93,13 @@ DistContext::~DistContext() { } size_t DistContext::read_labels(bool isSingleClass, 
std::string dataset_str) { - return reader.read_labels(isSingleClass, h_labels); + num_classes = reader.read_labels(isSingleClass, h_labels); + return num_classes; } size_t DistContext::read_features(std::string dataset_str) { - return reader.read_features(h_feats); + feat_len = reader.read_features(h_feats); + return feat_len; } size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, @@ -119,15 +121,15 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { exit(0); } #ifdef USE_CUSPARSE - int nnz = partitionedGraph->sizeEdges(); - CUDA_CHECK(cudaMalloc((void**)&normFactors[0], nnz * sizeof(float_t))); - init_const_gpu(nnz, 0.0, &normFactors[0]); + auto nnz = partitionedGraph->sizeEdges(); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, nnz * sizeof(float_t))); + init_const_gpu(nnz, 0.0, d_normFactors); norm_factor_computing_edge<<>>( - n, *partitionedGraph, &normFactors[0]); + n, *partitionedGraph, d_normFactors); #else - CUDA_CHECK(cudaMalloc((void**)&(&normFactors[0]), n * sizeof(float_t))); + CUDA_CHECK(cudaMalloc((void**)&d_normFactors, n * sizeof(float_t))); norm_factor_computing_node<<>>( - n, *partitionedGraph, &normFactors[0]); + n, *partitionedGraph, d_normFactors); #endif CudaTest("solving norm_factor_computing kernel failed"); std::cout << "Done\n"; @@ -176,6 +178,7 @@ size_t DistContext::read_graph(std::string dataset, bool selfloop) { void DistContext::copy_data_to_device() { auto n = partitionedGraph->size(); + std::cout << "Copying labels and features to GPU memory. n = " << n << " ... "; if (usingSingleClass) { CUDA_CHECK(cudaMalloc((void**)&d_labels, n * sizeof(label_t))); CUDA_CHECK(cudaMemcpy(d_labels, h_labels, n * sizeof(label_t), cudaMemcpyHostToDevice)); @@ -186,6 +189,7 @@ void DistContext::copy_data_to_device() { CUDA_CHECK(cudaMalloc((void**)&d_feats, n * feat_len * sizeof(float_t))); CUDA_CHECK(cudaMemcpy(d_feats, &h_feats[0], n * feat_len * sizeof(float_t), cudaMemcpyHostToDevice)); // print_device_vector(10, d_feats, "d_feats"); + std::cout << "Done\n"; } } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 809642dfd8..c9d5f1e7fc 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -86,6 +86,23 @@ void Net::regularize() { layers[layer_id]->get_grads_ptr()); } +void Net::read_test_masks(std::string dataset) { + test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + if (dGraph->isLocal(i)) + test_masks[dGraph->getLID(i)] = 1; + } + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); + } + copy_test_masks_to_device(); +} + /** * * @param begin GLOBAL begin diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index b170afa35d..7ea47dc3ae 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -180,6 +180,21 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla feature_dims[num_layers] = num_classes; // normalized output embedding: E } +void Net::read_test_masks(std::string dataset) { + test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < 
globalTestEnd; i++) + test_masks[i] = 1; + } else { + globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); + } + copy_test_masks_to_device(); +} + void Net::copy_test_masks_to_device() { copy_masks_device(globalSamples, test_masks, d_test_masks); } diff --git a/libdeepgalois/src/layers/aggregator.cu b/libdeepgalois/src/layers/aggregator.cu index 2bfe55ca46..b29e980da3 100644 --- a/libdeepgalois/src/layers/aggregator.cu +++ b/libdeepgalois/src/layers/aggregator.cu @@ -90,13 +90,12 @@ void update_all_csrmm(size_t len, GraphGPU& g, const float_t* in, float_t* out, float* temp; const int* row_start = (const int*)g.row_start_ptr(); const int* edge_dst = (const int*)g.edge_dst_ptr(); - // printf("row_start_ptr: 0x%x\n", row_start); - // printf("edge_dst_ptr: 0x%x\n", edge_dst); + //printf("row_start_ptr: 0x%x\n", row_start); + //printf("edge_dst_ptr: 0x%x\n", edge_dst); // print_device_int_vector(10, row_start, "row_start"); // print_device_int_vector(10, edge_dst, "edge_dst"); float_malloc_device(n * len, temp); // TODO: avoid repetitive allocation - csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, - temp, out); + csrmm_gpu(n, len, n, nnz, 1.0, norm_factor, row_start, edge_dst, in, 0.0, temp, out); float_free_device(temp); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 1ea5662d91..246091903c 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -225,19 +225,16 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const float alpha, const float* A_nonzeros, const int* A_idx_ptr, const int* A_nnz_idx, const float* B, const float beta, float* transpose_C, float* C) { - // std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << - // ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2( - deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, - CUSPARSE_OPERATION_TRANSPOSE, M, N, K, nnz, &alpha, - deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, - A_nnz_idx, B, N, &beta, transpose_C, M)); + //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; const float zero = 0.0; CUBLAS_CHECK(cublasSgeam(deepgalois::DistContext::cublas_handle(), CUBLAS_OP_T, - CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, - M, C, N)); + CUBLAS_OP_T, N, M, &one, transpose_C, M, &zero, NULL, M, C, N)); } /* void csrmm_gpu_new(const int M, const int N, const int K, const int nnz, diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 727cd52f6b..ad63ffdb78 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -63,6 +63,8 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, int main(int argc, char** argv) { #ifdef GALOIS_USE_DIST galois::DistMemSys G; +#else + galois::SharedMemSys G; #endif LonestarGnnStart(argc, argv, name, desc, url); From 84527772827a8b2d3f5bc055038b885c3bb68b52 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 15:47:22 -0500 Subject: [PATCH 294/660] small fix --- 
libdeepgalois/include/deepgalois/Net.h | 2 +- libdeepgalois/src/Net.cpp | 1 - libdeepgalois/src/Net.cu | 7 +++---- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index f8e601d0fa..5b45b03d11 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -418,7 +418,7 @@ class Net { // read masks of test set void read_test_masks(std::string dataset); - void copy_test_masks_to_device(); + //void copy_test_masks_to_device(); void construct_layers() { // append conv layers diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index c9d5f1e7fc..41ce7b2d77 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -100,7 +100,6 @@ void Net::read_test_masks(std::string dataset) { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); } - copy_test_masks_to_device(); } /** diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 7ea47dc3ae..b63e5df3a6 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -192,13 +192,12 @@ void Net::read_test_masks(std::string dataset) { globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); } - copy_test_masks_to_device(); -} - -void Net::copy_test_masks_to_device() { + //copy_test_masks_to_device(); copy_masks_device(globalSamples, test_masks, d_test_masks); } +//void Net::copy_test_masks_to_device() {} + // add weight decay void Net::regularize() { size_t layer_id = 0; From df94c434ba18a4e6e24e84c2d9f3e1d38f29c230 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 12 May 2020 17:04:44 -0500 Subject: [PATCH 295/660] fix sampler interface --- libdeepgalois/include/deepgalois/Net.h | 8 ++- libdeepgalois/include/deepgalois/Sampler.h | 44 ++++++-------- libdeepgalois/src/DistContext.cpp | 4 +- libdeepgalois/src/Sampler.cpp | 64 ++++++++------------ libdeepgalois/src/{sampler.cu => Sampler.cu} | 26 +++----- 5 files changed, 59 insertions(+), 87 deletions(-) rename libdeepgalois/src/{sampler.cu => Sampler.cu} (87%) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 5b45b03d11..e62f68b297 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -230,9 +230,11 @@ class Net { // generate subgraphs #ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { - sampler->sampleSubgraph( - subgraph_sample_size, *(distContext->getSubgraphPointer(sid)), - &subgraphs_masks[sid * globalSamples], curEpoch); + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + &subgraphs_masks[sid * globalSamples], + distContext->getSubgraphPointer(sid)); } #endif num_subg_remain = num_subgraphs; diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 72d5425817..bdff17e6e2 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -12,7 +12,7 @@ class Sampler { typedef int db_t; protected: - int m_; + index_t m; // number of vertice in the frontier size_t count_; //! averaged degree of masked graph @@ -34,8 +34,7 @@ class Sampler { //! 
Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); - //! Given a graph, return a graph with edges to unmasked vertices removed in - //! mg + //! Given a graph, return a graph with edges to unmasked vertices removed in mg template void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { std::vector degrees(n, 0); @@ -70,55 +69,48 @@ class Sampler { //! determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, - std::vector& degrees) { + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { assert(degrees.size() == n); #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate(size_t(0), n), - [&](const auto src) { + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { #else for (size_t src = 0; src < n; src++) { #endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; - } - } + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; } } + } + } #ifdef PARALLEL_GEN - , - galois::loopname("update_degrees")); + , galois::loopname("update_degrees")); #endif } //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size); + void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); public: - Sampler() : m_(DEFAULT_SIZE_FRONTIER) {} + Sampler() : m(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} //! sample a subgraph sg of size n from graph g //! sg is overwritten/is output - void sampleSubgraph(size_t n, Graph& sg, mask_t* masks, unsigned seed = 0); + void generateSubgraph(VertexSet &vertex_set, mask_t* masks, Graph* sg); //! API function for user-defined selection strategy // TODO how to expose this? 
- virtual void selectVertices(size_t nv, size_t n, int m, Graph* g, - VertexList vertices, VertexSet& vertex_set); - virtual void selectVertices(size_t n, int m, VertexSet& vertex_set, - unsigned seed); + virtual void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > // neighbor_sampler(Graph &g, VertexID v); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e3c5efb038..4e6b839179 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -270,7 +270,7 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { } void DistContext::constructNormFactorSub(int subgraphID) { - galois::gPrint("Sub norm factor construction\n"); + //galois::gPrint("Sub norm factor construction\n"); // right now norm factor based on subgraph // TODO fix this for dist execution @@ -316,7 +316,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif - galois::gPrint("Sub norm factor construction done\n"); + //galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 966caaedf3..2eb18942a4 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -26,8 +26,7 @@ inline unsigned getDegree(Graph* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, - DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; // save original graph Sampler::globalGraph = g; @@ -41,7 +40,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, // get degrees of nodes that will be in new graph this->getMaskedDegrees(g->size(), masks, g, degrees); auto offsets = deepgalois::parallel_prefix_sum(degrees); - size_t ne = offsets[g->size()]; + auto ne = offsets[g->size()]; // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { @@ -86,7 +85,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, // helper function for graph saint implementation below void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, size_t size) { + std::vector& DB2, index_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -99,10 +98,8 @@ void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp -void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { - if (n < (size_t)m) { - m = n; - } +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; unsigned myseed = seed; // unsigned myseed = tid; @@ -127,7 +124,7 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; // printf(")\n"); - for (int i = 0; i < m; i++) { + for (index_t i = 0; i < m; i++) { auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; st.insert(v); @@ -139,11 +136,11 @@ void 
Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // calculate prefix sum for IA0 and store in IA2 to compute the address for // each frontier in DB IA2[0] = IA0[0]; - for (int i = 1; i < m; i++) + for (index_t i = 1; i < m; i++) IA2[i] = IA2[i - 1] + IA0[i]; // now fill DB accordingly checkGSDB(DB0, DB1, DB2, IA2[m - 1]); - for (int i = 0; i < m; i++) { + for (index_t i = 0; i < m; i++) { db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; db_t DB_end = IA2[i]; for (auto j = DB_start; j < DB_end; j++) { @@ -154,7 +151,7 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { } db_t choose, neigh_v, newsize, tmp; - for (size_t itr = 0; itr < n - m; itr++) { + for (index_t itr = 0; itr < n - m; itr++) { choose = db_t(-1); while (choose == db_t(-1)) { tmp = rand_r(&myseed) % DB0.size(); @@ -249,24 +246,24 @@ void Sampler::selectVertices(size_t n, int m, VertexSet& st, unsigned seed) { // n: number of vertices in the subgraph; // m: number of vertices in the frontier. // our implementation of GraphSAINT sampling -void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, +void Sampler::selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set) { // galois::gPrint("Select a vertex set of size ", n, " from ", nv, " vertices, // graph size: ", g->size(), "\n"); assert(nv == vertices.size()); - auto frontier_indices = deepgalois::select_k_items( - m, 0, (int)nv); // randomly select m vertices from vertices as frontier + // randomly select m vertices from vertices as frontier + auto frontier_indices = deepgalois::select_k_items((int)m, 0, (int)nv); VertexList frontier(m); - for (int i = 0; i < m; i++) + for (index_t i = 0; i < m; i++) frontier[i] = vertices[frontier_indices[i]]; vertex_set.insert(frontier.begin(), frontier.end()); // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int* degrees = new int[m]; - for (int i = 0; i < m; i++) { - // galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + for (index_t i = 0; i < m; i++) { degrees[i] = (int)getDegree(g, frontier[i]); } //, galois::loopname("compute_degrees")); - for (size_t i = 0; i < n - m; i++) { + for (index_t i = 0; i < n - m; i++) { auto pos = select_one_item((int)m, degrees); auto u = frontier[pos]; auto degree = degrees[pos]; @@ -294,8 +291,7 @@ void Sampler::selectVertices(size_t nv, size_t n, int m, Graph* g, void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); std::fill(masks, masks + n, 0); - for (auto v : vertices) - masks[v] = 1; + for (auto v : vertices) masks[v] = 1; } inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { @@ -309,8 +305,7 @@ inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, - Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); @@ -328,9 +323,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, VertexList old_ids(keptVertices.begin(), 
keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all( - galois::iterate((size_t)0, nv), - [&](const auto i) { + galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -346,8 +339,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, } } #ifdef PARALLEL_GEN - , - galois::loopname("construct_graph")); + , galois::loopname("construct_graph")); #endif } @@ -362,14 +354,9 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } -void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, - unsigned seed) { - VertexSet sampledSet; +void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { // n = 9000 by default - // this->selectVertices(count_, n, m_, globalMaskedGraph, vertices_, - // sampledSet); do the sampling of vertices from training set + using masked - // graph - this->selectVertices(n, m_, sampledSet, seed); // m = 1000 by default + // do the sampling of vertices from training set + using masked graph // sampledSet is a list of *global* ids in the graph // create new vertex set with LIDs for partitioned graph @@ -388,10 +375,9 @@ void Sampler::sampleSubgraph(size_t n, Graph& sg, mask_t* masks, // this graph will contain sampled vertices and induced subgraph for it Graph maskedSG; // TODO use partMaskedGraph once constructed later - this->getMaskedGraph( - Sampler::partGraph->size(), masks, Sampler::partGraph, - maskedSG); // remove edges whose destination is not masked - this->reindexSubgraph(sampledLIDs, maskedSG, sg); + // remove edges whose destination is not masked + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); + this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); } diff --git a/libdeepgalois/src/sampler.cu b/libdeepgalois/src/Sampler.cu similarity index 87% rename from libdeepgalois/src/sampler.cu rename to libdeepgalois/src/Sampler.cu index 6fb452db4c..b3f949ca39 100644 --- a/libdeepgalois/src/sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -97,8 +97,7 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, - GraphGPU* subg) { +void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, GraphGPU* subg) { index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); get_masked_degrees<<>>(n, masks, g, degrees); @@ -112,33 +111,26 @@ void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, CUDA_CHECK(cudaFree(pffsets)); } -// use a random walk to select vertex subset -void Sampler::select_vertices(size_t n, int m, VertexSet& st) {} - // n: size of the original graph // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generate_subgraph(index_t nv, VertexSet vertex_set, mask_t* masks, - GraphGPU* g, GraphGPU* sub) { +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g, GraphGPU* sub) { + auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); - CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), - cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); index_t n = graph->size(); - update_masks(n, d_vertex_list, - masks); // set masks for vertices in the vertex_set - GraphGPU - masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph( - n, masks, g, &masked_sg); // remove edges whose destination is not masked + update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set + GraphGPU masked_sg; // size is the same as original graph, but masked dst removed + generate_masked_graph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph - index_t* d_new_ids; // Given an old vertex ID โˆˆ [0, n), returns a new vertex - // ID โˆˆ [0, nv) + index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); + // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) auto new_ids = reindexing_vertices(nv, vertex_set); CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); From ee98dfbd9b2fec69411a889d2be35aad4cf58533 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 08:13:33 -0500 Subject: [PATCH 296/660] refine Sampler interface --- libdeepgalois/CMakeLists.txt | 4 +- libdeepgalois/include/deepgalois/Net.h | 24 ++-- libdeepgalois/include/deepgalois/Sampler.h | 75 +++------- libdeepgalois/src/RandomWalk.cpp | 153 +++++++++++++++++++++ libdeepgalois/src/Sampler.cpp | 108 ++++++++++----- libdeepgalois/src/Sampler.cu | 53 +++---- 6 files changed, 286 insertions(+), 131 deletions(-) create mode 100644 libdeepgalois/src/RandomWalk.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 064f24d0d7..e66443c22a 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -58,6 +58,7 @@ if(ENABLE_HETERO_GALOIS) src/math_functions.cu src/optimizer.cu src/DistContext.cu + src/Sampler.cu src/lgraph.cu src/node.cu src/Net.cu @@ -72,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") - set(sources src/reader.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp) else() set(sources src/layers/softmax_loss_layer.cpp @@ -85,6 +86,7 @@ else() src/math_functions.cpp src/optimizer.cpp src/DistContext.cpp + src/RandomWalk.cpp src/Sampler.cpp src/reader.cpp src/lgraph.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e62f68b297..e47664804f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -13,10 +13,7 @@ #include "deepgalois/Context.h" #include "deepgalois/GraphTypes.h" #include "deepgalois/DistContext.h" - -#ifndef __GALOIS_HET_CUDA__ #include "deepgalois/Sampler.h" -#endif namespace deepgalois { @@ -87,10 
+84,7 @@ class Net { //! dist context holds graph data of the partitioned graph only deepgalois::DistContext* distContext; DGraph* dGraph; - -#ifndef __GALOIS_HET_CUDA__ Sampler* sampler; -#endif public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, @@ -202,11 +196,13 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; std::cout << header << "Constructing training vertex set induced graph...\n"; -#ifndef __GALOIS_HET_CUDA__ - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, - graphTopologyContext->getGraphPointer(), - distContext->getGraphPointer()); +#ifdef __GALOIS_HET_CUDA__ + auto gg = distContext->getGraphPointer(); +#else + auto gg = graphTopologyContext->getGraphPointer(); #endif + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); } std::cout << header << "Start training...\n"; @@ -228,15 +224,12 @@ class Net { t_subgen.Start(); // generate subgraphs -#ifndef __GALOIS_HET_CUDA__ for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, - &subgraphs_masks[sid * globalSamples], + sampler->generateSubgraph(sampledSet, &subgraphs_masks[sid * globalSamples], distContext->getSubgraphPointer(sid)); } -#endif num_subg_remain = num_subgraphs; t_subgen.Stop(); // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; @@ -373,7 +366,6 @@ class Net { masks = test_masks; } -#ifndef __GALOIS_HET_CUDA__ // switch to the original graph if not training if (subgraph_sample_size && type != "train") { for (size_t i = 0; i < num_layers; i++) @@ -385,7 +377,7 @@ class Net { layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } -#else +#ifdef __GALOIS_HET_CUDA__ if (type == "train") { masks = d_train_masks; } else if (type == "val") { diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index bdff17e6e2..6b24c8fce2 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -35,70 +35,39 @@ class Sampler { void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); //! Given a graph, return a graph with edges to unmasked vertices removed in mg - template - void getMaskedGraph(size_t n, mask_t* masks, GraphTy* g, Graph& sub) { - std::vector degrees(n, 0); - this->getMaskedDegrees(n, masks, g, degrees); - // auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto offsets = deepgalois::prefix_sum(degrees); - size_t ne = offsets[n]; - // galois::gPrint("Generate masked graph: num_vertices=", n, ", num_edges=", - // ne, "\n"); - - // note this constructs the full graph's nodes; just trims edges - sub.allocateFrom(n, ne); - sub.constructNodes(); - - galois::do_all( - galois::iterate((size_t)0, n), - [&](const auto src) { - sub.fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - sub.constructEdge(idx++, dst, 0); - } - } - } - }, - galois::loopname("gen_subgraph")); - } + template + void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! 
determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { - assert(degrees.size() == n); -#ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; - } - } - } - } -#ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); -#endif - } + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); + //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); + //! helper function to get degree of some vertex given some graph + inline unsigned getDegree(Graph* g, index_t v) { + return g->edge_end(v) - g->edge_begin(v); + } + + // helper function for graph saint implementation below + void checkGSDB(std::vector& DB0, std::vector& DB1, + std::vector& DB2, index_t size) { + if (DB0.capacity() < size) { + DB0.reserve(DB0.capacity() * 2); + DB1.reserve(DB1.capacity() * 2); + DB2.reserve(DB2.capacity() * 2); + } + DB0.resize(size); + DB1.resize(size); + DB2.resize(size); + } + public: Sampler() : m(DEFAULT_SIZE_FRONTIER) {} ~Sampler() {} diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp new file mode 100644 index 0000000000..09e76e9fc7 --- /dev/null +++ b/libdeepgalois/src/RandomWalk.cpp @@ -0,0 +1,153 @@ +#include +#include +#include +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" + +namespace deepgalois { + +// implementation from GraphSAINT +// https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp +void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { + if (n < m) m = n; + unsigned myseed = seed; + + // unsigned myseed = tid; + // DBx: Dashboard line x, IAx: Index array line x + std::vector DB0, DB1, DB2, IA0, IA1, IA2, IA3, IA4, nDB0, nDB1, nDB2; + DB0.reserve(subg_deg * m * ETA); + DB1.reserve(subg_deg * m * ETA); + DB2.reserve(subg_deg * m * ETA); + IA0.reserve(n); + IA1.reserve(n); + IA2.reserve(n); + IA3.reserve(n); + IA4.reserve(n); + IA0.resize(m); + IA1.resize(m); + IA2.resize(m); + IA3.resize(m); + + // galois::gPrint("seed ", myseed, " m ", m, "\n"); + // galois::gPrint("trainingNodes size: ", trainingNodes.size(), "\n"); + // printf("( "); + // for (size_t i = 0; i < 10; i++) std::cout << trainingNodes[i] << " "; + // printf(")\n"); + + for (index_t i = 0; i < m; i++) { + auto rand_idx = rand_r(&myseed) % Sampler::trainingNodes.size(); + db_t v = IA3[i] = Sampler::trainingNodes[rand_idx]; + st.insert(v); + IA0[i] = getDegree(Sampler::globalMaskedGraph, v); + IA0[i] = (IA0[i] > SAMPLE_CLIP) ? 
SAMPLE_CLIP : IA0[i]; + IA1[i] = 1; + IA2[i] = 0; + } + // calculate prefix sum for IA0 and store in IA2 to compute the address for + // each frontier in DB + IA2[0] = IA0[0]; + for (index_t i = 1; i < m; i++) + IA2[i] = IA2[i - 1] + IA0[i]; + // now fill DB accordingly + checkGSDB(DB0, DB1, DB2, IA2[m - 1]); + for (index_t i = 0; i < m; i++) { + db_t DB_start = (i == 0) ? 0 : IA2[i - 1]; + db_t DB_end = IA2[i]; + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3[i]; + DB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + DB2[j] = i + 1; + } + } + + db_t choose, neigh_v, newsize, tmp; + for (index_t itr = 0; itr < n - m; itr++) { + choose = db_t(-1); + while (choose == db_t(-1)) { + tmp = rand_r(&myseed) % DB0.size(); + if (size_t(tmp) < DB0.size()) + if (DB0[tmp] != db_t(-1)) + choose = tmp; + } + choose = (DB1[choose] < 0) ? choose : (choose - DB1[choose]); + db_t v = DB0[choose]; + auto degree = getDegree(Sampler::globalMaskedGraph, v); + neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); + if (neigh_v != db_t(-1)) { + neigh_v = Sampler::globalMaskedGraph->getEdgeDst( + Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + st.insert(neigh_v); + IA1[DB2[choose] - 1] = 0; + IA0[DB2[choose] - 1] = 0; + for (auto i = choose; i < choose - DB1[choose]; i++) + DB0[i] = db_t(-1); + newsize = getDegree(Sampler::globalMaskedGraph, neigh_v); + newsize = (newsize > SAMPLE_CLIP) ? SAMPLE_CLIP : newsize; + } else + newsize = 0; + // shrink DB to remove sampled nodes, also shrink IA accordingly + bool cond = DB0.size() + newsize > DB0.capacity(); + if (cond) { + // compute prefix sum for the location in shrinked DB + IA4.resize(IA0.size()); + IA4[0] = IA0[0]; + for (size_t i = 1; i < IA0.size(); i++) + IA4[i] = IA4[i - 1] + IA0[i]; + nDB0.resize(IA4.back()); + nDB1.resize(IA4.back()); + nDB2.resize(IA4.back()); + IA2.assign(IA4.begin(), IA4.end()); + for (size_t i = 0; i < IA0.size(); i++) { + if (IA1[i] == 0) + continue; + db_t DB_start = (i == 0) ? 0 : IA4[i - 1]; + db_t DB_end = IA4[i]; + for (auto j = DB_start; j < DB_end; j++) { + nDB0[j] = IA3[i]; + nDB1[j] = (j == DB_start) ? (j - DB_end) : (j - DB_start); + nDB2[j] = i + 1; + } + } + // remap the index in DB2 by compute prefix of IA1 (new idx in IA) + IA4.resize(IA1.size()); + IA4[0] = IA1[0]; + for (size_t i = 1; i < IA1.size(); i++) + IA4[i] = IA4[i - 1] + IA1[i]; + DB0.assign(nDB0.begin(), nDB0.end()); + DB1.assign(nDB1.begin(), nDB1.end()); + DB2.assign(nDB2.begin(), nDB2.end()); + for (auto i = DB2.begin(); i < DB2.end(); i++) + *i = IA4[*i - 1]; + db_t curr = 0; + for (size_t i = 0; i < IA0.size(); i++) { + if (IA0[i] != 0) { + IA0[curr] = IA0[i]; + IA1[curr] = IA1[i]; + IA2[curr] = IA2[i]; + IA3[curr] = IA3[i]; + curr++; + } + } + IA0.resize(curr); + IA1.resize(curr); + IA2.resize(curr); + IA3.resize(curr); + } + checkGSDB(DB0, DB1, DB2, newsize + DB0.size()); + IA0.push_back(newsize); + IA1.push_back(1); + IA2.push_back(IA2.back() + IA0.back()); + IA3.push_back(neigh_v); + db_t DB_start = (*(IA2.end() - 2)); + db_t DB_end = IA2.back(); + for (auto j = DB_start; j < DB_end; j++) { + DB0[j] = IA3.back(); + DB1[j] = (j == DB_start) ? 
(j - DB_end) : (j - DB_start); + DB2[j] = IA3.size(); + } + } + // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); + // print_vertex_set(st); +} + +} // namespace deepgalois diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 2eb18942a4..aed0768ac0 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -1,8 +1,8 @@ -#include "deepgalois/utils.h" -#include "deepgalois/Sampler.h" -#include "galois/Galois.h" #include #include +#include "galois/Galois.h" +#include "deepgalois/utils.h" +#include "deepgalois/Sampler.h" #define PARALLEL_GEN namespace deepgalois { @@ -21,11 +21,6 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -//! helper function to get degree of some vertex given some graph -inline unsigned getDegree(Graph* g, index_t v) { - return g->edge_end(v) - g->edge_begin(v); -} - void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { this->count_ = count; // save original graph @@ -51,22 +46,19 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks - galois::do_all( - galois::iterate((size_t)0, g->size()), - [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); - } - } + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); } - }, - galois::loopname("gen_subgraph")); + } + } + }, galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); @@ -83,19 +75,7 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGrap //} } -// helper function for graph saint implementation below -void Sampler::checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, index_t size) { - if (DB0.capacity() < size) { - DB0.reserve(DB0.capacity() * 2); - DB1.reserve(DB1.capacity() * 2); - DB2.reserve(DB2.capacity() * 2); - } - DB0.resize(size); - DB1.resize(size); - DB2.resize(size); -} - +/* // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { @@ -239,6 +219,7 @@ void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { // galois::gPrint("Done selection, vertex_set size: ", st.size(), ", set: "); // print_vertex_set(st); } +*/ // API function for user-defined selection strategy // Select n vertices from vertices and put them in vertex_set. 
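(Background on the construction used in the following Sampler.cpp hunks: getMaskedDegrees and getMaskedGraph follow the standard two-pass CSR build, that is, count the surviving edges per vertex, prefix-sum the counts into row offsets, then fill the column array. Below is a self-contained sketch of that pattern using plain STL containers; SimpleCSR and buildMaskedCSR are illustrative names only, not DeepGalois API, and the library versions run both passes in parallel with galois::do_all.)

    #include <cstdint>
    #include <numeric>
    #include <vector>

    using index_t = uint32_t;
    using mask_t  = uint8_t;

    // Minimal CSR holder used only for this sketch (not a DeepGalois type).
    struct SimpleCSR {
      std::vector<index_t> rowptr; // size n+1
      std::vector<index_t> colidx; // size rowptr[n]
    };

    // Keep all n vertices, but keep an edge only if both endpoints are masked.
    SimpleCSR buildMaskedCSR(const SimpleCSR& g, const std::vector<mask_t>& masks) {
      const size_t n = g.rowptr.size() - 1;
      // pass 1: count surviving edges per source vertex
      std::vector<index_t> degrees(n, 0);
      for (size_t src = 0; src < n; src++) {
        if (!masks[src]) continue;
        for (index_t e = g.rowptr[src]; e < g.rowptr[src + 1]; e++)
          if (masks[g.colidx[e]]) degrees[src]++;
      }
      // prefix sum turns per-vertex counts into row offsets
      SimpleCSR sub;
      sub.rowptr.assign(n + 1, 0);
      std::partial_sum(degrees.begin(), degrees.end(), sub.rowptr.begin() + 1);
      sub.colidx.resize(sub.rowptr[n]);
      // pass 2: write the surviving destinations at each row's offset
      for (size_t src = 0; src < n; src++) {
        if (!masks[src]) continue;
        index_t idx = sub.rowptr[src];
        for (index_t e = g.rowptr[src]; e < g.rowptr[src + 1]; e++)
          if (masks[g.colidx[e]]) sub.colidx[idx++] = g.colidx[e];
      }
      return sub;
    }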
@@ -323,7 +304,7 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate((size_t)0, nv), [&](const auto i) { + galois::do_all(galois::iterate(size_t(0), size_t(nv)), [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -354,6 +335,57 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { return existingLIDs; } +template +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { + assert(degrees.size() == n); +#ifdef PARALLEL_GEN + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { +#else + for (size_t src = 0; src < n; src++) { +#endif + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } + } + } +#ifdef PARALLEL_GEN + , galois::loopname("update_degrees")); +#endif +} + +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub) { + std::vector degrees(n, 0); + this->getMaskedDegrees(n, masks, g, degrees); + // auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto offsets = deepgalois::prefix_sum(degrees); + size_t ne = offsets[n]; + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, "\n"); + + // note this constructs the full graph's nodes; just trims edges + sub->allocateFrom(n, ne); + sub->constructNodes(); + + galois::do_all(galois::iterate(size_t(0), size_t(n)), [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } + } + }, galois::loopname("gen_subgraph")); +} + void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { // n = 9000 by default // do the sampling of vertices from training set + using masked graph @@ -376,7 +408,7 @@ void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) Graph maskedSG; // TODO use partMaskedGraph once constructed later // remove edges whose destination is not masked - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, maskedSG); + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, &maskedSG); this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index b3f949ca39..a0528564dd 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -1,6 +1,7 @@ #include #include -#include "deepgalois/sampler.h" +#include "deepgalois/cutils.h" +#include "deepgalois/Sampler.h" namespace deepgalois { @@ -76,10 +77,15 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, } } -void Sampler::update_masks(size_t n, index_t* vertices, mask_t* masks) { - set_masks<<>>(n, vertices, masks); +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; } +/* void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) 
{ index_t vid = 0; for (index_t i = 0; i < n; i++) { @@ -87,8 +93,8 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { new_indices[v] = vid++; } } - -inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { +*/ +inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { VertexList new_ids(n, 0); int vid = 0; for (auto v : vertex_set) { @@ -97,24 +103,26 @@ inline VertexList Sampler::reindexing_vertices(size_t n, VertexSet vertex_set) { return new_ids; } -void Sampler::generate_masked_graph(index_t n, mask_t* masks, GraphGPU* g, GraphGPU* subg) { +template +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { index_t *degrees, *offsets; - CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n); - get_masked_degrees<<>>(n, masks, g, degrees); + CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); + get_masked_degrees<<>>(n, masks, *g, degrees); CUDA_CHECK(cudaFree(degrees)); - CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1)); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); thrust::exclusive_scan(thrust::device, degrees, degrees+n, offsets); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost)); - subg.allocateFrom(n, ne); // TODO: avoid reallocation - generate_masked_graph_kernel<<>>(n, masks, offsets, g, subg); - CUDA_CHECK(cudaFree(pffsets)); + subg->allocateFrom(n, ne); // TODO: avoid reallocation + generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); + CUDA_CHECK(cudaFree(offsets)); } // n: size of the original graph // nv: size of the subgraph; i.e. size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) -void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g, GraphGPU* sub) { +void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { + index_t n = globalGraph->size(); auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); @@ -122,33 +130,32 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* g cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); - index_t n = graph->size(); - update_masks(n, d_vertex_list, masks); // set masks for vertices in the vertex_set + // createMasks: set masks for vertices in the vertex_set + set_masks<<>>(n, d_vertex_list, masks); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - generate_masked_graph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - auto new_ids = reindexing_vertices(nv, vertex_set); - CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), - cudaMemcpyHostToDevice)); + auto new_ids = reindexVertices(nv, vertex_set); + CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; - CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv); + CUDA_CHECK(cudaMalloc((void**)°rees, 
sizeof(index_t)*nv)); get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); CUDA_CHECK(cudaFree(degrees)); - CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1)); + CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); thrust::exclusive_scan(thrust::device, degrees, degrees+nv, offsets); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); // allocate memory for the subgraph - sub.allocateFrom(nv, ne); // avoid reallocation + sub->allocateFrom(nv, ne); // avoid reallocation // generate the subgraph - generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, sub); + generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); } } // namespace deepgalois From c0041ac4dbe85121ca40830067d4f6cf860f1bfe Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 09:07:55 -0500 Subject: [PATCH 297/660] fix types --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/Net.h | 7 +-- libdeepgalois/include/deepgalois/Sampler.h | 10 ++-- libdeepgalois/src/RandomWalk.cpp | 57 +++++++++++++++++++ libdeepgalois/src/Sampler.cpp | 66 ++-------------------- libdeepgalois/src/Sampler.cu | 12 +--- 6 files changed, 71 insertions(+), 83 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index e66443c22a..a022a36655 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -73,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") - set(sources src/reader.cpp src/RandomWalk.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) else() set(sources src/layers/softmax_loss_layer.cpp diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e47664804f..7893e40502 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -196,11 +196,8 @@ class Net { distContext->allocateSubgraphs(num_subgraphs); subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; std::cout << header << "Constructing training vertex set induced graph...\n"; -#ifdef __GALOIS_HET_CUDA__ - auto gg = distContext->getGraphPointer(); -#else - auto gg = graphTopologyContext->getGraphPointer(); -#endif + //auto gg = distContext->getGraphPointer(); + auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); } diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 6b24c8fce2..e823ef67ce 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -27,8 +27,8 @@ class Sampler { std::vector trainingNodes; //! masked original graph; typically to the training set - Graph* globalMaskedGraph; - Graph* globalGraph; + GraphCPU* globalMaskedGraph; + GraphCPU* globalGraph; DGraph* partGraph; //! Reindex a graph to only contain those in the vertex set @@ -39,7 +39,7 @@ class Sampler { void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! determine degree of each vertex in a masked graph (given by masks and g) - template + template void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! 
Set masks bitset with IDs in the vertices VertexSet @@ -51,7 +51,7 @@ class Sampler { VertexSet convertToLID(VertexSet& gidSet); //! helper function to get degree of some vertex given some graph - inline unsigned getDegree(Graph* g, index_t v) { + inline unsigned getDegree(GraphCPU* g, index_t v) { return g->edge_end(v) - g->edge_begin(v); } @@ -86,7 +86,7 @@ class Sampler { //! Given a mask, construct the graph with only those vertices ans ave as the //! masked graph in this class for the sampler. - void initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg); + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index 09e76e9fc7..ed2b3528c1 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -1,11 +1,68 @@ #include #include #include +#include "galois/Galois.h" #include "deepgalois/utils.h" #include "deepgalois/Sampler.h" namespace deepgalois { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg) { + this->count_ = count; + // save original graph + Sampler::globalGraph = g; + // save partitioned graph + Sampler::partGraph = dg; + + // allocate the object for the new masked graph + Sampler::globalMaskedGraph = new GraphCPU(); + + std::vector degrees(g->size(), 0); + // get degrees of nodes that will be in new graph + //this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) degrees[src]++; + } + } + } , galois::loopname("update_degrees")); + + auto offsets = deepgalois::parallel_prefix_sum(degrees); + auto ne = offsets[g->size()]; + + // save ids (of original graph) of training nodes to vector + for (size_t i = 0; i < g->size(); i++) { + if (masks[i] == 1) + Sampler::trainingNodes.push_back(i); + } + + Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); + Sampler::globalMaskedGraph->constructNodes(); + // same as original graph, except keep only edges involved in masks + galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } + } + }, galois::loopname("gen_subgraph")); + + Sampler::globalMaskedGraph->degree_counting(); + Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); + Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; + + // TODO masked part graph as well to save time later; right now constructing + // from full part graph +} + // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index aed0768ac0..b3cc862eca 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -21,60 +21,6 @@ void print_vertex_set(VertexSet vertex_set) { galois::gPrint(")\n"); } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { - this->count_ = count; - // save original graph - Sampler::globalGraph = g; - // save partitioned graph - Sampler::partGraph = dg; - - // allocate the object for the new masked graph - Sampler::globalMaskedGraph = new Graph(); - - std::vector degrees(g->size(), 0); - // get degrees of nodes that will be in new graph - this->getMaskedDegrees(g->size(), masks, g, degrees); - auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto ne = offsets[g->size()]; - - // save ids (of original graph) of training nodes to vector - for (size_t i = 0; i < g->size(); i++) { - if (masks[i] == 1) - Sampler::trainingNodes.push_back(i); - } - - Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); - Sampler::globalMaskedGraph->constructNodes(); - // same as original graph, except keep only edges involved in masks - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); - } - } - } - }, galois::loopname("gen_subgraph")); - - Sampler::globalMaskedGraph->degree_counting(); - Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); - Sampler::subg_deg = (avg_deg > SAMPLE_CLIP) ? 
SAMPLE_CLIP : avg_deg; - - // TODO masked part graph as well to save time later; right now constructing - // from full part graph - - // size_t idx = 0; - // vertices_.resize(count); - // for (size_t i = begin; i < end; i++) { - // if (masks_[i] == 1) - // vertices_[idx++] = i; - //} -} - /* // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp @@ -337,12 +283,11 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { template void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { +//template <> +//void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, std::vector& degrees) { assert(degrees.size() == n); -#ifdef PARALLEL_GEN galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { -#else - for (size_t src = 0; src < n; src++) { -#endif + //for (size_t src = 0; src < n; src++) { if (masks[src] == 1) { for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { const auto dst = g->getEdgeDst(e); @@ -352,10 +297,7 @@ void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector< } } } - } -#ifdef PARALLEL_GEN - , galois::loopname("update_degrees")); -#endif + } , galois::loopname("update_degrees")); } template diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index a0528564dd..69a66d1cfc 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -77,14 +77,6 @@ __global__ void generate_graph_kernel(index_t n, const index_t* offsets, } } -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, Graph* g, DGraph* dg) { - this->count_ = count; - // save original graph - Sampler::globalGraph = g; - // save partitioned graph - Sampler::partGraph = dg; -} - /* void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { index_t vid = 0; @@ -122,7 +114,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s // nv: size of the subgraph; i.e. 
size of vertex_set // masks, graph g and subgraph sub are on the device (GPU) void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { - index_t n = globalGraph->size(); + index_t n = partGraph->size(); auto nv = vertex_set.size(); // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); @@ -133,7 +125,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s // createMasks: set masks for vertices in the vertex_set set_masks<<>>(n, d_vertex_list, masks); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - getMaskedGraph(n, masks, globalGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked // re-index the subgraph index_t* d_new_ids; From 755baf19c50033e12d9677b2e6526f2ecc28b4ae Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 13 May 2020 13:22:08 -0500 Subject: [PATCH 298/660] extend csrgraph --- .../include/deepgalois/DistContext.h | 8 +- libdeepgalois/include/deepgalois/Net.h | 26 +++--- libdeepgalois/include/deepgalois/Sampler.h | 15 +++- libdeepgalois/include/deepgalois/lgraph.h | 10 ++- .../include/deepgalois/math_functions.hh | 9 +- libdeepgalois/src/DistContext.cpp | 87 ++++++++----------- libdeepgalois/src/DistContext.cu | 38 ++++++-- libdeepgalois/src/Net.cpp | 5 +- libdeepgalois/src/Net.cu | 8 +- libdeepgalois/src/RandomWalk.cpp | 13 +-- libdeepgalois/src/Sampler.cpp | 9 -- libdeepgalois/src/Sampler.cu | 39 ++++++--- libdeepgalois/src/math_functions.cu | 16 +++- libdeepgalois/src/reader.cpp | 2 +- libgpu/include/graph_gpu.h | 17 +++- libgpu/src/csr_graph.cu | 11 ++- 16 files changed, 189 insertions(+), 124 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index ff28bb607c..08e101e898 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -116,7 +116,13 @@ class DistContext { float_t* get_in_ptr(); //! allocate memory for subgraphs (don't actually build them) - void allocateSubgraphs(int num_subgraphs); + void allocateSubgraphs(int num_subgraphs, unsigned max_size) { + partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + partitionedSubgraphs[i] = new Graph(); + partitionedSubgraphs[i]->set_max_size(max_size); + } + } }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7893e40502..082949d7fb 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -72,6 +72,7 @@ class Net { mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + mask_t* d_subgraphs_masks; // masks for subgraphs on device; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -164,7 +165,7 @@ class Net { // features are read in distcontext, not this context (this context only // used for sampling) - init(); + if (subgraph_sample_size) sampler = new deepgalois::Sampler(); } //! 
Default net constructor @@ -180,7 +181,7 @@ class Net { // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), // test_masks(NULL), context(NULL) {} - void init(); + void allocateSubgraphsMasks(int num_subgraphs); //! Initializes metadata for the partition void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); @@ -193,8 +194,8 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { - distContext->allocateSubgraphs(num_subgraphs); - subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; //auto gg = distContext->getGraphPointer(); auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem @@ -224,7 +225,7 @@ class Net { for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, &subgraphs_masks[sid * globalSamples], + sampler->generateSubgraph(sampledSet, subgraphs_masks + sid * globalSamples, distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; @@ -245,8 +246,8 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - // galois::gPrint("Subgraph num_vertices: ", subgraphNumVertices, ", - // num_edges: ", subgraphPointer->sizeEdges(), "\n"); + std::cout << "Subgraph num_vertices: " << subgraphNumVertices + << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); } @@ -256,18 +257,17 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr( - distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels( - this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + distContext->constructSubgraphLabels(this->subgraphNumVertices, + subgraphs_masks + sg_id * globalSamples); layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures( - this->subgraphNumVertices, &subgraphs_masks[sg_id * globalSamples]); + distContext->constructSubgraphFeatures(this->subgraphNumVertices, + subgraphs_masks + sg_id * globalSamples); layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index e823ef67ce..b8f19dcca7 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -44,7 +44,7 @@ class Sampler { //! Set masks bitset with IDs in the vertices VertexSet void createMasks(size_t n, VertexSet vertices, mask_t* masks); - inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids @@ -52,7 +52,16 @@ class Sampler { //! 
helper function to get degree of some vertex given some graph inline unsigned getDegree(GraphCPU* g, index_t v) { - return g->edge_end(v) - g->edge_begin(v); + return g->edge_end_host(v) - g->edge_begin_host(v); + } + + inline VertexList reindexVertices(size_t n, VertexSet vertex_set) { + VertexList new_ids(n, 0); + int vid = 0; + for (auto v : vertex_set) { + new_ids[v] = vid++; // reindex + } + return new_ids; } // helper function for graph saint implementation below @@ -78,7 +87,7 @@ class Sampler { //! API function for user-defined selection strategy // TODO how to expose this? - virtual void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e9a185bfac..e0527b2161 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -16,6 +16,7 @@ class LearningGraph { // typedef index_t* IndexList; protected: bool is_device; + index_t max_size_; index_t num_vertices_; index_t num_edges_; IndexList rowptr_; @@ -34,8 +35,9 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) - : is_device(use_gpu), num_vertices_(0), num_edges_(0), vertex_data_(NULL), - edge_data_(NULL) {} + : is_device(use_gpu), max_size_(0), + num_vertices_(0), num_edges_(0), + vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } void init(index_t nv, index_t ne) { @@ -55,6 +57,7 @@ class LearningGraph { void dealloc(); void degree_counting(); void constructNodes() {} + void set_max_size(index_t max) { assert(max>0); max_size_ = max; } void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } @@ -121,6 +124,9 @@ class LearningGraph { index_t* row_start_host_ptr() { return &rowptr_[0]; } index_t* edge_dst_host_ptr() { return &colidx_[0]; } + index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } + index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } + index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } #ifndef __GALOIS_HET_CUDA__ index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 89cc3d5d9c..6c002e2ffb 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -163,10 +163,11 @@ void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks); void float_malloc_device(int n, float_t*& ptr); void float_free_device(float_t*& ptr); void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, - float_t* loss); +void uint8_malloc_device(int n, uint8_t*& ptr); +void uint8_free_device(uint8_t*& ptr); +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); acc_t l2_norm_gpu(int n, const float_t* in); void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* 
in_data, float_t* in_diff, - float_t* out_diff); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, float_t* out_diff); #endif diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4e6b839179..e6c6121d80 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -321,66 +321,55 @@ void DistContext::constructNormFactorSub(int subgraphID) { //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + if (DistContext::usingSingleClass) { + DistContext::h_labels_subg.resize(m); + } else { + DistContext::h_labels_subg.resize(m * DistContext::num_classes); + } + size_t count = 0; + // see which labels to copy over for this subgraph + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { if (DistContext::usingSingleClass) { - DistContext::h_labels_subg.resize(m); + DistContext::h_labels_subg[count] = h_labels[i]; } else { - DistContext::h_labels_subg.resize(m * DistContext::num_classes); - } - - size_t count = 0; - // see which labels to copy over for this subgraph - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - if (DistContext::usingSingleClass) { - DistContext::h_labels_subg[count] = h_labels[i]; - } else { - std::copy( - DistContext::h_labels + i * DistContext::num_classes, - DistContext::h_labels + (i + 1) * DistContext::num_classes, - &DistContext::h_labels_subg[count * DistContext::num_classes]); - } - // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], - // "\n"); - count++; - } + std::copy( + DistContext::h_labels + i * DistContext::num_classes, + DistContext::h_labels + (i + 1) * DistContext::num_classes, + &DistContext::h_labels_subg[count * DistContext::num_classes]); } - GALOIS_ASSERT(count == m); + // galois::gPrint("l ", (float)DistContext::h_labels_subg[count], "\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } //! generate input features for the subgraph, m is subgraph size, //! 
masks tells which vertices to use void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { - size_t count = 0; - // if (h_feats_subg == NULL) h_feats_subg = new float_t[m*feat_len]; - DistContext::h_feats_subg.resize(m * feat_len); - for (size_t i = 0; i < this->partitionedGraph->size(); i++) { - if (masks[i] == 1) { - std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); - // for (unsigned a = 0; a < DistContext::feat_len; a++) { - // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { - // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], - // " "); - // } - //} - // galois::gPrint("\n"); - count++; - } - } - GALOIS_ASSERT(count == m); + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(DistContext::h_feats + i * DistContext::feat_len, + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); + // for (unsigned a = 0; a < DistContext::feat_len; a++) { + // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { + // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], + // " "); + // } + //} + // galois::gPrint("\n"); + count++; + } + } + GALOIS_ASSERT(count == m); } - galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { - return DistContext::syncSubstrate; -}; - -void DistContext::allocateSubgraphs(int num_subgraphs) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - } + return DistContext::syncSubstrate; } } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 91d39bb9a4..7542849cef 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -107,11 +107,40 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, s return reader.read_masks(mask_type, n, begin, end, masks); } -void DistContext::allocateSubgraphs(int n_sg) {} +void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { + size_t labels_size = m; + if (!usingSingleClass) labels_size = m * num_classes; + h_labels_subg.resize(labels_size); + size_t count = 0; + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + if (usingSingleClass) h_labels_subg[count] = h_labels[i]; + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + &h_labels_subg[count * num_classes]); + count++; + } + } + if (d_labels_subg) uint8_free_device(d_labels_subg); + uint8_malloc_device(labels_size, d_labels_subg); + uint8_copy_device(labels_size, &h_labels_subg[0], d_labels_subg); +} -void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) {} +void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + size_t count = 0; + DistContext::h_feats_subg.resize(m * feat_len); + for (size_t i = 0; i < this->partitionedGraph->size(); i++) { + if (masks[i] == 1) { + std::copy(h_feats + i * feat_len, h_feats + (i + 1) * feat_len, &h_feats_subg[count * feat_len]); + count++; + } + } + if (d_feats_subg) float_free_device(d_feats_subg); + float_malloc_device(m * feat_len, d_feats_subg); + float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); +} -void 
DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) {} +void DistContext::constructNormFactorSub(int subgraphID) { +} void DistContext::constructNormFactor(deepgalois::Context* globalContext) { auto n = partitionedGraph->size(); @@ -135,9 +164,6 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { std::cout << "Done\n"; } -void DistContext::constructNormFactorSub(int subgraphID) { -} - /* void DistContext::SetDevice(const int device_id) { int current_device; diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 41ce7b2d77..3bc7762fd5 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -72,9 +72,8 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, feature_dims[num_layers] = num_classes; // normalized output embedding: E } -void Net::init() { - if (subgraph_sample_size) - sampler = new deepgalois::Sampler(); +void Net::allocateSubgraphsMasks(int num_subgraphs) { + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; } // add weight decay diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index b63e5df3a6..7b76f217dd 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -147,12 +147,14 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { -void Net::init() { - copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); - copy_masks_device(globalSamples, globalValMasks, d_val_masks); +void Net::allocateSubgraphsMasks(int num_subgraphs) { + CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); } void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { + copy_masks_device(globalSamples, globalTrainMasks, d_train_masks); + copy_masks_device(globalSamples, globalValMasks, d_val_masks); + this->distContext = new deepgalois::DistContext(); this->distContext->set_dataset(dataset_str); diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index ed2b3528c1..cf2112ca60 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -18,12 +18,13 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph = new GraphCPU(); std::vector degrees(g->size(), 0); + galois::gPrint("graph size: ", g->size(), "\n"); // get degrees of nodes that will be in new graph //this->getMaskedDegrees(g->size(), masks, g, degrees); galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { + const auto dst = g->getEdgeDstHost(e); if (masks[dst] == 1) degrees[src]++; } } @@ -45,8 +46,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); if (masks[src] == 1) { auto idx = offsets[src]; - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { + const auto dst = g->getEdgeDstHost(e); if (masks[dst] == 1) { // galois::gPrint(src, " ", dst, "\n"); Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); @@ -131,8 +132,8 @@ void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { auto degree = 
getDegree(Sampler::globalMaskedGraph, v); neigh_v = (degree != 0) ? rand_r(&myseed) % degree : db_t(-1); if (neigh_v != db_t(-1)) { - neigh_v = Sampler::globalMaskedGraph->getEdgeDst( - Sampler::globalMaskedGraph->edge_begin(v) + neigh_v); + neigh_v = Sampler::globalMaskedGraph->getEdgeDstHost( + Sampler::globalMaskedGraph->edge_begin_host(v) + neigh_v); st.insert(neigh_v); IA1[DB2[choose] - 1] = 0; IA0[DB2[choose] - 1] = 0; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index b3cc862eca..1feb2ecb69 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -221,15 +221,6 @@ void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { for (auto v : vertices) masks[v] = 1; } -inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { - VertexList new_ids(n, 0); - int vid = 0; - for (auto v : vertex_set) { - new_ids[v] = vid++; // reindex - } - return new_ids; -} - // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index 69a66d1cfc..c5db16c5f1 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -5,6 +5,10 @@ namespace deepgalois { +__global__ void clear_masks(index_t n, mask_t* masks) { + CUDA_KERNEL_LOOP(i, n) { masks[i] = 0; } +} + // set the masks of vertices in a given vertex set // n is the size of the vertex set __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { @@ -16,6 +20,8 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { + if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + degrees[src] = 0; if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { auto dst = g.getEdgeDst(e); @@ -23,6 +29,7 @@ __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, degrees[src]++; } } + if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); } } @@ -86,25 +93,19 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { } } */ -inline VertexList Sampler::reindexVertices(size_t n, VertexSet vertex_set) { - VertexList new_ids(n, 0); - int vid = 0; - for (auto v : vertex_set) { - new_ids[v] = vid++; // reindex - } - return new_ids; -} template void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { + std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); get_masked_degrees<<>>(n, masks, *g, degrees); - CUDA_CHECK(cudaFree(degrees)); CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(n+1))); - thrust::exclusive_scan(thrust::device, degrees, degrees+n, offsets); + thrust::exclusive_scan(thrust::device, degrees, degrees+n+1, offsets); + CUDA_CHECK(cudaFree(degrees)); index_t ne; - CUDA_CHECK(cudaMemcpy(&ne, offsets+n, sizeof(index_t), cudaMemcpyDeviceToHost)); + CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); + std::cout << "maskedSG num_edges " << ne << "\n"; subg->allocateFrom(n, ne); // TODO: avoid reallocation generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); CUDA_CHECK(cudaFree(offsets)); @@ -116,38 +117,48 @@ void 
Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { index_t n = partGraph->size(); auto nv = vertex_set.size(); + std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; // convert the vertex_set to a vertex_list and copy it to the device VertexList vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); + clear_masks<<>>(n, masks); // set all 0 + CudaTest("solving clear_masks kernel failed"); // createMasks: set masks for vertices in the vertex_set set_masks<<>>(n, d_vertex_list, masks); + CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked + std::cout << "maskedGraph generated\n"; // re-index the subgraph index_t* d_new_ids; cudaMalloc((void**)&d_new_ids, n * sizeof(index_t)); // Given an old vertex ID โˆˆ [0, n), returns a new vertex ID โˆˆ [0, nv) - auto new_ids = reindexVertices(nv, vertex_set); + auto new_ids = reindexVertices(n, vertex_set); CUDA_CHECK(cudaMemcpy(d_new_ids, &new_ids[0], n * sizeof(index_t), cudaMemcpyHostToDevice)); // generate the offsets for the re-indexed subgraph index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*nv)); get_new_degrees<<>>(nv, d_vertex_list, d_new_ids, masked_sg, degrees); - CUDA_CHECK(cudaFree(degrees)); + CudaTest("solving get_new_degrees kernel failed"); CUDA_CHECK(cudaMalloc((void**)&offsets, sizeof(index_t)*(nv+1))); - thrust::exclusive_scan(thrust::device, degrees, degrees+nv, offsets); + thrust::exclusive_scan(thrust::device, degrees, degrees+nv+1, offsets); + CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); + std::cout << "subgraph num_edges " << ne << "\n"; // allocate memory for the subgraph sub->allocateFrom(nv, ne); // avoid reallocation // generate the subgraph generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); + CudaTest("solving generate_graph kernel failed"); + CUDA_CHECK(cudaFree(offsets)); + std::cout << "Subgraph generated\n"; } } // namespace deepgalois diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 246091903c..9a7c4bc1dd 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -69,15 +69,23 @@ void float_malloc_device(int n, float_t*& ptr) { void float_free_device(float_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr) { - CUDA_CHECK( - cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(float_t), cudaMemcpyHostToDevice)); +} + +void uint8_malloc_device(int n, uint8_t*& ptr) { + CUDA_CHECK(cudaMalloc((void**)&ptr, n * sizeof(uint8_t))); +} + +void uint8_free_device(uint8_t*& ptr) { CUDA_CHECK(cudaFree(ptr)); } + +void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr) { + CUDA_CHECK(cudaMemcpy(d_ptr, h_ptr, n * sizeof(uint8_t), cudaMemcpyHostToDevice)); } void copy_masks_device(int n, mask_t* h_masks, mask_t*& d_masks) { assert(h_masks != NULL); CUDA_CHECK(cudaMalloc((void**)&d_masks, n * sizeof(mask_t))); - CUDA_CHECK( - 
cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(d_masks, h_masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); } __global__ void setup_curand_kernel(const int n, curandState* state) { diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 961b852ded..6e6e00a5d1 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -209,7 +209,7 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); } - printf("num_vertices %lu, num_edges %lu.\n", nv, ne); + printf("num_vertices %lu num_edges %lu\n", nv, ne); g->allocateFrom(nv, ne); auto rowptr = g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4c480bd8fa..4ddf57b950 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -175,15 +175,24 @@ struct CSRGraph { edge_dst[eid] = dst; if (edge_data) edge_data[eid] = edata; } - void malloc_index_device(index_type n, index_type *ptr); + void malloc_index_device(index_type n, index_type*& ptr); + void free_index_device(index_type*& ptr); void set_index(index_type pos, index_type value, index_type *ptr); void allocateFrom(index_type nv, index_type ne) { + bool need_realloc = false; + if (nedges < ne) need_realloc = true; nnodes = nv; nedges = ne; - malloc_index_device(nedges, edge_dst); - malloc_index_device(nnodes+1, row_start); + if (max_size < nnodes) max_size = nnodes; + printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); + if (need_realloc) { + if (edge_dst) free_index_device(edge_dst); + malloc_index_device(nedges, edge_dst); + } + if (!row_start) malloc_index_device(max_size+1, row_start); set_index(0, 0, row_start); } + void set_max_size(index_type max) { assert(max>0); max_size = max; } size_t size() { return size_t(nnodes); } size_t sizeEdges() { return size_t(nedges); } void degree_counting() {} @@ -194,5 +203,7 @@ struct CSRGraph { edge_data_type* edge_data; node_data_type* node_data; bool device_graph; + index_type max_size; // this is for reallocation; avoid re-malloc + bool is_allocated; // this is for reallocation }; #endif diff --git a/libgpu/src/csr_graph.cu b/libgpu/src/csr_graph.cu index e7be218138..19ca915cd0 100644 --- a/libgpu/src/csr_graph.cu +++ b/libgpu/src/csr_graph.cu @@ -21,7 +21,8 @@ unsigned CSRGraph::init() { node_data = NULL; nnodes = nedges = 0; device_graph = false; - + is_allocated = false; + max_size = 0; return 0; } @@ -46,7 +47,11 @@ unsigned CSRGraph::allocOnHost(bool no_edge_data) { return ((no_edge_data || edge_data) && row_start && edge_dst && node_data); } -void CSRGraph::malloc_index_device(index_type n, index_type *ptr) { +void CSRGraph::free_index_device(index_type*& ptr) { + check_cuda(cudaFree(ptr)); +} + +void CSRGraph::malloc_index_device(index_type n, index_type*& ptr) { check_cuda(cudaMalloc((void **) &ptr, n * sizeof(index_type))); } @@ -213,7 +218,7 @@ unsigned CSRGraph::readFromGR(const char file[], bool read_edge_data) { nnodes = numNodes; nedges = numEdges; - printf("nnodes=%d, nedges=%d, sizeEdge=%d.\n", nnodes, nedges, sizeEdgeTy); + printf("nnodes %d nedges %d sizeEdge %d\n", nnodes, nedges, sizeEdgeTy); allocOnHost(!read_edge_data); row_start[0] = 0; From 4010b586f223261a701912255123aefcd421aa0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 14 May 2020 14:44:25 -0500 Subject: [PATCH 299/660] cout->gPrint, host ID headers, general cleanup 
makes dist execution logs easier to parse --- libdeepgalois/include/deepgalois/Context.h | 11 +- libdeepgalois/include/deepgalois/Net.h | 102 +++++++++------ .../include/deepgalois/layers/layer.h | 79 ++++++------ libdeepgalois/src/DistContext.cpp | 122 +++++++++--------- libdeepgalois/src/Net.cpp | 25 ++-- libdeepgalois/src/reader.cpp | 26 ++-- libdeepgalois/src/utils.cpp | 6 +- lonestar/gnn/gcn/gcn.cpp | 3 +- lonestar/gnn/include/engine.h | 44 +++---- 9 files changed, 224 insertions(+), 194 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Context.h b/libdeepgalois/include/deepgalois/Context.h index 6200540847..ba3d1510bf 100644 --- a/libdeepgalois/include/deepgalois/Context.h +++ b/libdeepgalois/include/deepgalois/Context.h @@ -25,23 +25,22 @@ class Context { dataset = dataset_str; reader.init(dataset); } - size_t read_masks(std::string mask_type, size_t n, - size_t& begin, size_t& end, mask_t* masks) { + size_t read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, + mask_t* masks) { return reader.read_masks(mask_type, n, begin, end, masks); } size_t read_graph(bool selfloop) { - graph_cpu = new GraphCPU(); + graph_cpu = new GraphCPU(); graph_cpu->readGraph(dataset, selfloop); is_selfloop_added = selfloop; - std::cout << "num_vertices " << graph_cpu->size() - << " num_edges " << graph_cpu->sizeEdges() << "\n"; return graph_cpu->size(); } //! Checks if subgraph being used, sets currenet graph, then calls degreex //! counting GraphCPU* getFullGraph() { - graph_cpu->degree_counting(); // TODO: why is it here? should be in read_graph + graph_cpu + ->degree_counting(); // TODO: why is it here? should be in read_graph return graph_cpu; } }; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 082949d7fb..3971da74d2 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -71,8 +71,9 @@ class Net { mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device - mask_t* subgraphs_masks; // masks for subgraphs; size of local graph - mask_t* d_subgraphs_masks; // masks for subgraphs on device; size of local graph + mask_t* subgraphs_masks; // masks for subgraphs; size of local graph + mask_t* + d_subgraphs_masks; // masks for subgraphs on device; size of local graph std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -90,7 +91,8 @@ class Net { public: Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, int val_itv) + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv) : is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), @@ -98,7 +100,7 @@ class Net { val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host #ifndef __GALOIS_HET_CUDA__ - this->myID = galois::runtime::getSystemNetworkInterface().ID; + this->myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; this->seperator = " "; @@ -161,11 +163,13 @@ class Net { layers.resize(num_layers); // hidden1 level embedding: 16 - for (size_t i = 1; i < num_conv_layers; i++) feature_dims[i] = 
this->h1; + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; // features are read in distcontext, not this context (this context only // used for sampling) - if (subgraph_sample_size) sampler = new deepgalois::Sampler(); + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); } //! Default net constructor @@ -183,8 +187,9 @@ class Net { void allocateSubgraphsMasks(int num_subgraphs); - //! Initializes metadata for the partition - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + //! Initializes metadata for the partition: loads data, labels, etc + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay @@ -196,14 +201,16 @@ class Net { if (subgraph_sample_size) { distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); - std::cout << header << "Constructing training vertex set induced graph...\n"; - //auto gg = distContext->getGraphPointer(); - auto gg = graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); } - std::cout << header << "Start training...\n"; + galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -216,7 +223,8 @@ class Net { //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { if (num_subg_remain == 0) { - std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; // TODO stat timer instead of this timer Timer t_subgen; t_subgen.Start(); @@ -224,8 +232,10 @@ class Net { // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; - sampler->selectVertices(subgraph_sample_size, sampledSet, curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, subgraphs_masks + sid * globalSamples, + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; @@ -246,7 +256,7 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - std::cout << "Subgraph num_vertices: " << subgraphNumVertices + std::cout << "Subgraph num_vertices: " << subgraphNumVertices << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); @@ -257,18 +267,21 @@ class Net { distContext->constructNormFactorSub(sg_id); for (size_t i = 0; i < num_conv_layers; i++) { layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_subg_ptr()); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); } // update labels for subgraph - distContext->constructSubgraphLabels(this->subgraphNumVertices, - subgraphs_masks + sg_id * globalSamples); - 
layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_subg_ptr()); + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); // update features for subgraph - distContext->constructSubgraphFeatures(this->subgraphNumVertices, - subgraphs_masks + sg_id * globalSamples); - layers[0]->set_feats_ptr(distContext->get_feats_subg_ptr()); // feed input data + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data // Graph* testing = distContext->getSubgraphPointer(sg_id); // for (size_t i = 0; i < testing->size(); i++) { @@ -281,28 +294,31 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - std::cout << header << "Epoch " << std::setw(3) << curEpoch << seperator; + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; + galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); + galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - // gradient update: use gradients stored on each layer to update model for - // next epoch + galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch Net::update_weights(opt); // update parameters // validation / testing set_netphases(net_phase::test); - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << seperator; + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); t_epoch.Stop(); @@ -313,22 +329,22 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << seperator; - std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); } else { - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time - << ")\n"; + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << header << "Average training time per epoch: " << avg_train_time - << " ms. 
Throughput: " << throughput << " epoch/s\n"; + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. inference or predict @@ -384,7 +400,9 @@ class Net { } #endif + galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(begin, end, count, masks); + galois::gPrint(header, "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable @@ -409,11 +427,11 @@ class Net { // read masks of test set void read_test_masks(std::string dataset); - //void copy_test_masks_to_device(); + // void copy_test_masks_to_device(); void construct_layers() { // append conv layers - std::cout << "\nConstructing layers...\n"; + galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -519,11 +537,15 @@ class Net { // set mask for the last layer; globals // TODO this should be distirbuted sample begin->end not global; fix later // seems to be unused in code right now anyways + galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); for (size_t i = 0; i < num_layers; i++) { + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } + + galois::gPrint(header, "fprop: getting loss\n"); // prediction error auto loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 91b57c7041..99ec74fb4a 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -40,6 +40,44 @@ class layer : public deepgalois::node { public: using ContextType = deepgalois::DistContext; +protected: + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; + unsigned level_; // layer id: [0, num_layers-1] + size_t begin_; // sample begin index + size_t end_; // sample end index + size_t count_; // number of samples + size_t num_dims; // number of dimensions + net_phase phase_; // in which phase: train, val or test + std::vector input_dims; // input dimensions + std::vector output_dims; // output dimentions + std::string name_; // name of this layer + bool trainable_; // is this layer trainable + bool use_mask; + vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E + vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x + // 16, layer1: 16 x E + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; + float_t* d_weight_grad; + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; + float_t* loss; // error for each vertex: N x 1 + ContextType* context; + label_t* labels; + float_t* norm_consts; +// TODO +#ifdef __GALOIS_HET_CUDA__ + GraphGPU* graph_gpu; +#else + Graph* graph_cpu; + // Used for synchronization of weight gradients + deepgalois::GluonGradients* gradientGraph; + galois::graphs::GluonSubstrate* syncSub; +#endif + +public: layer(unsigned level, std::vector in_dims, std::vector out_dims) : level_(level), begin_(0), end_(0), num_dims(in_dims.size()), @@ -48,9 +86,10 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - std::cout << "Layer" << level_ << " type: " << layer_type() << " input[" - << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + "input[", input_dims[0], ",", input_dims[1], "] output[", + output_dims[0], ",", output_dims[1], "]\n"); } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } @@ -148,40 +187,6 @@ class layer : public deepgalois::node { // prev()->clear_grads(); next()->clear_grads(); } - -protected: - unsigned level_; // layer id: [0, num_layers-1] - size_t begin_; // sample begin index - size_t end_; // sample end index - size_t count_; // number of samples - size_t num_dims; // number of dimensions - net_phase phase_; // in which phase: train, val or test - std::vector input_dims; // input dimensions - std::vector output_dims; // output dimentions - std::string name_; // name of this layer - bool trainable_; // is this layer trainable - bool use_mask; - vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x - // 16, layer1: 16 x E - vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; - float_t* d_weight_grad; - mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; - float_t* loss; // error for each vertex: N x 1 - ContextType* context; - label_t* labels; - float_t* norm_consts; -// TODO -#ifdef __GALOIS_HET_CUDA__ - GraphGPU* graph_gpu; -#else - Graph* graph_cpu; - // Used for synchronization of weight gradients - deepgalois::GluonGradients* gradientGraph; - galois::graphs::GluonSubstrate* syncSub; -#endif }; //! 
Connects tail to head's edge and sets that edge's target to tail diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e6c6121d80..b9caa7ef5a 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -30,7 +30,8 @@ void DistContext::saveDistGraph(DGraph* a) { } // TODO move to reader class -size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str) { +size_t DistContext::read_labels(bool isSingleClassLabel, + std::string dataset_str) { DGraph* dGraph = DistContext::partitionedGraph; this->usingSingleClass = isSingleClassLabel; unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -49,10 +50,10 @@ size_t DistContext::read_labels(bool isSingleClassLabel, std::string dataset_str if (isSingleClassLabel) { galois::gPrint("[", myID, "] One hot labels...\n"); // single-class (one-hot) label for each vertex: N x 1 - this->h_labels = new label_t[dGraph->size()]; + this->h_labels = new label_t[dGraph->size()]; } else { galois::gPrint("[", myID, "] Multi-class labels...\n"); - this->h_labels = new label_t[dGraph->size() * this->num_classes]; + this->h_labels = new label_t[dGraph->size() * this->num_classes]; // multi-class label for each vertex: N x E } @@ -113,7 +114,7 @@ size_t DistContext::read_features(std::string dataset_str) { ifs >> m >> this->feat_len >> std::ws; ifs.close(); - galois::gPrint("N x D: ", m, " x ", feat_len, "\n"); + galois::gPrint("[", myID, "] N x D: ", m, " x ", feat_len, "\n"); // TODO read in without using 2 in-memory buffers // full read feats to load into h_feats @@ -151,6 +152,8 @@ size_t DistContext::read_features(std::string dataset_str) { size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -159,8 +162,7 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, } } if (!dataset_found) { - std::cout << "Dataset currently not supported\n"; - exit(1); + GALOIS_DIE("Dataset currently not supported"); } size_t i = 0; size_t sample_count = 0; @@ -185,9 +187,9 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, } i++; } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << "(" - << (float)sample_count / (float)n * (float)100 << "\%)\n"; + galois::gPrint("[", myID, "] ", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, "(", + (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } @@ -207,7 +209,6 @@ void DistContext::allocNormFactor() { #else this->normFactors.resize(partitionedGraph->size()); #endif - // TODO clean out? } void DistContext::allocNormFactorSub(int subID) { @@ -216,11 +217,11 @@ void DistContext::allocNormFactorSub(int subID) { #else this->normFactorsSub.resize(partitionedSubgraphs[subID]->size()); #endif - // TODO clean out? 
} void DistContext::constructNormFactor(deepgalois::Context* globalContext) { - galois::gPrint("Norm factor construction\n"); + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Norm factor construction\n"); // using original graph to get ids Graph* wholeGraph = globalContext->getFullGraph(); @@ -233,25 +234,26 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { //); #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, partitionedGraph->size()), - [&] (unsigned i) { - float_t c_i = - std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - - for (auto e = partitionedGraph->edge_begin(i); - e != partitionedGraph->edge_end(i); e++) { - const auto j = partitionedGraph->getEdgeDst(e); - float_t c_j = std::sqrt( - float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); - - if (c_i == 0.0 || c_j == 0.0) { - this->normFactors[*e] = 0.0; - } else { - this->normFactors[*e] = 1.0 / (c_i * c_j); + galois::do_all( + galois::iterate((size_t)0, partitionedGraph->size()), + [&](unsigned i) { + float_t c_i = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + + for (auto e = partitionedGraph->edge_begin(i); + e != partitionedGraph->edge_end(i); e++) { + const auto j = partitionedGraph->getEdgeDst(e); + float_t c_j = std::sqrt( + float_t(wholeGraph->get_degree(partitionedGraph->getGID(j)))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactors[*e] = 0.0; + } else { + this->normFactors[*e] = 1.0 / (c_i * c_j); + } } - } - }, - galois::loopname("NormCountingEdge")); + }, + galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, partitionedGraph->size()), @@ -266,40 +268,42 @@ void DistContext::constructNormFactor(deepgalois::Context* globalContext) { }, galois::loopname("NormCountingNode")); #endif - galois::gPrint("Norm factor construction done\n"); + galois::gPrint("[", myID, "] Norm factor construction done \n"); } void DistContext::constructNormFactorSub(int subgraphID) { - //galois::gPrint("Sub norm factor construction\n"); - // right now norm factor based on subgraph - // TODO fix this for dist execution + // galois::gPrint("Sub norm factor construction\n"); + // right now norm factor based on subgraph + // TODO fix this for dist execution - allocNormFactorSub(subgraphID); + allocNormFactorSub(subgraphID); - Graph& graphToUse = *partitionedSubgraphs[subgraphID]; - graphToUse.degree_counting(); + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + graphToUse.degree_counting(); - // TODO using partitioned subgraph rather than whoel graph; i.e. dist - // setting wrong + // TODO using partitioned subgraph rather than whoel graph; i.e. 
dist + // setting wrong #ifdef USE_MKL - galois::do_all(galois::iterate((size_t)0, graphToUse.size()), - [&] (unsigned i) { - // float_t c_i = - // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); - float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); - - for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); - e++) { - const auto j = graphToUse.getEdgeDst(e); - float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); - - if (c_i == 0.0 || c_j == 0.0) { - this->normFactorsSub[e] = 0.0; - } else { - this->normFactorsSub[e] = 1.0 / (c_i * c_j); + galois::do_all( + galois::iterate((size_t)0, graphToUse.size()), + [&](unsigned i) { + // float_t c_i = + // std::sqrt(float_t(wholeGraph->get_degree(partitionedGraph->getGID(i)))); + float_t c_i = std::sqrt(float_t(graphToUse.get_degree(i))); + + for (index_t e = graphToUse.edge_begin(i); e != graphToUse.edge_end(i); + e++) { + const auto j = graphToUse.getEdgeDst(e); + float_t c_j = std::sqrt(float_t(graphToUse.get_degree(j))); + + if (c_i == 0.0 || c_j == 0.0) { + this->normFactorsSub[e] = 0.0; + } else { + this->normFactorsSub[e] = 1.0 / (c_i * c_j); + } } - } - }, galois::loopname("NormCountingEdge")); + }, + galois::loopname("NormCountingEdge")); #else galois::do_all( galois::iterate((size_t)0, graphToUse.size()), @@ -316,7 +320,7 @@ void DistContext::constructNormFactorSub(int subgraphID) { }, galois::loopname("NormCountingNode")); #endif - //galois::gPrint("Sub norm factor construction done\n"); + // galois::gPrint("Sub norm factor construction done\n"); } //! generate labels for the subgraph, m is subgraph size, mask //! tells which vertices to use @@ -353,8 +357,8 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { std::copy(DistContext::h_feats + i * DistContext::feat_len, - DistContext::h_feats + (i + 1) * DistContext::feat_len, - &DistContext::h_feats_subg[count * DistContext::feat_len]); + DistContext::h_feats + (i + 1) * DistContext::feat_len, + &DistContext::h_feats_subg[count * DistContext::feat_len]); // for (unsigned a = 0; a < DistContext::feat_len; a++) { // if (h_feats_subg[count * DistContext::feat_len + a] != 0) { // galois::gPrint(h_feats_subg[count * DistContext::feat_len + a], diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 3bc7762fd5..da2a7356ea 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -29,32 +29,26 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, 0); std::fill(this->distValMasks, this->distValMasks + this->distNumSamples, 0); + // load the training/val masks if (dataset_str == "reddit") { - // this->globalTrainBegin = 0; - // this->globalTrainCount = 153431; - // this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - // this->globalValBegin = 153431; - // this->globalValCount = 23831; - // this->globalValEnd = this->globalValBegin + this->globalValCount; - // find local ID from global ID, set if it exists - for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) { + for (size_t i = this->globalTrainBegin; i < this->globalTrainEnd; i++) { if (this->dGraph->isLocal(i)) { this->distTrainMasks[this->dGraph->getLID(i)] = 1; } } - for (size_t i = globalValBegin; i < globalValEnd; i++) { + for (size_t i = this->globalValBegin; i < this->globalValEnd; i++) { if (this->dGraph->isLocal(i)) { this->distValMasks[this->dGraph->getLID(i)] = 1; } } } else { globalTrainCount = 
this->distContext->read_masks( - dataset_str, "train", this->distNumSamples, globalTrainBegin, - globalTrainEnd, this->distTrainMasks, this->dGraph); + dataset_str, "train", this->distNumSamples, this->globalTrainBegin, + this->globalTrainEnd, this->distTrainMasks, this->dGraph); globalValCount = this->distContext->read_masks( - dataset_str, "val", this->distNumSamples, globalValBegin, globalValEnd, - this->distValMasks, this->dGraph); + dataset_str, "val", this->distNumSamples, this->globalValBegin, + this->globalValEnd, this->distValMasks, this->dGraph); } // input feature dimension: D @@ -96,8 +90,9 @@ void Net::read_test_masks(std::string dataset) { test_masks[dGraph->getLID(i)] = 1; } } else { - globalTestCount = distContext->read_masks(dataset, std::string("test"), - globalSamples, globalTestBegin, globalTestEnd, test_masks, dGraph); + globalTestCount = distContext->read_masks( + dataset, std::string("test"), globalSamples, globalTestBegin, + globalTestEnd, test_masks, dGraph); } } diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 6e6e00a5d1..54987d4635 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -16,7 +16,9 @@ namespace deepgalois { // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { - std::cout << "Reading labels ... "; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "] Reader: Reading labels...\n"); + Timer t_read; t_read.Start(); std::string filename = path + dataset_str + "-labels.txt"; @@ -26,11 +28,12 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - std::cout << "Using single-class (one-hot) labels\n"; + galois::gPrint("[", myID, + "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - std::cout << "Using multi-class labels\n"; + galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -55,8 +58,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in.close(); t_read.Stop(); // print the number of vertex classes - std::cout << "Done, unique label counts: " << num_classes - << ", time: " << t_read.Millisecs() << " ms\n"; + galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + ", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -147,9 +150,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - std::cout << mask_type + "_mask range: [" << begin << ", " << end - << ") Number of valid samples: " << sample_count << " (" - << (float)sample_count / (float)n * (float)100 << "\%)\n"; + galois::gPrint("Global read", mask_type, "_mask range: [", begin, ", ", end, + ") Number of valid samples: ", sample_count, " (", + (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } @@ -209,7 +212,6 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { std::cout << "LearningGraph: currently edge data not supported.\n"; exit(1); } - printf("num_vertices %lu num_edges %lu\n", nv, ne); g->allocateFrom(nv, ne); auto rowptr = 
g->row_start_host_ptr(); for (unsigned vid = 0; vid < nv; ++vid) { @@ -250,9 +252,9 @@ void Reader::readGraphFromGRFile(LearningGraph* g) { ifs.close(); */ t.Stop(); - double runtime = t.Millisecs(); - std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" - << masterLength / 1000.0 / runtime << " MB/s)\n\n"; + // double runtime = t.Millisecs(); + // std::cout << "read " << masterLength << " bytes in " << runtime << " ms (" + // << masterLength / 1000.0 / runtime << " MB/s)\n\n"; } /* diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 3f67974c67..2780c692be 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -25,6 +25,7 @@ OutTy* parallel_prefix_sum(const std::vector& in) { total += local_sums[block]; } bulk_prefix[num_blocks] = total; + // TODO do not use new here: difficult to track and free later OutTy* prefix = new OutTy[in.size() + 1]; galois::do_all( galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { @@ -109,8 +110,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; - std::cout << std::setprecision(3) << std::fixed << " (f1_micro: " << f1_micro - << ", f1_macro: " << f1_macro << ") "; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; } diff --git a/lonestar/gnn/gcn/gcn.cpp b/lonestar/gnn/gcn/gcn.cpp index c33e7d5574..454179ad5d 100644 --- a/lonestar/gnn/gcn/gcn.cpp +++ b/lonestar/gnn/gcn/gcn.cpp @@ -6,5 +6,6 @@ const char* name = "Graph Convolutional Networks"; const char* desc = "Graph convolutional neural networks on an undirected graph"; const char* url = 0; +// TODO rather than having main being part of include file, have main in this +// just be a function call to some common start function #include "engine.h" - diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index ad63ffdb78..155c65ca68 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -12,8 +12,8 @@ #include "deepgalois/Net.h" static void LonestarGnnPrintVersion(llvm::raw_ostream& out) { - out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() - << " (" << galois::getRevision() << ")\n"; + out << "LoneStarGNN Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; out.flush(); } @@ -32,25 +32,25 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { #endif - LonestarGnnPrintVersion(llvm::outs()); - std::cout << "Copyright (C) " << galois::getCopyrightYear() - << " The University of Texas at Austin\n"; - std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; - std::cout << "application: " << (app ? 
app : "unspecified") << "\n"; - if (desc) - std::cout << desc << "\n"; - if (url) - std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" - << url << "\n"; - std::cout << "\n"; - std::ostringstream cmdout; - for (int i = 0; i < argc; ++i) { - cmdout << argv[i]; - if (i != argc - 1) - cmdout << " "; - } - galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); - galois::runtime::reportParam("(NULL)", "Threads", numThreads); + LonestarGnnPrintVersion(llvm::outs()); + std::cout << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + std::cout << "http://iss.ices.utexas.edu/galois/\n\n"; + std::cout << "application: " << (app ? app : "unspecified") << "\n"; + if (desc) + std::cout << desc << "\n"; + if (url) + std::cout << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" + << url << "\n"; + std::cout << "\n"; + std::ostringstream cmdout; + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); + galois::runtime::reportParam("(NULL)", "Threads", numThreads); #ifdef GALOIS_USE_DIST } #endif @@ -76,7 +76,7 @@ int main(int argc, char** argv) { #endif // initialize network + whole context on CPU - // read network, features, ground truth, initialize metadata + // read network, initialize metadata // default setting for now; can be customized by the user deepgalois::Net network(dataset, numThreads, num_conv_layers, epochs, hidden1, learning_rate, dropout_rate, weight_decay, From eb95b9ef254786850ff87faefbb6be87116fdc5c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:02:24 -0500 Subject: [PATCH 300/660] output layers now dist part aware --- .../src/layers/sigmoid_loss_layer.cpp | 98 ++++++++++++------- .../src/layers/softmax_loss_layer.cpp | 84 +++++++++------- 2 files changed, 112 insertions(+), 70 deletions(-) diff --git a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp index 3dcb312f08..8d72ed4b07 100644 --- a/libdeepgalois/src/layers/sigmoid_loss_layer.cpp +++ b/libdeepgalois/src/layers/sigmoid_loss_layer.cpp @@ -25,22 +25,32 @@ inline label_t sigmoid_loss_layer::get_label(size_t i, size_t j) { void sigmoid_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { - size_t len = input_dims[1]; + size_t featLen = input_dims[1]; galois::do_all( galois::iterate(begin_, end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - size_t idx = len * i; - // output is normalized input for this layer - math::sigmoid(len, &in_data[idx], - &out_data[idx]); // normalize using sigmoid - // one hot encoded vector for the labels - float_t* ground_truth = new float_t[len]; - for (size_t j = 0; j < len; j++) - ground_truth[j] = (float_t)get_label(i, j); - // loss calculation - loss[i] = math::cross_entropy(len, ground_truth, &out_data[idx]); - delete[] ground_truth; + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + // check if local to this host + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + size_t idx = featLen * lid; + + // output is normalized input for this layer + math::sigmoid(featLen, &in_data[idx], + &out_data[idx]); // normalize using sigmoid + + // one hot encoded vector for the labels + // TODO this is a bottleneck; big lock on memory allocator + float_t* ground_truth = new float_t[featLen]; + for (size_t j = 0; j < featLen; j++) + 
ground_truth[j] = (float_t)get_label(lid, j);
+          // loss calculation
+          this->loss[lid] =
+              math::cross_entropy(featLen, ground_truth, &out_data[idx]);
+
+          // TODO this is a bottleneck, lock on memory possibly
+          delete[] ground_truth;
+        }
       }
     },
     galois::chunk_size(), galois::steal(),
@@ -50,23 +60,31 @@ void sigmoid_loss_layer::forward_propagation(const float_t* in_data,
 void sigmoid_loss_layer::back_propagation(const float_t* in_data,
                                           const float_t* out_data, float_t*,
                                           float_t* in_grad) {
-  size_t len = layer::input_dims[1];
+  size_t featLen = layer::input_dims[1];
+
   galois::do_all(
       galois::iterate(layer::begin_, layer::end_),
-      [&](const auto& i) {
-        if (!use_mask || masks_[i] == 1) { // masked
-          size_t idx = len * i;
-          float_t* norm_grad = new float_t[len];
-          float_t* ground_truth = new float_t[len];
-          for (size_t j = 0; j < len; j++)
-            ground_truth[j] = (float_t)get_label(i, j);
-          // use ground truth to determine derivative of cross entropy
-          math::d_cross_entropy(len, ground_truth, &out_data[idx], norm_grad);
-          // derviative sigmoid to gradient used in the next layer
-          math::d_sigmoid(len, &in_data[idx], &out_data[idx], &in_grad[idx],
-                          norm_grad);
-          delete[] norm_grad;
-          delete[] ground_truth;
+      [&](const auto& gid) {
+        if (!use_mask || masks_[gid] == 1) { // masked
+          if (this->context->isLocal(gid)) {
+            unsigned lid = this->context->getLID(gid);
+
+            size_t idx = featLen * lid;
+            // TODO this is bad
+            float_t* norm_grad = new float_t[featLen];
+            float_t* ground_truth = new float_t[featLen];
+            for (size_t j = 0; j < featLen; j++)
+              ground_truth[j] = (float_t)get_label(lid, j);
+            // use ground truth to determine derivative of cross entropy
+            math::d_cross_entropy(featLen, ground_truth, &out_data[idx],
+                                  norm_grad);
+            // derivative sigmoid to gradient used in the next layer
+            math::d_sigmoid(featLen, &in_data[idx], &out_data[idx],
+                            &in_grad[idx], norm_grad);
+            // TODO this is bad
+            delete[] norm_grad;
+            delete[] ground_truth;
+          }
         }
       },
     galois::chunk_size(), galois::steal(),
@@ -74,23 +92,31 @@ void sigmoid_loss_layer::back_propagation(const float_t* in_data,
 }
 acc_t sigmoid_loss_layer::get_prediction_loss() {
-  assert(count_ > 0);
   galois::GAccumulator total_loss;
   galois::GAccumulator valid_sample_count;
   total_loss.reset();
   valid_sample_count.reset();
+
   galois::do_all(
       galois::iterate(layer::begin_, layer::end_),
-      [&](const auto& i) {
-        if (!use_mask || masks_[i]) {
-          total_loss += loss[i];
-          valid_sample_count += 1;
+      [&](const auto& gid) {
+        if (!use_mask || masks_[gid]) {
+          if (this->context->isLocal(gid)) {
+            unsigned lid = this->context->getLID(gid);
+            total_loss += this->loss[lid];
+            valid_sample_count += 1;
+          }
         }
       },
       galois::chunk_size<256>(), galois::steal(),
       galois::loopname("getMaskedLoss"));
-  assert(valid_sample_count.reduce() == count_);
-  return total_loss.reduce() / (acc_t)count_;
+
+  size_t c = valid_sample_count.reduce();
+  if (c > 0) {
+    return total_loss.reduce() / (acc_t)valid_sample_count.reduce();
+  } else {
+    return 0;
+  }
 }
 } // namespace deepgalois
diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp
index 940fbeb798..3581365427 100644
--- a/libdeepgalois/src/layers/softmax_loss_layer.cpp
+++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp
@@ -27,20 +27,26 @@ inline label_t softmax_loss_layer::get_label(size_t i) {
 // 𝑦[i] = 𝑒^𝑥[i] / Σ 𝑒^𝑥[𝑘]
 void softmax_loss_layer::forward_propagation(const float_t* in_data,
                                              float_t* out_data) {
-  size_t len = input_dims[1];
+  // size_t numSamples =
input_dims; + size_t featLen = input_dims[1]; galois::do_all( galois::iterate(begin_, end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - // output is normalized input for this layer - math::softmax(len, &in_data[len * i], - &out_data[len * i]); // normalize using softmax - // one hot encoded vector for the labels - vec_t groundTruth(output_dims[1], 0.0); // ground truth - groundTruth[get_label(i)] = 1.0; // one-hot - // loss calculation - loss[i] = - math::cross_entropy(len, &groundTruth[0], &out_data[len * i]); + [&](const unsigned gid) { + // if no mask used it means all are fair game + if (!use_mask || masks_[gid] == 1) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + // output is normalized input for this layer + math::softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid]); // normalize using softmax + // one hot encoded vector for the labels + vec_t groundTruth(output_dims[1], 0.0); // ground truth + // labels are local + groundTruth[get_label(lid)] = 1.0; // one-hot + // loss calculation + loss[lid] = math::cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid]); + } } }, galois::chunk_size<64>(), galois::steal(), @@ -54,20 +60,24 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t*, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) - size_t len = layer::input_dims[1]; + size_t featLen = layer::input_dims[1]; galois::do_all( galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (!use_mask || masks_[i] == 1) { // masked - vec_t norm_grad(len); - std::vector groundTruth(len, 0.0); - groundTruth[get_label(i)] = 1.0; - // use ground truth to determine derivative of cross entropy - math::d_cross_entropy(len, &groundTruth[0], &out_data[len * i], - &norm_grad[0]); - // derviative softmax to gradient used in the next layer - math::d_softmax(len, &in_data[len * i], &out_data[len * i], - &in_grad[len * i], &norm_grad[0]); + [&](const auto& gid) { + if (!use_mask || masks_[gid] == 1) { // masked + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + vec_t norm_grad(featLen); + std::vector groundTruth(featLen, 0.0); + groundTruth[get_label(lid)] = 1.0; + // use ground truth to determine derivative of cross entropy + math::d_cross_entropy(featLen, &groundTruth[0], + &out_data[featLen * lid], &norm_grad[0]); + // derviative softmax to gradient used in the next layer + math::d_softmax(featLen, &in_data[featLen * lid], + &out_data[featLen * lid], &in_grad[featLen * lid], + &norm_grad[0]); + } } }, galois::chunk_size<64>(), galois::steal(), @@ -77,25 +87,31 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, } acc_t softmax_loss_layer::get_prediction_loss() { - assert(count_ > 0); galois::GAccumulator total_loss; galois::GAccumulator valid_sample_count; total_loss.reset(); valid_sample_count.reset(); + galois::do_all( galois::iterate(layer::begin_, layer::end_), - [&](const auto& i) { - if (!use_mask || masks_[i]) { - total_loss += loss[i]; - valid_sample_count += 1; + [&](const auto& gid) { + if (!use_mask || masks_[gid]) { + if (this->context->isLocal(gid)) { + unsigned lid = this->context->getLID(gid); + total_loss += this->loss[lid]; + valid_sample_count += 1; + } } }, - galois::chunk_size<64>(), galois::steal(), + galois::chunk_size<256>(), galois::steal(), galois::loopname("getMaskedLoss")); - // std::cout << "begin = " << begin_ << " end = " 
<< end_ << " count = " << - // count_ << " valid_count = " << valid_sample_count.reduce() << "\n"; - assert(valid_sample_count.reduce() == count_); - return total_loss.reduce() / (acc_t)count_; + + size_t c = valid_sample_count.reduce(); + if (c > 0) { + return total_loss.reduce() / (acc_t)valid_sample_count.reduce(); + } else { + return 0; + } } } // namespace deepgalois From 7bab2e7558549c226fe8e7fb2306c68bf37b75dd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:03:38 -0500 Subject: [PATCH 301/660] distcontext: added interface to query node presence in dist graph --- .../include/deepgalois/DistContext.h | 19 +++++++------ libdeepgalois/src/DistContext.cpp | 27 ++++++++++++++++++- 2 files changed, 37 insertions(+), 9 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 08e101e898..49222eb3ab 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -110,19 +110,22 @@ class DistContext { //! return label for some node //! NOTE: this is LID, not GID - label_t get_label(size_t i) { return h_labels[i]; } + label_t get_label(size_t lid) { return h_labels[lid]; } //! returns pointer to the features of each local node float_t* get_in_ptr(); //! allocate memory for subgraphs (don't actually build them) - void allocateSubgraphs(int num_subgraphs, unsigned max_size) { - partitionedSubgraphs.resize(num_subgraphs); - for (int i = 0; i < num_subgraphs; i++) { - partitionedSubgraphs[i] = new Graph(); - partitionedSubgraphs[i]->set_max_size(max_size); - } - } + void allocateSubgraphs(int num_subgraphs, unsigned max_size); + + //! return if a vertex is owned by the partitioned graph this context contains + bool isOwned(unsigned gid); + //! return if part graph has provided vertex for given gid locally + bool isLocal(unsigned gid); + //! get GID of an lid for a vertex + unsigned getGID(unsigned lid); + //! get local id of a vertex given a global id for that vertex + unsigned getLID(unsigned gid); }; } // namespace deepgalois diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index b9caa7ef5a..320cc75b7f 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -148,7 +148,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } -// TODO move to reader class +// TODO move to reader class/reuse reader class somehow size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { @@ -376,4 +376,29 @@ galois::graphs::GluonSubstrate* DistContext::getSyncSubstrate() { return DistContext::syncSubstrate; } +//! 
allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + +bool DistContext::isOwned(unsigned gid) { + return this->partitionedGraph->isOwned(gid); +} + +bool DistContext::isLocal(unsigned gid) { + return this->partitionedGraph->isLocal(gid); +} + +unsigned DistContext::getGID(unsigned lid) { + return this->partitionedGraph->getGID(lid); +} + +unsigned DistContext::getLID(unsigned gid) { + return this->partitionedGraph->getLID(gid); +} + } // namespace deepgalois From 5a251d4906dcf444f0b6dff098643d7f43fbac56 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 17:11:06 -0500 Subject: [PATCH 302/660] test masks always read global: fixed var names to be much more readable --- libdeepgalois/include/deepgalois/Net.h | 108 +++++++++++++------------ libdeepgalois/src/Net.cpp | 86 ++++++++++---------- libdeepgalois/src/reader.cpp | 2 +- libdeepgalois/src/utils.cpp | 9 ++- 4 files changed, 111 insertions(+), 94 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 3971da74d2..3bc9f8684c 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -63,17 +63,19 @@ class Net { mask_t* globalTrainMasks; // masks for training mask_t* globalValMasks; // masks for validation + mask_t* globalTestMasks; // masks for test + // TODO it's looking like we may not even need these dist versions mask_t* distTrainMasks; mask_t* distValMasks; - mask_t* test_masks; // masks for test + mask_t* distTestMasks; // masks for test, dst mask_t* d_train_masks; // masks for training on device mask_t* d_val_masks; // masks for validation on device mask_t* d_test_masks; // masks for test on device mask_t* subgraphs_masks; // masks for subgraphs; size of local graph - mask_t* - d_subgraphs_masks; // masks for subgraphs on device; size of local graph + // masks for subgraphs on device; size of local graph + mask_t* d_subgraphs_masks; std::vector feature_dims; // feature dimnesions for each layer std::vector layers; // all the layers in the neural network @@ -107,12 +109,11 @@ class Net { assert(n_conv > 0); - // TODO use galois print: need avoid including Galois.h for GPU - std::cout << header << "Configuration: num_threads " << num_threads - << ", num_conv_layers " << num_conv_layers << ", num_epochs " - << num_epochs << ", hidden1 " << hidden1 << ", learning_rate " - << learning_rate << ", dropout_rate " << dropout_rate - << ", weight_decay " << weight_decay << "\n"; + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -133,6 +134,7 @@ class Net { // subgraph in the sampler globalTrainMasks = new mask_t[globalSamples]; globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); std::fill(globalValMasks, globalValMasks + globalSamples, 0); @@ -183,7 +185,7 @@ class Net { // globalValCount(0), globalTestBegin(0), globalTestEnd(0), // globalTestCount(0), 
val_interval(1), num_subgraphs(1), // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), - // test_masks(NULL), context(NULL) {} + // globalTestMasks(NULL), context(NULL) {} void allocateSubgraphsMasks(int num_subgraphs); @@ -351,32 +353,32 @@ class Net { double evaluate(std::string type, acc_t& loss, acc_t& acc) { Timer t_eval; t_eval.Start(); - size_t begin = 0, end = 0, count = 0; - mask_t* masks = NULL; + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; // TODO global here good for dist case? if (type == "train") { - begin = globalTrainBegin; - end = globalTrainEnd; - count = globalTrainCount; - masks = globalTrainMasks; + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; if (subgraph_sample_size) { - // update masks for subgraph - masks = NULL; - begin = 0; - end = this->subgraphNumVertices; - count = this->subgraphNumVertices; + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; } } else if (type == "val") { - begin = globalValBegin; - end = globalValEnd; - count = globalValCount; - masks = globalValMasks; + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; } else { - begin = globalTestBegin; - end = globalTestEnd; - count = globalTestCount; - masks = test_masks; + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; } // switch to the original graph if not training @@ -392,41 +394,46 @@ class Net { } #ifdef __GALOIS_HET_CUDA__ if (type == "train") { - masks = d_train_masks; + gMasks = d_train_masks; } else if (type == "val") { - masks = d_val_masks; + gMasks = d_val_masks; } else { - masks = d_test_masks; + gMasks = d_test_masks; } #endif galois::gPrint(header, "Doing actual forward propagation\n"); - loss = fprop(begin, end, count, masks); - galois::gPrint(header, "Forward propagation donne, going to check accuracy\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + galois::gPrint(header, + "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable - label_t* labels; + label_t* localLabels; if (type == "train" && subgraph_sample_size) { - labels = distContext->get_labels_subg_ptr(); + localLabels = distContext->get_labels_subg_ptr(); } else { // note this grabs global labels; everything passed in should be global - labels = distContext->get_labels_ptr(); + localLabels = distContext->get_labels_ptr(); } if (is_single_class) { - acc = masked_accuracy(begin, end, count, masks, predictions, labels); + acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); } else { - acc = masked_multi_class_accuracy(begin, end, count, masks, predictions, - labels); + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, + predictions, localLabels); } t_eval.Stop(); return t_eval.Millisecs(); } - // read masks of test set + //! read masks of test set for GLOBAL set void read_test_masks(std::string dataset); + //! read test masks only for local nodes; assumes dist context is initialized + void readDistributedTestMasks(std::string dataset); + // void copy_test_masks_to_device(); void construct_layers() { @@ -533,12 +540,12 @@ class Net { //! forward propagation: [begin, end) is the range of samples used. //! 
calls "forward" on each layer and returns the loss of the final layer - acc_t fprop(size_t begin, size_t end, size_t count, mask_t* masks) { + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { // set mask for the last layer; globals - // TODO this should be distirbuted sample begin->end not global; fix later + // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways galois::gPrint(header, "fprop: set sample mask\n"); - layers[num_layers - 1]->set_sample_mask(begin, end, count, masks); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { galois::gPrint(header, "fprop: layer ", i, " forward call\n"); @@ -547,7 +554,7 @@ class Net { galois::gPrint(header, "fprop: getting loss\n"); // prediction error - auto loss = layers[num_layers - 1]->get_prediction_loss(); + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting loss += weight_decay * layers[0]->get_weight_decay_loss(); return loss; @@ -576,11 +583,12 @@ class Net { } // comparing outputs with the ground truth (labels) - acc_t masked_accuracy(size_t begin, size_t end, size_t count, mask_t* masks, - float_t* preds, label_t* ground_truth); - acc_t masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth); + acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); + acc_t masked_multi_class_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* localGroundTruth); }; } // namespace deepgalois diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index da2a7356ea..55de8ad3ae 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -80,95 +80,99 @@ void Net::regularize() { } void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; + if (dataset == "reddit") { + globalTestBegin = 177262; + globalTestCount = 55703; + globalTestEnd = globalTestBegin + globalTestCount; + for (size_t i = globalTestBegin; i < globalTestEnd; i++) { + globalTestMasks[i] = 1; + } + } else { + globalTestCount = graphTopologyContext->read_masks( + "test", globalSamples, globalTestBegin, globalTestEnd, globalTestMasks); + } +} + +void Net::readDistributedTestMasks(std::string dataset) { + distTestMasks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) { if (dGraph->isLocal(i)) - test_masks[dGraph->getLID(i)] = 1; + distTestMasks[dGraph->getLID(i)] = 1; } } else { globalTestCount = distContext->read_masks( dataset, std::string("test"), globalSamples, globalTestBegin, - globalTestEnd, test_masks, dGraph); + globalTestEnd, distTestMasks, dGraph); } } /** - * - * @param begin GLOBAL begin - * @param end GLOBAL end - * @param masks: GLOBAL masks - * @param count GLOBAL training count + * @param gBegin GLOBAL begin + * @param gEnd GLOBAL end + * @param gMasks: GLOBAL masks + * @param gCount GLOBAL training count */ -acc_t Net::masked_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth) { +acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, + mask_t* gMasks, float_t* preds, + label_t* 
localGroundTruth) { galois::DGAccumulator accuracy_all; galois::DGAccumulator sampleCount; accuracy_all.reset(); sampleCount.reset(); - // TODO figure this out for distributed case galois::do_all( - galois::iterate(begin, end), + galois::iterate(gBegin, gEnd), [&](const auto& i) { -#ifndef GALOIS_USE_DIST - if (masks == NULL || - masks[i] == 1) { // use sampled graph when masks is NULL - // get prediction - auto pred = math::argmax(num_classes, preds + i * num_classes); - // check prediction - if ((label_t)pred == ground_truth[i]) - accuracy_all += 1.0; - } -#else - // TODO dist subraph - // only look at owned nodes (i.e. masters); the prediction for these // should only by handled on the owner if (this->dGraph->isOwned(i)) { sampleCount += 1; uint32_t localID = this->dGraph->getLID(i); - if (masks == NULL) { - // GALOIS_DIE("subgraphs not implemented for dist yet"); - // subgraph here: TODO + if (gMasks == NULL) { auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == ground_truth[localID]) + if ((label_t)pred == localGroundTruth[localID]) accuracy_all += 1.0; } else { - if (masks[localID] == 1) { + // TODO masks needs to be local id + if (gMasks[localID] == 1) { // get prediction auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == ground_truth[localID]) + if ((label_t)pred == localGroundTruth[localID]) accuracy_all += 1.0; } } } -#endif }, galois::loopname("getMaskedLoss")); - count = sampleCount.reduce(); - galois::gDebug("sample count is ", count); + gCount = sampleCount.reduce(); + galois::gDebug("sample count is ", gCount); // all hosts should get same accuracy - return accuracy_all.reduce() / (acc_t)count; + return accuracy_all.reduce() / (acc_t)gCount; } -acc_t Net::masked_multi_class_accuracy(size_t begin, size_t end, size_t count, - mask_t* masks, float_t* preds, - label_t* ground_truth) { - // TODO dist version - return deepgalois::masked_f1_score(begin, end, count, masks, num_classes, - ground_truth, preds); +acc_t Net::masked_multi_class_accuracy(size_t gBegin, size_t gEnd, + size_t gCount, mask_t* gMasks, + float_t* preds, + label_t* localGroundTruth) { + // TODO fix this + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_DIE( + "Multi-class accuracy not yet implemented for distributed setting\n"); + } + + return deepgalois::masked_f1_score(gBegin, gEnd, gCount, gMasks, num_classes, + localGroundTruth, preds); } } // namespace deepgalois diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 54987d4635..d131913587 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -150,7 +150,7 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - galois::gPrint("Global read", mask_type, "_mask range: [", begin, ", ", end, + galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, ") Number of valid samples: ", sample_count, " (", (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 2780c692be..9fb90c46c1 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -60,12 +60,14 @@ parallel_prefix_sum(const std::vector& in); acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, size_t num_classes, label_t* ground_truth, float_t* pred) { + // TODO dist version; make aware of distributed execution double precision_cls(0.), 
recall_cls(0.), f1_accum(0.); int tp_accum(0), fn_accum(0), fp_accum(0), tn_accum(0); + for (size_t col = 0; col < num_classes; col++) { int tp_cls(0), fp_cls(0), fn_cls(0), tn_cls(0); + for (size_t row = begin; row < end; row++) { - // galois::do_all(galois::iterate(begin, end), [&](const auto& row) { if (masks == NULL || masks[row] == 1) { auto idx = row * num_classes + col; if (ground_truth[idx] == 1 && pred[idx] > 0.5) { @@ -83,7 +85,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, } } } - //}, galois::loopname("MaskedF1Score")); + tp_accum += tp_cls; fn_accum += fn_cls; fp_accum += fp_cls; @@ -97,6 +99,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, ? 2. * (recall_cls * precision_cls) / (recall_cls + precision_cls) : 0.; } + double f1_macro = f1_accum / (double)num_classes; // double accuracy_mic = // (double)(tp_accum+tn_accum)/(double)(tp_accum+tn_accum+fp_accum+fn_accum); @@ -110,9 +113,11 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + return f1_micro; } From f68125b25dd5077925c26531d0e3c261aa2f1820 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 15 May 2020 18:16:32 -0500 Subject: [PATCH 303/660] reactivated sync; still buggy apparenltly --- libdeepgalois/include/deepgalois/Net.h | 2 +- .../deepgalois/layers/GradientSyncStructs.h | 1 + libdeepgalois/src/Net.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 20 +++++++++---------- 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 3bc9f8684c..522365b662 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -413,7 +413,7 @@ class Net { if (type == "train" && subgraph_sample_size) { localLabels = distContext->get_labels_subg_ptr(); } else { - // note this grabs global labels; everything passed in should be global + // note this grabs local labels localLabels = distContext->get_labels_ptr(); } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index dd2f3de6a9..0f73f2cbca 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -15,6 +15,7 @@ struct GradientSync { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} weight += y; + weight /= 2; return true; } diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 55de8ad3ae..605cd209e1 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -155,7 +155,7 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, galois::loopname("getMaskedLoss")); gCount = sampleCount.reduce(); - galois::gDebug("sample count is ", gCount); + galois::gDebug("Total sample count is ", gCount); // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)gCount; diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index d7c29d1cfa..941a796a81 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -118,10 
+118,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO sync of out_data required here // TODO how to do this for the sampled case? - // deepgalois::_syncVectorSize = z; - // deepgalois::_dataToSync = out_data; - // layer::context->getSyncSubstrate()->sync( - // "AggSync"); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_data; + layer::context->getSyncSubstrate()->sync( + "AggSync"); // run relu activation on output if specified if (act_) @@ -164,16 +164,16 @@ void graph_conv_layer::back_propagation(const float_t* in_data, } // sync agg - // deepgalois::_syncVectorSize = z; - // deepgalois::_dataToSync = out_temp; - // layer::context->getSyncSubstrate()->sync( - // "AggSyncBack"); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = out_temp; + layer::context->getSyncSubstrate()->sync( + "AggSyncBack"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - // layer::syncSub->sync("GradientSync"); - // galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + layer::syncSub->sync("GradientSync"); + galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); } acc_t graph_conv_layer::get_weight_decay_loss() { From e41fa34451bf8aa429f339ef768060c2db745910 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 06:58:20 -0500 Subject: [PATCH 304/660] update sampler --- libdeepgalois/include/deepgalois/Sampler.h | 7 ++++++- libdeepgalois/src/Sampler.cpp | 6 ------ libdeepgalois/src/Sampler.cu | 14 +++++++++----- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index b8f19dcca7..1b5754f394 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -43,13 +43,18 @@ class Sampler { void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet - void createMasks(size_t n, VertexSet vertices, mask_t* masks); + //void createMasks(size_t n, VertexSet vertices, mask_t* masks); //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); + void createMasks(size_t n, VertexSet vertices, mask_t* masks) { + std::fill(masks, masks + n, 0); + for (auto v : vertices) masks[v] = 1; + } + //! 
helper function to get degree of some vertex given some graph inline unsigned getDegree(GraphCPU* g, index_t v) { return g->edge_end_host(v) - g->edge_begin_host(v); diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 1feb2ecb69..36b697ecb6 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -215,12 +215,6 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, */ } -void Sampler::createMasks(size_t n, VertexSet vertices, mask_t* masks) { - // galois::gPrint("Updating masks, size = ", vertices.size(), "\n"); - std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; -} - // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index c5db16c5f1..97835ea9cc 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -124,13 +124,17 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s cudaMalloc((void**)&d_vertex_list, nv * sizeof(index_t)); CUDA_CHECK(cudaMemcpy(d_vertex_list, &vertex_list[0], nv * sizeof(index_t), cudaMemcpyHostToDevice)); - clear_masks<<>>(n, masks); // set all 0 - CudaTest("solving clear_masks kernel failed"); + createMasks(n, vertex_set, masks); + mask_t* d_masks; + cudaMalloc((void**)&d_masks, n * sizeof(mask_t)); + CUDA_CHECK(cudaMemcpy(d_masks, masks, n * sizeof(mask_t), cudaMemcpyHostToDevice)); + //clear_masks<<>>(n, d_masks); // set all 0 + //CudaTest("solving clear_masks kernel failed"); // createMasks: set masks for vertices in the vertex_set - set_masks<<>>(n, d_vertex_list, masks); - CudaTest("solving set_masks kernel failed"); + //set_masks<<>>(n, d_vertex_list, d_masks); + //CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed - getMaskedGraph(n, masks, partGraph, &masked_sg); // remove edges whose destination is not masked + getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked std::cout << "maskedGraph generated\n"; // re-index the subgraph From 6a72b42153f63aabdad44fc59de83cd8444a76ad Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:00:40 -0500 Subject: [PATCH 305/660] fix some gpu compile errors --- .../include/deepgalois/layers/layer.h | 6 +++--- libdeepgalois/src/reader.cpp | 20 ++++++++++--------- libdeepgalois/src/utils.cpp | 9 ++++++--- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 99ec74fb4a..ee2d66aa95 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,9 +41,9 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - const std::string header = - "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - "] "; + //const std::string header = + // "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + // "] "; unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index d131913587..6c408f6449 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -16,8 +16,11 @@ 
namespace deepgalois { // be computed as y.argmax(axis=1) from one-hot encoded vector (y) of labels if // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reader: Reading labels...\n"); +#endif Timer t_read; t_read.Start(); @@ -28,12 +31,11 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - galois::gPrint("[", myID, - "] Reader: Using single-class (one-hot) labels\n"); + std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); + std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -58,8 +60,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in.close(); t_read.Stop(); // print the number of vertex classes - galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, - ", time: ", t_read.Millisecs(), " ms\n"); + std::cout << "[" << myID << "] Done, unique label counts: " << num_classes + << ", time: " << t_read.Millisecs() << " ms\n"; // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -150,9 +152,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, - ") Number of valid samples: ", sample_count, " (", - (float)sample_count / (float)n * (float)100, "\%)\n"); + std::cout << "Global read " << mask_type << "_mask range: [" << begin + << ", " << end << ") Number of valid samples: " << sample_count + << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; in.close(); return sample_count; } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 9fb90c46c1..2f4e6ba549 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -114,9 +114,12 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, ? 2. 
* (recall_mic * precision_mic) / (recall_mic + precision_mic) : 0.; - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, - " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed + << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; return f1_micro; } From 08cb177d133a7266f91392e8ee3ea3b94008a8ed Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:10:42 -0500 Subject: [PATCH 306/660] tiny --- libdeepgalois/include/deepgalois/layers/layer.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index ee2d66aa95..7715836404 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -86,10 +86,13 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; - galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), - "input[", input_dims[0], ",", input_dims[1], "] output[", - output_dims[0], ",", output_dims[1], "]\n"); + unsigned myID = 0; +#ifndef __GALOIS_HET_CUDA__ + galois::runtime::getSystemNetworkInterface().ID; +#endif + std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() + << "input[" << input_dims[0], "," << input_dims[1] << "] output[" + << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } From 8bd3e33fa5e0b60048faa9d7315b67135729069a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:18:47 -0500 Subject: [PATCH 307/660] fix error --- libdeepgalois/include/deepgalois/layers/layer.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7715836404..b7779b7e5b 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -88,10 +88,10 @@ class layer : public deepgalois::node { void print_layer_info() { //! 
debug print function unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - galois::runtime::getSystemNetworkInterface().ID; + myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() - << "input[" << input_dims[0], "," << input_dims[1] << "] output[" + << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; } // get methods From 5e7f0b56bb744c78b425c8a4823ce4faf2fafa08 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 21:58:39 -0500 Subject: [PATCH 308/660] fix test_masks name --- libdeepgalois/src/Net.cu | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 7b76f217dd..8e8ce83267 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -183,19 +183,18 @@ void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleCla } void Net::read_test_masks(std::string dataset) { - test_masks = new mask_t[distNumSamples]; if (dataset == "reddit") { globalTestBegin = 177262; globalTestCount = 55703; globalTestEnd = globalTestBegin + globalTestCount; for (size_t i = globalTestBegin; i < globalTestEnd; i++) - test_masks[i] = 1; + globalTestMasks[i] = 1; } else { globalTestCount = distContext->read_masks(dataset, std::string("test"), - globalSamples, globalTestBegin, globalTestEnd, test_masks, NULL); + globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); } //copy_test_masks_to_device(); - copy_masks_device(globalSamples, test_masks, d_test_masks); + copy_masks_device(globalSamples, globalTestMasks, d_test_masks); } //void Net::copy_test_masks_to_device() {} From 925d1a5a54cb1c632cd085d9514c364d7ab4c9ba Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 16 May 2020 21:14:44 -0500 Subject: [PATCH 309/660] Fixed accuracy checking in the distributed setting --- libdeepgalois/src/Net.cpp | 17 ++++++++--------- lonestar/gnn/include/engine.h | 1 - 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/src/Net.cpp b/libdeepgalois/src/Net.cpp index 605cd209e1..d07b19f912 100644 --- a/libdeepgalois/src/Net.cpp +++ b/libdeepgalois/src/Net.cpp @@ -126,28 +126,28 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, galois::do_all( galois::iterate(gBegin, gEnd), - [&](const auto& i) { + [&](const auto& gid) { // only look at owned nodes (i.e. 
masters); the prediction for these // should only by handled on the owner - if (this->dGraph->isOwned(i)) { + if (this->dGraph->isOwned(gid)) { sampleCount += 1; - - uint32_t localID = this->dGraph->getLID(i); + uint32_t localID = this->dGraph->getLID(gid); if (gMasks == NULL) { auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == localGroundTruth[localID]) + if ((label_t)pred == localGroundTruth[localID]) { accuracy_all += 1.0; + } } else { - // TODO masks needs to be local id - if (gMasks[localID] == 1) { + if (gMasks[gid] == 1) { // get prediction auto pred = math::argmax(num_classes, &preds[localID * num_classes]); // check prediction - if ((label_t)pred == localGroundTruth[localID]) + if ((label_t)pred == localGroundTruth[localID]) { accuracy_all += 1.0; + } } } } @@ -156,7 +156,6 @@ acc_t Net::masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, gCount = sampleCount.reduce(); galois::gDebug("Total sample count is ", gCount); - // all hosts should get same accuracy return accuracy_all.reduce() / (acc_t)gCount; } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 155c65ca68..36a04b2f70 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -103,7 +103,6 @@ int main(int argc, char** argv) { if (do_test) { // test using test samples - galois::gPrint("\n"); network.read_test_masks(dataset); galois::StatTimer Ttest("Test"); Ttest.start(); From bc1f798bef0f97c63c1527165e01629512ab5761 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 16 May 2020 22:11:21 -0500 Subject: [PATCH 310/660] RIP gPrint In all seriosness, GPU not compiling if we include Galois in Net and such is a serious issue for distributed execution; will need to be fixed later --- libdeepgalois/include/deepgalois/Net.h | 68 +++++++++++-------- .../deepgalois/layers/GradientSyncStructs.h | 2 + .../layers/GraphConvSyncStructures.h | 2 + .../include/deepgalois/layers/layer.h | 16 +++-- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/include/deepgalois/utils.h | 5 +- libdeepgalois/src/reader.cpp | 9 +++ libdeepgalois/src/utils.cpp | 2 + lonestar/gnn/include/engine.h | 7 +- 9 files changed, 66 insertions(+), 47 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 522365b662..d3558a99e3 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -109,11 +109,11 @@ class Net { assert(n_conv > 0); - galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + //galois::gPrint(header, "Configuration: num_threads ", num_threads, + // ", num_conv_layers ", num_conv_layers, ", num_epochs ", + // num_epochs, ", hidden1 ", hidden1, ", learning_rate ", + // learning_rate, ", dropout_rate ", dropout_rate, + // ", weight_decay ", weight_decay, "\n"); this->num_layers = num_conv_layers + 1; // additional layers to add @@ -201,7 +201,10 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { +// TOOD this needs to be enabled +#ifndef __GALOIS_HET_CUDA__ distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); +#endif allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; @@ -212,7 +215,7 @@ class Net { 
distContext->getGraphPointer()); } - galois::gPrint(header, "Start training...\n"); + //galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -296,22 +299,24 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); + //galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - galois::gPrint(header, "Calling into eval for forward propagation\n"); + //galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop - double fw_time = evaluate("train", train_loss, train_acc); + //double fw_time = evaluate("train", train_loss, train_acc); + evaluate("train", train_loss, train_acc); + - galois::gPrint(header, "Calling into backward propagation\n"); + //galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - galois::gPrint(header, "Weight update call\n"); + //galois::gPrint(header, "Weight update call\n"); // gradient update: use gradients stored on each layer to update model // for next epoch Net::update_weights(opt); // update parameters @@ -319,8 +324,8 @@ class Net { // validation / testing set_netphases(net_phase::test); - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); + //galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + // train_loss, " train_acc ", train_acc, "\n"); t_epoch.Stop(); @@ -331,22 +336,25 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << + val_loss << " val_acc " << val_acc << " time " << val_time << "\n"; + //galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + // val_loss, " val_acc ", val_acc, "\n"); + //galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + // epoch_time + val_time, " ms (train_time ", epoch_time, + // " val_time ", val_time, ")\n"); } else { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, - ")\n"); + //galois::gPrint(header, "train_time ", std::fixed, epoch_time, + // " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + // ")\n"); } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + std::cout << "ave training time " << avg_train_time << " through " << throughput << "\n"; + //galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + // " ms. Throughput: ", throughput, " epoch/s\n"); } // evaluate, i.e. 
inference or predict @@ -402,10 +410,10 @@ class Net { } #endif - galois::gPrint(header, "Doing actual forward propagation\n"); + //galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(gBegin, gEnd, gCount, gMasks); - galois::gPrint(header, - "Forward propagation donne, going to check accuracy\n"); + //galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); // labels will be subgraph labels if applicable @@ -438,7 +446,7 @@ class Net { void construct_layers() { // append conv layers - galois::gPrint(header, "Constructing layers...\n"); + //galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -544,15 +552,15 @@ class Net { // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways - galois::gPrint(header, "fprop: set sample mask\n"); + //galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } - galois::gPrint(header, "fprop: getting loss\n"); + //galois::gPrint(header, "fprop: getting loss\n"); // prediction error acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 0f73f2cbca..c962f20004 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -1,3 +1,4 @@ +#ifndef __GALOIS_HET_CUDA__ #ifndef __GRAD_SYNC_STRUCT__ #define __GRAD_SYNC_STRUCT__ @@ -44,3 +45,4 @@ struct GradientSync { // TODO bitset; might have to do it manually // GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); #endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index cb5a33e783..7c3c038d15 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,3 +1,4 @@ +#ifndef __GALOIS_HET_CUDA__ #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ @@ -62,3 +63,4 @@ struct GraphConvSync { }; #endif +#endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b7779b7e5b..47ddb20dc3 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,9 +41,11 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - //const std::string header = - // "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - // "] "; + #ifndef __GALOIS_HET_CUDA__ + const std::string header = + "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + "] "; + #endif unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index @@ -86,13 +88,15 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 
0; virtual void malloc_and_init() {} void print_layer_info() { //! debug print function - unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "[" << myID << "] Layer" << level_ << " type: " << layer_type() + std::cout << "Layer " << level_ << " type: " << layer_type() << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; + //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + // "input[", input_dims[0], ",", input_dims[1], "] output[", + // output_dims[0], ",", output_dims[1], "]\n"); } // get methods virtual acc_t get_prediction_loss() { return acc_t(0); } diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 55890d79ae..5e034ec210 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,6 +1,6 @@ #pragma once #include "deepgalois/lgraph.h" - +//#include "galois/DistGalois.h" namespace deepgalois { class Reader { diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 7093897af2..91ccc94b83 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,11 +7,8 @@ #include #include #include -#ifdef GALOIS_USE_DIST #include "deepgalois/GraphTypes.h" -#else -#include "deepgalois/types.h" -#endif +//#include "galois/DistGalois.h" namespace deepgalois { diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 6c408f6449..e4c110dd1e 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -32,9 +32,12 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { in >> m >> num_classes >> std::ws; if (is_single_class) { std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; + //galois::gPrint("[", myID, + // "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { + //galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * @@ -62,6 +65,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { // print the number of vertex classes std::cout << "[" << myID << "] Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; + //galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -121,6 +126,7 @@ size_t Reader::read_features(float_t*& feats, std::string filetype) { //! 
set to create mask from size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks) { + std::cout << "n:" << n << "\n"; bool dataset_found = false; for (int i = 0; i < NUM_DATASETS; i++) { if (dataset_str == dataset_names[i]) { @@ -155,6 +161,9 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " << end << ") Number of valid samples: " << sample_count << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; + //galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, + // ") Number of valid samples: ", sample_count, " (", + // (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); return sample_count; } diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 2f4e6ba549..db738dd2f3 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -120,6 +120,8 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; + //galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 36a04b2f70..65a3aa9d37 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -61,20 +61,15 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { -#ifdef GALOIS_USE_DIST galois::DistMemSys G; -#else - galois::SharedMemSys G; -#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first std::vector dummyVec; deepgalois::DGraph* dGraph = NULL; -#ifdef GALOIS_USE_DIST +#ifndef __GALOIS_HET_CUDA__ dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif - // initialize network + whole context on CPU // read network, initialize metadata // default setting for now; can be customized by the user From ad7e98fe65d68e4a48f7d65298036fb6b78fde59 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 22:58:26 -0500 Subject: [PATCH 311/660] fix printing --- libdeepgalois/include/deepgalois/Net.h | 60 +++++++++++++------ .../include/deepgalois/layers/layer.h | 5 +- 2 files changed, 44 insertions(+), 21 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index d3558a99e3..b48315f00f 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -299,15 +299,19 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps - //galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; //galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop - //double fw_time = evaluate("train", train_loss, train_acc); - evaluate("train", train_loss, train_acc); + double fw_time = evaluate("train", train_loss, train_acc); + //evaluate("train", train_loss, 
train_acc); //galois::gPrint(header, "Calling into backward propagation\n"); @@ -324,9 +328,13 @@ class Net { // validation / testing set_netphases(net_phase::test); - //galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - // train_loss, " train_acc ", train_acc, "\n"); - +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); +#endif t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); @@ -336,25 +344,39 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << - val_loss << " val_acc " << val_acc << " time " << val_time << "\n"; - //galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - // val_loss, " val_acc ", val_acc, "\n"); - //galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - // epoch_time + val_time, " ms (train_time ", epoch_time, - // " val_time ", val_time, ")\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); +#endif } else { - //galois::gPrint(header, "train_time ", std::fixed, epoch_time, - // " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, - // ")\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; +#else + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); +#endif } } // epoch loop double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; - std::cout << "ave training time " << avg_train_time << " through " << throughput << "\n"; - //galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - // " ms. Throughput: ", throughput, " epoch/s\n"); +#ifdef __GALOIS_HET_CUDA__ + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif } // evaluate, i.e. inference or predict diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 47ddb20dc3..b21adefea1 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -88,10 +88,11 @@ class layer : public deepgalois::node { virtual std::string layer_type() const = 0; virtual void malloc_and_init() {} void print_layer_info() { //! 
debug print function + unsigned myID = 0; #ifndef __GALOIS_HET_CUDA__ - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; + myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "Layer " << level_ << " type: " << layer_type() + std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" << output_dims[0] << "," << output_dims[1] << "]\n"; //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), From b26df3b6bd385bb052ddd183c9d54e0b87d23862 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 16 May 2020 23:18:38 -0500 Subject: [PATCH 312/660] add for single-gpu compilation --- lonestar/gnn/include/engine.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 65a3aa9d37..7de3350399 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -61,7 +61,11 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { +#ifdef __GALOIS_HET_CUDA__ + galois::SharedMemSys G; +#else galois::DistMemSys G; +#endif LonestarGnnStart(argc, argv, name, desc, url); // Get a partitioned graph first From 12c44e42ba60a1216054d8789a7ff275e14a3ca5 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 17 May 2020 09:13:35 -0500 Subject: [PATCH 313/660] gpu sampling works. Use this command to test (test acc 93.4%): ./gcn reddit -k=200 -sl=1 -ss=9000 -vi=20 -h=128 -dr=0.1 --- .../include/deepgalois/DistContext.h | 14 +++--- libdeepgalois/include/deepgalois/Net.h | 11 ++--- libdeepgalois/src/DistContext.cpp | 5 ++- libdeepgalois/src/DistContext.cu | 44 +++++++++++++++++-- libdeepgalois/src/Net.cu | 3 +- libdeepgalois/src/Sampler.cu | 16 +++---- libgpu/include/graph_gpu.h | 4 +- 7 files changed, 70 insertions(+), 27 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 49222eb3ab..332eddb3ba 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -21,6 +21,10 @@ class DistContext { size_t num_classes; // number of classes: E size_t feat_len; // input feature length: D Graph* lGraph; // learning graph version + DGraph* partitionedGraph; // the input graph, |V| = N + std::vector partitionedSubgraphs; + label_t* h_labels; // labels for classification. Single-class: Nx1, multi-class: NxE + float_t* h_feats; // input features: N x D #ifdef __GALOIS_HET_CUDA__ label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device @@ -31,11 +35,6 @@ class DistContext { #else galois::graphs::GluonSubstrate* syncSubstrate; #endif - DGraph* partitionedGraph; // the input graph, |V| = N - std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. 
Single-class label: Nx1, - // multi-class label: NxE - float_t* h_feats; // input features: N x D std::vector h_labels_subg; // labels for subgraph std::vector h_feats_subg; // input features for subgraph std::vector normFactors; // normalization constant based on graph structure @@ -46,7 +45,10 @@ class DistContext { public: // TODO better constructor DistContext(); - DistContext(bool isDevice) : is_device(isDevice) {} + DistContext(bool isDevice) : is_device(isDevice), is_selfloop_added(false), + usingSingleClass(true), dataset(""), + num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} ~DistContext(); size_t read_graph(std::string dataset_str, bool selfloop = false); diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index b48315f00f..7026ee623d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -201,10 +201,7 @@ class Net { int num_subg_remain = 0; if (subgraph_sample_size) { -// TOOD this needs to be enabled -#ifndef __GALOIS_HET_CUDA__ distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); -#endif allocateSubgraphsMasks(num_subgraphs); std::cout << header << "Constructing training vertex set induced graph...\n"; @@ -261,8 +258,8 @@ class Net { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - std::cout << "Subgraph num_vertices: " << subgraphNumVertices - << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); } @@ -416,7 +413,11 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef __GALOIS_HET_CUDA__ + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); } layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 320cc75b7f..4a9087b0b3 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,7 +3,10 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() : usingSingleClass(true) {} +DistContext::DistContext() : DistContext(false) { + syncSubstrate = NULL; +} + DistContext::~DistContext() {} void DistContext::saveDistGraph(DGraph* a) { diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index 7542849cef..b67f0f9125 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -64,6 +64,12 @@ cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; curandGenerator_t DistContext::curand_generator_ = 0; DistContext::DistContext() : DistContext(true) { + d_labels = NULL; + d_feats = NULL; + d_labels_subg = NULL; + d_feats_subg = NULL; + d_normFactors = NULL; + d_normFactorsSub = NULL; CUBLAS_CHECK(cublasCreate(&cublas_handle_)); CUSPARSE_CHECK(cusparseCreate(&cusparse_handle_)); CUSPARSE_CHECK(cusparseCreateMatDescr(&cusparse_matdescr_)); @@ -86,10 +92,12 @@ DistContext::~DistContext() { CUSPARSE_CHECK(cusparseDestroyMatDescr(cusparse_matdescr_)); if (curand_generator_) 
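  // Cleanup note: besides the cuBLAS/cuSPARSE/cuRAND handles created in the
  // constructor, the destructor now also releases the per-subgraph device
  // buffers (d_labels_subg, d_feats_subg, d_normFactorsSub) allocated by the
  // constructSubgraph* / constructNormFactorSub routines further down in this
  // file; the null checks below keep those frees safe when sampling is unused.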
CURAND_CHECK(curandDestroyGenerator(curand_generator_)); - if (d_labels) - CUDA_CHECK(cudaFree(d_labels)); - if (d_feats) - CUDA_CHECK(cudaFree(d_feats)); + if (d_labels) CUDA_CHECK(cudaFree(d_labels)); + if (d_feats) CUDA_CHECK(cudaFree(d_feats)); + if (d_normFactors) CUDA_CHECK(cudaFree(d_normFactors)); + if (d_labels_subg) CUDA_CHECK(cudaFree(d_labels_subg)); + if (d_feats_subg) CUDA_CHECK(cudaFree(d_feats_subg)); + if (d_normFactorsSub) CUDA_CHECK(cudaFree(d_normFactorsSub)); } size_t DistContext::read_labels(bool isSingleClass, std::string dataset_str) { @@ -107,6 +115,15 @@ size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, s return reader.read_masks(mask_type, n, begin, end, masks); } +//! allocate memory for subgraphs (don't actually build them) +void DistContext::allocateSubgraphs(int num_subgraphs, unsigned max_size) { + this->partitionedSubgraphs.resize(num_subgraphs); + for (int i = 0; i < num_subgraphs; i++) { + this->partitionedSubgraphs[i] = new Graph(); + this->partitionedSubgraphs[i]->set_max_size(max_size); + } +} + void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { size_t labels_size = m; if (!usingSingleClass) labels_size = m * num_classes; @@ -126,6 +143,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { } void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { + //std::cout << "construct subgraph features (d_feats_subg: " << d_feats_subg << ") ... "; size_t count = 0; DistContext::h_feats_subg.resize(m * feat_len); for (size_t i = 0; i < this->partitionedGraph->size(); i++) { @@ -137,9 +155,27 @@ void DistContext::constructSubgraphFeatures(size_t m, const mask_t* masks) { if (d_feats_subg) float_free_device(d_feats_subg); float_malloc_device(m * feat_len, d_feats_subg); float_copy_device(m * feat_len, &h_feats_subg[0], d_feats_subg); + //std::cout << "Done\n"; } void DistContext::constructNormFactorSub(int subgraphID) { + Graph& graphToUse = *partitionedSubgraphs[subgraphID]; + auto n = graphToUse.size(); + //std::cout << "Pre-computing subgraph normalization factor (n=" << n << ") ... 
"; + + #ifdef USE_CUSPARSE + auto nnz = graphToUse.sizeEdges(); + float_malloc_device(nnz, d_normFactorsSub); + init_const_gpu(nnz, 0.0, d_normFactors); + norm_factor_computing_edge<<>>( + n, graphToUse, d_normFactorsSub); +#else + float_malloc_device(n, d_normFactorsSub); + norm_factor_computing_node<<>>( + n, graphToUse, d_normFactorsSub); +#endif + CudaTest("solving norm_factor_computing kernel failed"); + //std::cout << "Done\n"; } void DistContext::constructNormFactor(deepgalois::Context* globalContext) { diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 8e8ce83267..2921b81996 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -148,7 +148,8 @@ acc_t masked_f1_score_gpu(int num_classes, int begin, int end, int count, namespace deepgalois { void Net::allocateSubgraphsMasks(int num_subgraphs) { - CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); + subgraphs_masks = new mask_t[distNumSamples * num_subgraphs]; + //CUDA_CHECK(cudaMalloc((void**)&subgraphs_masks, distNumSamples * num_subgraphs * sizeof(mask_t))); } void Net::partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel) { diff --git a/libdeepgalois/src/Sampler.cu b/libdeepgalois/src/Sampler.cu index 97835ea9cc..1cdfc49e32 100644 --- a/libdeepgalois/src/Sampler.cu +++ b/libdeepgalois/src/Sampler.cu @@ -20,7 +20,7 @@ __global__ void set_masks(index_t n, index_t* vertices, mask_t* masks) { __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, index_t* degrees) { CUDA_KERNEL_LOOP(src, n) { - if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); + //if (src < 10) printf("masks[%d] = %d\n", src, masks[src]); degrees[src] = 0; if (masks[src] == 1) { for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { @@ -29,7 +29,7 @@ __global__ void get_masked_degrees(index_t n, mask_t* masks, GraphGPU g, degrees[src]++; } } - if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); + //if (src < 10) printf("degrees[%d] = %d\n", src, degrees[src]); } } @@ -96,7 +96,7 @@ void Sampler::indexing(size_t n, index_t* vertices, index_t* new_indices) { template void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* subg) { - std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; + //std::cout << "Original graph size: " << g->size() << " edges: " << g->sizeEdges() << "\n"; index_t *degrees, *offsets; CUDA_CHECK(cudaMalloc((void**)°rees, sizeof(index_t)*n)); get_masked_degrees<<>>(n, masks, *g, degrees); @@ -105,7 +105,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, &offsets[n], sizeof(index_t), cudaMemcpyDeviceToHost)); - std::cout << "maskedSG num_edges " << ne << "\n"; + //std::cout << "maskedSG num_edges " << ne << "\n"; subg->allocateFrom(n, ne); // TODO: avoid reallocation generate_masked_graph_kernel<<>>(n, masks, offsets, *g, *subg); CUDA_CHECK(cudaFree(offsets)); @@ -117,7 +117,7 @@ void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* s void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* sub) { index_t n = partGraph->size(); auto nv = vertex_set.size(); - std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; + //std::cout << "g size: " << n << " sg sizes: " << nv << "\n"; // convert the vertex_set to a vertex_list and copy it to the device VertexList 
vertex_list(vertex_set.begin(), vertex_set.end()); index_t* d_vertex_list; @@ -135,7 +135,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s //CudaTest("solving set_masks kernel failed"); GraphGPU masked_sg; // size is the same as original graph, but masked dst removed getMaskedGraph(n, d_masks, partGraph, &masked_sg); // remove edges whose destination is not masked - std::cout << "maskedGraph generated\n"; + //std::cout << "maskedGraph generated\n"; // re-index the subgraph index_t* d_new_ids; @@ -154,7 +154,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s CUDA_CHECK(cudaFree(degrees)); index_t ne; CUDA_CHECK(cudaMemcpy(&ne, offsets+nv, sizeof(index_t), cudaMemcpyDeviceToHost)); - std::cout << "subgraph num_edges " << ne << "\n"; + //std::cout << "subgraph num_edges " << ne << "\n"; // allocate memory for the subgraph sub->allocateFrom(nv, ne); // avoid reallocation @@ -162,7 +162,7 @@ void Sampler::generateSubgraph(VertexSet &vertex_set, mask_t* masks, GraphGPU* s generate_graph_kernel<<>>(nv, offsets, d_vertex_list, d_new_ids, masked_sg, *sub); CudaTest("solving generate_graph kernel failed"); CUDA_CHECK(cudaFree(offsets)); - std::cout << "Subgraph generated\n"; + //std::cout << "Subgraph generated\n"; } } // namespace deepgalois diff --git a/libgpu/include/graph_gpu.h b/libgpu/include/graph_gpu.h index 4ddf57b950..d208a3328c 100644 --- a/libgpu/include/graph_gpu.h +++ b/libgpu/include/graph_gpu.h @@ -139,7 +139,7 @@ struct CSRGraph { delete edge_dst; edge_dst = new_edge_dst; nedges += nnodes; - printf("nnodes = %d, nedges = %d\n", nnodes, nedges); + printf("nnodes = %d, nedges = %d\n", nnodes, nedges); //print_neighbors(nnodes-1); //print_neighbors(0); } @@ -184,7 +184,7 @@ struct CSRGraph { nnodes = nv; nedges = ne; if (max_size < nnodes) max_size = nnodes; - printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); + //printf("allocating memory on gpu nnodes %d nedges %d\n", max_size, nedges); if (need_realloc) { if (edge_dst) free_index_device(edge_dst); malloc_index_device(nedges, edge_dst); From a647e0ec321f51bc960c1025efd7a3c1072f2926 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 18 May 2020 16:50:57 -0500 Subject: [PATCH 314/660] GNN-OEC policy: test vertices evenly among hosts --- libcusp/include/galois/graphs/BasePolicies.h | 17 ++++- .../galois/graphs/GenericPartitioners.h | 47 ++++++++++++++ libcusp/include/galois/graphs/NewGeneric.h | 58 +++++++++++++++++ lonestar/gnn/include/DistributedGraphLoader.h | 63 +++++++------------ lonestar/gnn/src/DistributedGraphLoader.cpp | 28 ++++----- 5 files changed, 154 insertions(+), 59 deletions(-) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 1b1fcd8aa4..511804276e 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -65,6 +65,8 @@ class PartitioningScaffold { void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; } + + bool predeterminedMapping(std::vector&) { return false; } }; /** @@ -149,8 +151,13 @@ class CustomMasterAssignment : public PartitioningScaffold { char _status; //!< Specifies what phase of master assignment partitioner is on //! Metadata for determining where a node's master is std::vector _localNodeToMaster; - //! Map GID to its master + //! Map GID to its master; only for nodes we own std::unordered_map _gid2masters; + //! 
Unlike gid2masters, this contains a mapping in vector form of ALL mappings + //! for all nodes in the graph instead of just local ones; only used if it is + //! known exactly where everything ends up before partitioning + std::vector _globalHostMap; + //! This host's node offset (each host reads a distinct contiguous portion //! of graph uint64_t _nodeOffset; @@ -183,6 +190,8 @@ class CustomMasterAssignment : public PartitioningScaffold { * mapping is not found but instead returns -1 if in stage 1, else * fails. * + * ONLY WORKS IF GID IS ON LOCAL HOST ELSE WILL FAIL + * * @param gid GID to get master of * @returns Master of specified GID, -1, unsigned, if not found */ @@ -202,11 +211,13 @@ class CustomMasterAssignment : public PartitioningScaffold { } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, " not found!"); + galois::gDebug("[", _hostID, "] ", gid, + " not found for retrieveMaster!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " - "of master assignment phase"); + "of master assignment phase; that or passed in gid that" + " doesn't exist on this host"); } return (uint32_t)-1; } diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index db73b84525..f1a0809f37 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -909,4 +909,51 @@ class SugarColumnFlipP : public galois::graphs::CustomMasterAssignment { } }; +class GnnOEC : public galois::graphs::CustomMasterAssignment { +public: + GnnOEC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges){}; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + //! 
outgoing edge cut + uint32_t getEdgeOwner(uint32_t src, uint32_t, uint64_t) const { + return retrieveMaster(src); + } + + bool noCommunication() { return false; } + bool isVertexCut() const { return false; } + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(0u, 0u); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index efbf657670..ac17e25aed 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -73,6 +73,28 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; + std::vector getGNNBreakpoints(std::string filename) { + // contains 2 numbers: begin and end of test + // everything else can be split evenly among hosts as they are not + // performance critical + std::vector bps; + + // if through all possible GNN outputs + if (filename.find("cora") != std::string::npos) { + bps.push_back(0); + bps.push_back(140); + } else if (filename.find("reddit") != std::string::npos) { + bps.push_back(0); + bps.push_back(153431); + } else if (filename.find("ppi") != std::string::npos) { + bps.push_back(0); + bps.push_back(9716); + } + // TODO hardcode the rest + + return bps; + } + public: //! typedef for base DistGraph class using base_DistGraph = DistGraph; @@ -173,6 +195,7 @@ class NewDistGraphGeneric : public DistGraph { } galois::graphs::OfflineGraph g(filename); + base_DistGraph::numGlobalNodes = g.size(); base_DistGraph::numGlobalEdges = g.sizeEdges(); std::vector dummy; @@ -190,6 +213,41 @@ class NewDistGraphGeneric : public DistGraph { // TODO abstract this away somehow graphPartitioner->saveGIDToHost(base_DistGraph::gid2host); + // get training nodes and split evenly among hosts + std::vector trainPoints = this->getGNNBreakpoints(filename); + if (!trainPoints.empty()) { + std::vector testDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, g, trainPoints[0], trainPoints[1]); + + std::vector restDistribution = + galois::graphs::determineUnitRangesFromPrefixSum( + base_DistGraph::numHosts, g, trainPoints[1], g.size()); + + // create global distribution of edges + std::vector mappings(g.size()); + galois::do_all( + galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), + [&](size_t h) { + // test + uint32_t hCur = testDistribution[h]; + uint32_t hEnd = testDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + // the rest + hCur = restDistribution[h]; + hEnd = restDistribution[h + 1]; + for (; hCur < hEnd; hCur++) { + mappings[hCur] = h; + } + }); + bool validPart = graphPartitioner->predeterminedMapping(mappings); + if (!validPart) { + galois::gWarn("partitioning policy used doesn't use trainpoints"); + } + } + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = g.edge_begin(nodeBegin); diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index 7827c1a39f..f3755a886f 100644 --- 
a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -1,7 +1,7 @@ /* - * This file belongs to the Galois project, a C++ library for exploiting parallelism. - * The code is being released under the terms of the 3-Clause BSD License (a - * copy is located in LICENSE.txt at the top-level directory). + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS @@ -42,17 +42,18 @@ namespace graphs { //! enums of partitioning schemes supported enum PARTITIONING_SCHEME { - OEC, //!< outgoing edge cut - IEC, //!< incoming edge cut - HOVC, //!< outgoing hybrid vertex cut - HIVC, //!< incoming hybrid vertex cut - CART_VCUT, //!< cartesian vertex cut - CART_VCUT_IEC, //!< cartesian vertex cut using iec - GINGER_O, //!< Ginger, outgoing - GINGER_I, //!< Ginger, incoming - FENNEL_O, //!< Fennel, oec - FENNEL_I, //!< Fennel, iec - SUGAR_O //!< Sugar, oec + OEC, //!< outgoing edge cut + IEC, //!< incoming edge cut + HOVC, //!< outgoing hybrid vertex cut + HIVC, //!< incoming hybrid vertex cut + CART_VCUT, //!< cartesian vertex cut + CART_VCUT_IEC, //!< cartesian vertex cut using iec + GINGER_O, //!< Ginger, outgoing + GINGER_I, //!< Ginger, incoming + FENNEL_O, //!< Fennel, oec + FENNEL_I, //!< Fennel, iec + SUGAR_O, //!< Sugar, oec + GNN_OEC //!< gnn, oec }; /** @@ -85,6 +86,8 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "fennel-iec"; case SUGAR_O: return "sugar-oec"; + case GNN_OEC: + return "gnn-oec"; default: GALOIS_DIE("Unsupported partition"); } @@ -121,8 +124,7 @@ namespace graphs { * loaded based on command line arguments */ template -DistGraph* -constructSymmetricGraph(std::vector&) { +DistGraph* constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); @@ -130,36 +132,19 @@ constructSymmetricGraph(std::vector&) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case GINGER_O: - case GINGER_I: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case FENNEL_O: - case FENNEL_I: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); - - case SUGAR_O: - return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, "" - ); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case GNN_OEC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 7c309dedc2..71953ea53e 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -1,7 
+1,7 @@ /* - * This file belongs to the Galois project, a C++ library for exploiting parallelism. - * The code is being released under the terms of the 3-Clause BSD License (a - * copy is located in LICENSE.txt at the top-level directory). + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). * * Copyright (C) 2019, The University of Texas at Austin. All rights reserved. * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS @@ -32,17 +32,11 @@ namespace cll = llvm::cl; cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), - cll::values( - clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), - clEnumValN(IEC, "iec", "Incoming Edge-Cut"), - clEnumValN(HOVC, "hovc", "Outgoing Hybrid Vertex-Cut"), - clEnumValN(HIVC, "hivc", "Incoming Hybrid Vertex-Cut"), - clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), - clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), - //clEnumValN(CEC, "cec", "Custom edge cut from vertexID mapping"), - clEnumValN(GINGER_O, "ginger-o", "ginger, outgiong edges, using CuSP"), - clEnumValN(GINGER_I, "ginger-i", "ginger, incoming edges, using CuSP"), - clEnumValN(FENNEL_O, "fennel-o", "fennel, outgoing edge cut, using CuSP"), - clEnumValN(FENNEL_I, "fennel-i", "fennel, incoming edge cut, using CuSP"), - clEnumValN(SUGAR_O, "sugar-o", "fennel, incoming edge cut, using CuSP")), - cll::init(OEC)); + cll::values(clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", + "Cartesian Vertex-Cut of iec"), + clEnumValN(GNN_OEC, "g-oec", + "gnn oec: train nodes evenly distributed")), + cll::init(GNN_OEC)); From 18d102c8923983ed53d0bf91586177af4146cc43 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 May 2020 13:16:45 -0500 Subject: [PATCH 315/660] gnn eneable dist galoi s-> galois use dist --- libdeepgalois/CMakeLists.txt | 2 +- libdeepgalois/include/deepgalois/layers/graph_conv_layer.h | 2 -- libdeepgalois/include/deepgalois/types.h | 2 -- lonestar/gnn/CMakeLists.txt | 2 +- lonestar/gnn/gat/CMakeLists.txt | 2 +- lonestar/gnn/gcn/CMakeLists.txt | 2 +- lonestar/gnn/gin/CMakeLists.txt | 2 +- lonestar/gnn/gin/gin.cpp | 2 -- lonestar/gnn/include/engine.h | 7 ------- lonestar/gnn/sage/CMakeLists.txt | 2 +- 10 files changed, 6 insertions(+), 19 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index a022a36655..32c5fa0212 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -106,7 +106,7 @@ target_include_directories(dg_cpu PUBLIC ) # dist galois setup/linking to dg_cpu -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(dg_cpu galois_dist_async galois_cusp galois_gluon) target_include_directories(dg_cpu PUBLIC ${CMAKE_SOURCE_DIR}/libdist/include diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index 216b7e1935..a02beebd57 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,9 +1,7 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" -#ifdef GALOIS_USE_DIST #include "deepgalois/layers/GraphConvSyncStructures.h" -#endif /** * 
GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index e3165abc8a..43d55eb331 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -43,7 +43,6 @@ enum class net_phase { train, test }; #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) #define USE_CUSPARSE -#ifdef GALOIS_USE_DIST namespace deepgalois { // TODO only being used by graph conv layer at the moment so extern works, // but this design is bad and needs to be revisited @@ -54,6 +53,5 @@ extern float_t* _dataToSync; //! sync extern long unsigned _syncVectorSize; } // namespace deepgalois -#endif #endif diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 40eac53052..0020736fee 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -17,7 +17,7 @@ if(ENABLE_HETERO_GALOIS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") endif() -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) add_library(distgraphloader STATIC src/DistributedGraphLoader.cpp) target_include_directories(distgraphloader PUBLIC include) target_link_libraries(distgraphloader galois_cusp LLVMSupport) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt index f9f1efdc6f..8d172ac154 100644 --- a/lonestar/gnn/gat/CMakeLists.txt +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -8,7 +8,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(gat PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gat PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gat PRIVATE distgraphloader) endif() endif() diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index fc5f134d76..9ed4ef97d9 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -11,7 +11,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(gcn PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(gcn PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gcn PRIVATE distgraphloader) endif() endif() diff --git a/lonestar/gnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt index f32f47179e..5d63e3d0d7 100644 --- a/lonestar/gnn/gin/CMakeLists.txt +++ b/lonestar/gnn/gin/CMakeLists.txt @@ -1,6 +1,6 @@ app(gin gin.cpp) target_link_libraries(gin dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(gin distgraphloader) endif() if(ENABLE_HETERO_GALOIS) diff --git a/lonestar/gnn/gin/gin.cpp b/lonestar/gnn/gin/gin.cpp index aecfcf9b35..4eb8835214 100644 --- a/lonestar/gnn/gin/gin.cpp +++ b/lonestar/gnn/gin/gin.cpp @@ -1,9 +1,7 @@ // Graph Neural Networks // Xuhao Chen #include "lonestargnn.h" -#ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" -#endif const char* name = "Graph Isomorphism Network (GIN)"; const char* desc = "Graph isomorphism neural networks on an undirected graph"; diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 7de3350399..cf39ce95f3 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,12 +1,9 @@ // Execution engine #include #include -#ifdef GALOIS_USE_DIST #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" -#endif -#include "galois/Galois.h" #include "galois/Version.h" #include "galois/Timer.h" #include "deepgalois/Net.h" @@ -28,10 +25,8 @@ void LonestarGnnStart(int 
argc, char** argv, const char* app, const char* desc, numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU #endif -#ifdef GALOIS_USE_DIST auto& net = galois::runtime::getSystemNetworkInterface(); if (net.ID == 0) { -#endif LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; @@ -51,9 +46,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } galois::runtime::reportParam("(NULL)", "CommandLine", cmdout.str()); galois::runtime::reportParam("(NULL)", "Threads", numThreads); -#ifdef GALOIS_USE_DIST } -#endif char name[256]; gethostname(name, 256); diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt index 94b6d234b7..ee95292588 100644 --- a/lonestar/gnn/sage/CMakeLists.txt +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -8,7 +8,7 @@ if(ENABLE_HETERO_GALOIS) target_link_libraries(sage PRIVATE -lcudart -lcublas -lcurand -lcudadevrt) else() target_link_libraries(sage PRIVATE dg_cpu) -if(ENABLE_DIST_GALOIS) +if(GALOIS_ENABLE_DIST) target_link_libraries(sage PRIVATE distgraphloader) endif() endif() From 6659aa0356393bf5657bc268509afb9997ab757f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 May 2020 13:33:53 -0500 Subject: [PATCH 316/660] heterogalois -> eneable gpu; TODO fix moderngpu --- CMakeLists.txt | 18 ++++++++--------- libdeepgalois/CMakeLists.txt | 12 +++++------ .../include/deepgalois/DistContext.h | 6 +++--- libdeepgalois/include/deepgalois/GraphTypes.h | 4 ++-- libdeepgalois/include/deepgalois/Net.h | 20 +++++++++---------- .../deepgalois/layers/GradientSyncStructs.h | 2 +- .../layers/GraphConvSyncStructures.h | 2 +- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 14 ++++++------- libdeepgalois/include/deepgalois/lgraph.h | 2 +- libdeepgalois/include/deepgalois/optimizer.h | 18 ++++++++--------- libdeepgalois/src/reader.cpp | 2 +- libdeepgalois/src/utils.cpp | 2 +- lonestar/gnn/CMakeLists.txt | 6 +++--- lonestar/gnn/gat/CMakeLists.txt | 2 +- lonestar/gnn/gcn/CMakeLists.txt | 4 ++-- lonestar/gnn/gin/CMakeLists.txt | 2 +- lonestar/gnn/include/engine.h | 6 +++--- lonestar/gnn/sage/CMakeLists.txt | 2 +- 20 files changed, 63 insertions(+), 65 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f08d91e6fb..02e2aca6b6 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,7 +254,14 @@ if(USE_DEEPGALOIS) endif(USE_DEEPGALOIS) if (GALOIS_ENABLE_GPU) - # TODO may require cleanup + enable_language(CUDA) + string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "," ";" GENCODES ${GENCODES}) + foreach(GENCODE ${GENCODES}) + add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") + endforeach() + add_subdirectory(libgpu) + if (USE_DEEPGALOIS) SET(CUDA_SEPARABLE_COMPILATION ON) find_package(CUDA REQUIRED) @@ -279,15 +286,6 @@ if (GALOIS_ENABLE_GPU) #find_package(OpenCL REQUIRED) endif() - - enable_language(CUDA) - string(REPLACE "." 
"" GENCODES ${CUDA_CAPABILITY}) - string(REPLACE "," ";" GENCODES ${GENCODES}) - foreach(GENCODE ${GENCODES}) - add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") - endforeach() - - add_subdirectory(libgpu) endif() add_subdirectory(libpangolin) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 32c5fa0212..1abc692a9f 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -25,9 +25,9 @@ include_directories(${CMAKE_SOURCE_DIR}/libgalois/include) include_directories(${CMAKE_CURRENT_SOURCE_DIR}/include) link_directories(${CMAKE_SOURCE_DIR}/libgalois) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) # hetero path - set(CUDA_NVCC_FLAGS "-D__GALOIS_HET_CUDA__ ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers @@ -71,8 +71,8 @@ endif() set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") -if(ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) else() set(sources @@ -94,7 +94,7 @@ else() src/node.cpp src/Net.cpp ) -endif(ENABLE_HETERO_GALOIS) +endif(GALOIS_ENABLE_GPU) add_library(dg_cpu STATIC ${sources}) target_link_libraries(dg_cpu galois_shmem) @@ -114,7 +114,7 @@ if(GALOIS_ENABLE_DIST) ${CMAKE_SOURCE_DIR}/libgluon/include ) - if(ENABLE_HETERO_GALOIS) + if(GALOIS_ENABLE_GPU) target_link_libraries(dg_gpu galois_dist_async galois_cusp galois_gluon) target_include_directories(dg_gpu PUBLIC ${CMAKE_SOURCE_DIR}/libdist/include diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index 332eddb3ba..c614a92ca2 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -1,6 +1,6 @@ #ifndef __DG_DIST_CONTEXT__ #define __DG_DIST_CONTEXT__ -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU #include "deepgalois/cutils.h" #else #include "galois/graphs/GluonSubstrate.h" @@ -25,7 +25,7 @@ class DistContext { std::vector partitionedSubgraphs; label_t* h_labels; // labels for classification. 
Single-class: Nx1, multi-class: NxE float_t* h_feats; // input features: N x D -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device float_t* d_feats; // input features on device @@ -68,7 +68,7 @@ class DistContext { Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; void initializeSyncSubstrate(); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU float_t* get_feats_ptr() { return d_feats; } float_t* get_feats_subg_ptr() { return d_feats_subg; } label_t* get_labels_ptr() { return d_labels; } diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 2ee3f86b93..1528375290 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -3,7 +3,7 @@ #include "deepgalois/types.h" #include "deepgalois/lgraph.h" -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU #define USE_CSRGRAPH #ifdef USE_CSRGRAPH #include "graph_gpu.h" @@ -16,7 +16,7 @@ namespace deepgalois { using edge_iterator = index_t; using GraphCPU = LearningGraph; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU using DGraph = CSRGraph; using Graph = CSRGraph; using GraphGPU = CSRGraph; diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 7026ee623d..e17a9f9b76 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -22,7 +22,7 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU unsigned myID = 0; #else unsigned myID = galois::runtime::getSystemNetworkInterface().ID; @@ -101,7 +101,7 @@ class Net { h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU this->myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; @@ -296,7 +296,7 @@ class Net { //////////////////////////////////////////////////////////////////////////////// // training steps -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); @@ -325,7 +325,7 @@ class Net { // validation / testing set_netphases(net_phase::test); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss << " train_acc " << train_acc << " "; #else @@ -341,7 +341,7 @@ class Net { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "val_loss " << std::setprecision(3) << std::fixed << val_loss << " val_acc " << val_acc << " "; std::cout << header << "time " << std::setprecision(3) << std::fixed @@ -355,7 +355,7 @@ class Net { " val_time ", val_time, ")\n"); #endif } else { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << header << "train_time " << std::fixed << epoch_time << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else @@ -367,7 +367,7 @@ class Net { double avg_train_time = total_train_time / 
(double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU std::cout << "Average training time per epoch: " << avg_train_time << "ms. Throughput " << throughput << " epoch/s\n"; #else @@ -413,7 +413,7 @@ class Net { for (size_t i = 0; i < num_layers; i++) layers[i]->update_dim_size(distNumSamples); for (size_t i = 0; i < num_conv_layers; i++) { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU layers[i]->set_graph_ptr(distContext->getGraphPointer()); #else layers[i]->set_graph_ptr(distContext->getLGraphPointer()); @@ -423,7 +423,7 @@ class Net { layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data } -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU if (type == "train") { gMasks = d_train_masks; } else if (type == "val") { @@ -552,7 +552,7 @@ class Net { out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, dropout_rate, in_dims, out_dims); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); #else layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index c962f20004..9b325311b7 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -1,4 +1,4 @@ -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #ifndef __GRAD_SYNC_STRUCT__ #define __GRAD_SYNC_STRUCT__ diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 7c3c038d15..95e09b1c0d 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,4 +1,4 @@ -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 142812c6ba..3f2d3c7f1b 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -2,7 +2,7 @@ #include "deepgalois/types.h" //! For each node in the graph, add the embeddings of all of its neighbors //! 
together (using norm_factor if specified) -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #include "deepgalois/GraphTypes.h" namespace deepgalois { // TODO template arg diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index a02beebd57..ad954215fc 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -51,7 +51,7 @@ class graph_conv_layer : public layer { virtual void back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad); // user-defined aggregate function -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU virtual void aggregate(size_t len, Graph& g, const float_t* in, float_t* out); void d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out); #else diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index b21adefea1..02b5abebb4 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -15,7 +15,7 @@ #include "deepgalois/layers/node.h" #include "deepgalois/DistContext.h" -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" #include "deepgalois/layers/GradientSyncStructs.h" @@ -41,7 +41,7 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - #ifndef __GALOIS_HET_CUDA__ + #ifndef GALOIS_ENABLE_GPU const std::string header = "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + "] "; @@ -70,7 +70,7 @@ class layer : public deepgalois::node { label_t* labels; float_t* norm_consts; // TODO -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU GraphGPU* graph_gpu; #else Graph* graph_cpu; @@ -89,7 +89,7 @@ class layer : public deepgalois::node { virtual void malloc_and_init() {} void print_layer_info() { //! debug print function unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() @@ -120,7 +120,7 @@ class layer : public deepgalois::node { void set_norm_consts_ptr(float_t* ptr) { norm_consts = ptr; } void set_feats_ptr(float_t* ptr) { prev_->set_data(ptr); } void set_name(std::string name) { name_ = name; } // name metadata -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU void set_graph_ptr(Graph* ptr) { graph_cpu = ptr; } #else void set_graph_ptr(GraphGPU* ptr) { graph_gpu = ptr; } @@ -145,7 +145,7 @@ class layer : public deepgalois::node { use_mask = false; if (masks != NULL) { use_mask = true; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU masks_ = masks; #else d_masks_ = masks; @@ -183,7 +183,7 @@ class layer : public deepgalois::node { //! use optimizer to update weights given gradient (weight_grad) void update_weight(deepgalois::optimizer* opt) { -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU // parallelize only when target size is big enough to mitigate thread // spawning overhead. 
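    // Note on the optimizer interface used here (declared in
    // deepgalois/optimizer.h; the concrete update rules live in
    // optimizer.cpp/.cu and are not shown in this hunk): on the CPU path the
    // layer hands its accumulated gradient and weight vectors to
    // opt->update(weight_grad, W), while the GALOIS_ENABLE_GPU path calls
    // opt->update_gpu(n, dW, W) on raw device pointers. As a rough sketch,
    // assuming plain gradient_descent with learning rate alpha and weight
    // decay lambda, the update amounts to
    //   for (size_t i = 0; i < W.size(); i++)
    //     W[i] -= alpha * (dW[i] + lambda * W[i]);
    // stateful optimizers (momentum, adagrad, adam, ...) additionally keep
    // per-weight history through stateful_optimizer<N>::get<>().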
// bool parallel = (W.size() >= 512); diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index e0527b2161..2e086ebf88 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -127,7 +127,7 @@ class LearningGraph { index_t getEdgeDstHost(index_t eid) { return colidx_[eid]; } index_t edge_begin_host(index_t vid) { return rowptr_[vid]; } index_t edge_end_host(index_t vid) { return rowptr_[vid + 1]; } -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU index_t getEdgeDst(index_t eid) { return colidx_[eid]; } index_t edge_begin(index_t vid) { return rowptr_[vid]; } index_t edge_end(index_t vid) { return rowptr_[vid + 1]; } diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index 3a0139418e..ceb0f93ba0 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -30,7 +30,7 @@ struct optimizer { optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU virtual void update_gpu(const size_t n, const float_t* dW, float_t* W) = 0; #endif virtual void reset() {} // override to implement pre-learning action @@ -53,7 +53,7 @@ struct stateful_optimizer : public optimizer { return E_[Index][&key]; } std::unordered_map E_[N]; -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU template float_t* get_gpu(const size_t n, const float_t* key); std::unordered_map dE_[N]; @@ -70,7 +70,7 @@ struct stateful_optimizer : public optimizer { struct adagrad : public stateful_optimizer<1> { adagrad() : alpha(0.01), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -87,7 +87,7 @@ struct adagrad : public stateful_optimizer<1> { struct RMSprop : public stateful_optimizer<1> { RMSprop() : alpha(float_t(0.0001)), mu(float_t(0.99)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -103,7 +103,7 @@ struct adam : public stateful_optimizer<2> { : alpha(float_t(0.01)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(float_t(0.9)), b2_t(float_t(0.999)), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -128,7 +128,7 @@ struct adamax : public stateful_optimizer<2> { : alpha(float_t(0.002)), b1(float_t(0.9)), b2(float_t(0.999)), b1_t(b1), eps(float_t(1e-8)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -146,7 +146,7 @@ struct adamax : public stateful_optimizer<2> { struct gradient_descent : public optimizer { gradient_descent() : alpha(float_t(0.01)), lambda(float_t(0)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif float_t alpha; // learning rate @@ -164,7 +164,7 @@ struct momentum : public stateful_optimizer<1> { public: momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) 
{} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif @@ -185,7 +185,7 @@ struct nesterov_momentum : public stateful_optimizer<1> { nesterov_momentum() : alpha(float_t(0.01)), lambda(float_t(0)), mu(float_t(0.9)) {} void update(const vec_t& dW, vec_t& W); -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU void update_gpu(const size_t n, const float_t* dW, float_t* W); #endif diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index e4c110dd1e..016a72d26a 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -17,7 +17,7 @@ namespace deepgalois { // required. size_t Reader::read_labels(bool is_single_class, label_t*& labels) { unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; galois::gPrint("[", myID, "] Reader: Reading labels...\n"); #endif diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index db738dd2f3..10cd18832c 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -115,7 +115,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, : 0.; unsigned myID = 0; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index 0020736fee..f718db4942 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${CMAKE_SOURCE_DIR}/lonestar/gnn/include) include_directories(${CMAKE_SOURCE_DIR}/libdeepgalois/include) include_directories(${CUDA_HOME}/include) link_directories(${CUDA_HOME}/lib64) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) include_directories(${CMAKE_SOURCE_DIR}/libgpu/include) link_directories(${INTEL_LIBS_DIR}) endif() @@ -13,8 +13,8 @@ if(USE_MKL_BLAS) endif() link_directories(${BLAS_LIB_DIR}) -if(ENABLE_HETERO_GALOIS) - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -D__GALOIS_HET_CUDA__") +if(GALOIS_ENABLE_GPU) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") endif() if(GALOIS_ENABLE_DIST) diff --git a/lonestar/gnn/gat/CMakeLists.txt b/lonestar/gnn/gat/CMakeLists.txt index 8d172ac154..5fc85aa8a6 100644 --- a/lonestar/gnn/gat/CMakeLists.txt +++ b/lonestar/gnn/gat/CMakeLists.txt @@ -1,7 +1,7 @@ add_executable(gat gat.cpp) target_link_libraries(gat PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET gat PROPERTY CUDA_STANDARD 14) set_property(TARGET gat PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gat PRIVATE dg_gpu dg_cpu) diff --git a/lonestar/gnn/gcn/CMakeLists.txt b/lonestar/gnn/gcn/CMakeLists.txt index 9ed4ef97d9..7ad1ba6e29 100644 --- a/lonestar/gnn/gcn/CMakeLists.txt +++ b/lonestar/gnn/gcn/CMakeLists.txt @@ -1,10 +1,10 @@ -#if(ENABLE_HETERO_GALOIS) +#if(GALOIS_ENABLE_GPU) # set_source_files_properties(gcn.cpp PROPERTIES LANGUAGE CUDA) #endif() add_executable(gcn gcn.cpp) target_link_libraries(gcn PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET gcn PROPERTY CUDA_STANDARD 14) set_property(TARGET gcn PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(gcn PRIVATE dg_gpu dg_cpu) diff --git a/lonestar/gnn/gin/CMakeLists.txt b/lonestar/gnn/gin/CMakeLists.txt index 5d63e3d0d7..7e6027174a 
100644 --- a/lonestar/gnn/gin/CMakeLists.txt +++ b/lonestar/gnn/gin/CMakeLists.txt @@ -3,7 +3,7 @@ target_link_libraries(gin dg_cpu) if(GALOIS_ENABLE_DIST) target_link_libraries(gin distgraphloader) endif() -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) target_link_libraries(gin dg_gpu) target_link_libraries(gin -lcudart -lcublas -lcurand -lcudadevrt) endif() diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index cf39ce95f3..84aa5cbadd 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -21,7 +21,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); galois::runtime::setStatFile(statFile); -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU #endif @@ -54,7 +54,7 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, } int main(int argc, char** argv) { -#ifdef __GALOIS_HET_CUDA__ +#ifdef GALOIS_ENABLE_GPU galois::SharedMemSys G; #else galois::DistMemSys G; @@ -64,7 +64,7 @@ int main(int argc, char** argv) { // Get a partitioned graph first std::vector dummyVec; deepgalois::DGraph* dGraph = NULL; -#ifndef __GALOIS_HET_CUDA__ +#ifndef GALOIS_ENABLE_GPU dGraph = galois::graphs::constructSymmetricGraph(dummyVec); #endif // initialize network + whole context on CPU diff --git a/lonestar/gnn/sage/CMakeLists.txt b/lonestar/gnn/sage/CMakeLists.txt index ee95292588..b820f7024b 100644 --- a/lonestar/gnn/sage/CMakeLists.txt +++ b/lonestar/gnn/sage/CMakeLists.txt @@ -1,7 +1,7 @@ add_executable(sage sage.cpp) target_link_libraries(sage PRIVATE Galois::shmem lonestar) -if(ENABLE_HETERO_GALOIS) +if(GALOIS_ENABLE_GPU) set_property(TARGET sage PROPERTY CUDA_STANDARD 14) set_property(TARGET sage PROPERTY CUDA_SEPARABLE_COMPILATION ON) target_link_libraries(sage PRIVATE dg_gpu dg_cpu) From 893e44128e1a28b3de23b0fd9f53ff5756dfc5fd Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 19 May 2020 16:53:44 -0500 Subject: [PATCH 317/660] fix gpu compile errors --- libdeepgalois/CMakeLists.txt | 2 +- lonestar/gnn/include/engine.h | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 1abc692a9f..4f7d9f93ac 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -27,7 +27,7 @@ link_directories(${CMAKE_SOURCE_DIR}/libgalois) if(GALOIS_ENABLE_GPU) # hetero path - set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU ${CUDA_NVCC_FLAGS}") + set(CUDA_NVCC_FLAGS "-DGALOIS_ENABLE_GPU --extended-lambda ${CUDA_NVCC_FLAGS}") set(CUB_ROOT "${CMAKE_SOURCE_DIR}/cub") # only required headers include_directories("${CUB_ROOT}") set(MGPU_ROOT "${CMAKE_SOURCE_DIR}/moderngpu") # only required headers diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 84aa5cbadd..25644c720d 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -1,9 +1,13 @@ // Execution engine #include #include +#ifdef GALOIS_ENABLE_GPU +#include "galois/Galois.h" +#else #include "DistributedGraphLoader.h" #include "galois/DistGalois.h" #include "galois/runtime/Network.h" +#endif #include "galois/Version.h" #include "galois/Timer.h" #include "deepgalois/Net.h" @@ -21,12 +25,13 @@ void LonestarGnnStart(int argc, char** argv, const char* app, const char* desc, llvm::cl::ParseCommandLineOptions(argc, argv); galois::runtime::setStatFile(statFile); + unsigned hostID = 0; #ifndef 
GALOIS_ENABLE_GPU numThreads = galois::setActiveThreads(numThreads); // number of threads on CPU + hostID = galois::runtime::getSystemNetworkInterface().ID; #endif - auto& net = galois::runtime::getSystemNetworkInterface(); - if (net.ID == 0) { + if (hostID == 0) { LonestarGnnPrintVersion(llvm::outs()); std::cout << "Copyright (C) " << galois::getCopyrightYear() << " The University of Texas at Austin\n"; From c5a5ee56892f85943279f51e25e5676ca02f0156 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Mon, 1 Jun 2020 11:41:08 -0500 Subject: [PATCH 318/660] add src/Train.cpp --- libdeepgalois/CMakeLists.txt | 3 +- libdeepgalois/include/deepgalois/Net.h | 527 ++----------------------- libdeepgalois/src/Train.cpp | 509 ++++++++++++++++++++++++ 3 files changed, 540 insertions(+), 499 deletions(-) create mode 100644 libdeepgalois/src/Train.cpp diff --git a/libdeepgalois/CMakeLists.txt b/libdeepgalois/CMakeLists.txt index 4f7d9f93ac..44be89edad 100644 --- a/libdeepgalois/CMakeLists.txt +++ b/libdeepgalois/CMakeLists.txt @@ -73,7 +73,7 @@ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11") if(GALOIS_ENABLE_GPU) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DGALOIS_ENABLE_GPU") - set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp) + set(sources src/reader.cpp src/RandomWalk.cpp src/utils.cpp src/Train.cpp) else() set(sources src/layers/softmax_loss_layer.cpp @@ -91,6 +91,7 @@ else() src/reader.cpp src/lgraph.cpp src/utils.cpp + src/Train.cpp src/node.cpp src/Net.cpp ) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index e17a9f9b76..6c720f730d 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -91,374 +91,26 @@ class Net { Sampler* sampler; public: - Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, bool selfloop, - bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, - int val_itv) - : is_single_class(single), has_l2norm(l2norm), has_dense(dense), - neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), - val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { - // init some identifiers for this host -#ifndef GALOIS_ENABLE_GPU - this->myID = galois::runtime::getSystemNetworkInterface().ID; -#endif - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = " "; - - assert(n_conv > 0); - - //galois::gPrint(header, "Configuration: num_threads ", num_threads, - // ", num_conv_layers ", num_conv_layers, ", num_epochs ", - // num_epochs, ", hidden1 ", hidden1, ", learning_rate ", - // learning_rate, ", dropout_rate ", dropout_rate, - // ", weight_decay ", weight_decay, "\n"); - this->num_layers = num_conv_layers + 1; - - // additional layers to add - if (has_l2norm) - this->num_layers++; - if (has_dense) - this->num_layers++; - // initialize feature metadata - feature_dims.resize(num_layers + 1); - - // initialze global graph context - graphTopologyContext = new deepgalois::Context(); - graphTopologyContext->set_dataset(dataset_str); - // read *entire* graph, get num nodes - globalSamples = graphTopologyContext->read_graph(selfloop); - - // get training and validation sets: this is to create the training - // subgraph in the sampler - globalTrainMasks = new mask_t[globalSamples]; - globalValMasks = new mask_t[globalSamples]; - globalTestMasks = new 
mask_t[globalSamples]; - std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); - std::fill(globalValMasks, globalValMasks + globalSamples, 0); - - // reddit is hard coded - if (dataset_str == "reddit") { - this->globalTrainBegin = 0; - this->globalTrainCount = 153431; - this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; - this->globalValBegin = 153431; - this->globalValCount = 23831; - this->globalValEnd = this->globalValBegin + this->globalValCount; - - // TODO do all can be used below - for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) - globalTrainMasks[i] = 1; - for (size_t i = globalValBegin; i < globalValEnd; i++) - globalValMasks[i] = 1; - } else { - globalTrainCount = graphTopologyContext->read_masks( - "train", globalSamples, globalTrainBegin, globalTrainEnd, - globalTrainMasks); - globalValCount = graphTopologyContext->read_masks( - "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); - } - - // make sure sampel size isn't greater than what we have to train with - assert(subgraph_sample_size <= globalTrainCount); - - layers.resize(num_layers); - // hidden1 level embedding: 16 - for (size_t i = 1; i < num_conv_layers; i++) - feature_dims[i] = this->h1; - - // features are read in distcontext, not this context (this context only - // used for sampling) - if (subgraph_sample_size) - sampler = new deepgalois::Sampler(); - } - //! Default net constructor - // Net() - // : is_single_class(true), has_l2norm(false), has_dense(false), - // neighbor_sample_size(0), subgraph_sample_size(0), num_threads(1), - // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), - // num_epochs(0), learning_rate(0.0), dropout_rate(0.0), - // weight_decay(0.0), globalTrainBegin(0), globalTrainEnd(0), - // globalTrainCount(0), globalValBegin(0), globalValEnd(0), - // globalValCount(0), globalTestBegin(0), globalTestEnd(0), - // globalTestCount(0), val_interval(1), num_subgraphs(1), - // num_vertices_sg(9000), globalTrainMasks(NULL), globalValMasks(NULL), - // globalTestMasks(NULL), context(NULL) {} + Net() : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, + false, true, false, false, 25, 9000, 1) {} + //! Net constructor + Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv); + + // allocate memory for subgraph masks void allocateSubgraphsMasks(int num_subgraphs); //! 
Initializes metadata for the partition: loads data, labels, etc - void partitionInit(DGraph* graph, std::string dataset_str, - bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay - - void train(optimizer* opt, bool need_validate) { - double total_train_time = 0.0; - int num_subg_remain = 0; - - if (subgraph_sample_size) { - distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); - allocateSubgraphsMasks(num_subgraphs); - std::cout << header - << "Constructing training vertex set induced graph...\n"; - // auto gg = distContext->getGraphPointer(); - auto gg = - graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem - sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, - distContext->getGraphPointer()); - } - - //galois::gPrint(header, "Start training...\n"); - - Timer t_epoch; - - // run epochs - for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { - t_epoch.Start(); - - //////////////////////////////////////////////////////////////////////////////// - // Sampling - //////////////////////////////////////////////////////////////////////////////// - if (subgraph_sample_size) { - if (num_subg_remain == 0) { - std::cout << header << "Generating " << num_subgraphs - << " subgraph(s)\n"; - // TODO stat timer instead of this timer - Timer t_subgen; - t_subgen.Start(); - - // generate subgraphs - for (int sid = 0; sid < num_subgraphs; sid++) { - VertexSet sampledSet; - sampler->selectVertices(subgraph_sample_size, sampledSet, - curEpoch); // m = 1000 by default - sampler->generateSubgraph(sampledSet, - subgraphs_masks + sid * globalSamples, - distContext->getSubgraphPointer(sid)); - } - num_subg_remain = num_subgraphs; - t_subgen.Stop(); - // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; - } - // count their degrees - for (int i = 0; i < num_subgraphs; i++) { - auto sg_ptr = distContext->getSubgraphPointer(i); - sg_ptr->degree_counting(); - // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " - // num_e ", sg_ptr->sizeEdges(), "\n"); - } - - // choose a subgraph to use - num_subg_remain--; - int sg_id = num_subg_remain; - auto subgraphPointer = distContext->getSubgraphPointer(sg_id); - this->subgraphNumVertices = subgraphPointer->size(); - - //std::cout << "Subgraph num_vertices: " << subgraphNumVertices - // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; - for (size_t i = 0; i < num_layers; i++) { - layers[i]->update_dim_size(this->subgraphNumVertices); - } - - // TODO dist version where i need global degrees - // change normalization constants - distContext->constructNormFactorSub(sg_id); - for (size_t i = 0; i < num_conv_layers; i++) { - layers[i]->set_graph_ptr(subgraphPointer); - layers[i]->set_norm_consts_ptr( - distContext->get_norm_factors_subg_ptr()); - } - - // update labels for subgraph - distContext->constructSubgraphLabels( - this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); - layers[num_layers - 1]->set_labels_ptr( - distContext->get_labels_subg_ptr()); - - // update features for subgraph - distContext->constructSubgraphFeatures( - this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); - layers[0]->set_feats_ptr( - distContext->get_feats_subg_ptr()); // feed input data - - // Graph* testing = 
distContext->getSubgraphPointer(sg_id); - // for (size_t i = 0; i < testing->size(); i++) { - // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) - // { - // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); - // } - //} - } // end subgraph sample loop - //////////////////////////////////////////////////////////////////////////////// - - // training steps -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; -#else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); -#endif - set_netphases(net_phase::train); - acc_t train_loss = 0.0, train_acc = 0.0; - - //galois::gPrint(header, "Calling into eval for forward propagation\n"); - // forward: after this phase, layer edges will contain intermediate - // features for use during backprop - double fw_time = evaluate("train", train_loss, train_acc); - //evaluate("train", train_loss, train_acc); - - - //galois::gPrint(header, "Calling into backward propagation\n"); - // backward: use intermediate features + ground truth to update layers - // with feature gradients whcih are then used to calculate weight - // gradients - Net::bprop(); - - //galois::gPrint(header, "Weight update call\n"); - // gradient update: use gradients stored on each layer to update model - // for next epoch - Net::update_weights(opt); // update parameters - - // validation / testing - set_netphases(net_phase::test); - -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << " "; -#else - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); -#endif - t_epoch.Stop(); - - double epoch_time = t_epoch.Millisecs(); - total_train_time += epoch_time; - - if (need_validate && curEpoch % val_interval == 0) { - // Validation - acc_t val_loss = 0.0, val_acc = 0.0; - double val_time = evaluate("val", val_loss, val_acc); -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << " "; - std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; -#else - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); -#endif - } else { -#ifdef GALOIS_ENABLE_GPU - std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; -#else - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); -#endif - } - } // epoch loop - - double avg_train_time = total_train_time / (double)num_epochs; - double throughput = 1000.0 * (double)num_epochs / total_train_time; -#ifdef GALOIS_ENABLE_GPU - std::cout << "Average training time per epoch: " << avg_train_time - << "ms. Throughput " << throughput << " epoch/s\n"; -#else - galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); -#endif - } - - // evaluate, i.e. 
inference or predict - double evaluate(std::string type, acc_t& loss, acc_t& acc) { - Timer t_eval; - t_eval.Start(); - size_t gBegin = 0, gEnd = 0, gCount = 0; - mask_t* gMasks = NULL; - - // TODO global here good for dist case? - if (type == "train") { - gBegin = globalTrainBegin; - gEnd = globalTrainEnd; - gCount = globalTrainCount; - gMasks = globalTrainMasks; - if (subgraph_sample_size) { - // update gMasks for subgraph - gMasks = NULL; - gBegin = 0; - gEnd = this->subgraphNumVertices; - gCount = this->subgraphNumVertices; - } - } else if (type == "val") { - gBegin = globalValBegin; - gEnd = globalValEnd; - gCount = globalValCount; - gMasks = globalValMasks; - } else { - gBegin = globalTestBegin; - gEnd = globalTestEnd; - gCount = globalTestCount; - gMasks = globalTestMasks; - } - - // switch to the original graph if not training - if (subgraph_sample_size && type != "train") { - for (size_t i = 0; i < num_layers; i++) - layers[i]->update_dim_size(distNumSamples); - for (size_t i = 0; i < num_conv_layers; i++) { -#ifdef GALOIS_ENABLE_GPU - layers[i]->set_graph_ptr(distContext->getGraphPointer()); -#else - layers[i]->set_graph_ptr(distContext->getLGraphPointer()); -#endif - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); - } - layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); - layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data - } -#ifdef GALOIS_ENABLE_GPU - if (type == "train") { - gMasks = d_train_masks; - } else if (type == "val") { - gMasks = d_val_masks; - } else { - gMasks = d_test_masks; - } -#endif - - //galois::gPrint(header, "Doing actual forward propagation\n"); - loss = fprop(gBegin, gEnd, gCount, gMasks); - //galois::gPrint(header, - // "Forward propagation donne, going to check accuracy\n"); - float_t* predictions = layers[num_layers - 1]->next()->get_data(); - - // labels will be subgraph labels if applicable - label_t* localLabels; - if (type == "train" && subgraph_sample_size) { - localLabels = distContext->get_labels_subg_ptr(); - } else { - // note this grabs local labels - localLabels = distContext->get_labels_ptr(); - } - - if (is_single_class) { - acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, - localLabels); - } else { - acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, - predictions, localLabels); - } - - t_eval.Stop(); - return t_eval.Millisecs(); - } + void train(optimizer* opt, bool need_validate); + double evaluate(std::string type, acc_t& loss, acc_t& acc); //! 
read masks of test set for GLOBAL set void read_test_masks(std::string dataset); @@ -466,153 +118,32 @@ class Net { void readDistributedTestMasks(std::string dataset); // void copy_test_masks_to_device(); - - void construct_layers() { - // append conv layers - //galois::gPrint(header, "Constructing layers...\n"); - for (size_t i = 0; i < num_conv_layers - 1; i++) { - append_conv_layer(i, true); // conv layers, act=true - } - append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false - - if (has_l2norm) { - append_l2norm_layer(num_conv_layers); // l2_norm layer - } - if (has_dense) { - append_dense_layer(num_layers - 2); // dense layer - } - append_out_layer(num_layers - 1); // output layer - - // allocate memory for intermediate features and gradients - for (size_t i = 0; i < num_layers; i++) { - layers[i]->add_edge(); - } - for (size_t i = 1; i < num_layers; i++) { - connect(layers[i - 1], layers[i]); - } - for (size_t i = 0; i < num_layers; i++) { - layers[i]->malloc_and_init(); - } - - layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data - // precompute the normalization constant based on graph structure - // context->norm_factor_computing(false); - distContext->constructNormFactor(graphTopologyContext); - for (size_t i = 0; i < num_conv_layers; i++) - layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); - set_contexts(); - } + void construct_layers(); //! Add an l2_norm layer to the network - void append_l2norm_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = distNumSamples; - in_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); - } + void append_l2norm_layer(size_t layer_id); //! Add an dense layer to the network - void append_dense_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = distNumSamples; - in_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); - } + void append_dense_layer(size_t layer_id); //! Add an output layer to the network - void append_out_layer(size_t layer_id) { - assert(layer_id > 0); // can not be the first layer - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - - if (is_single_class) - layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); - else - layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); - - layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); - } + void append_out_layer(size_t layer_id); //! 
Add a convolution layer to the network void append_conv_layer(size_t layer_id, bool act = false, bool norm = true, - bool bias = false, bool dropout = true) { - assert(dropout_rate < 1.0); - assert(layer_id < num_conv_layers); - std::vector in_dims(2), out_dims(2); - in_dims[0] = out_dims[0] = distNumSamples; - in_dims[1] = get_in_dim(layer_id); - out_dims[1] = get_out_dim(layer_id); - layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); -#ifdef GALOIS_ENABLE_GPU - layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); -#else - layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); -#endif - } - - // update trainable weights after back-propagation - void update_weights(optimizer* opt) { - regularize(); - for (size_t i = 0; i < num_layers; i++) { - if (layers[i]->trainable()) { - layers[i]->update_weight(opt); - } - } - } - - //! forward propagation: [begin, end) is the range of samples used. - //! calls "forward" on each layer and returns the loss of the final layer - acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { - // set mask for the last layer; globals - // TODO this should be distirbuted sample gBegin->end not global; fix later - // seems to be unused in code right now anyways - //galois::gPrint(header, "fprop: set sample mask\n"); - layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); - - for (size_t i = 0; i < num_layers; i++) { - //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); - layers[i]->forward(); - } - - //galois::gPrint(header, "fprop: getting loss\n"); - // prediction error - acc_t loss = layers[num_layers - 1]->get_prediction_loss(); - // Squared Norm Regularization to mitigate overfitting - loss += weight_decay * layers[0]->get_weight_decay_loss(); - return loss; - } - - void bprop() { - for (size_t i = num_layers; i != 0; i--) { - layers[i - 1]->backward(); - } - } - - //! Save the context object to all layers of the network - void set_contexts() { - for (size_t i = 0; i < num_layers; i++) - layers[i]->set_context(distContext); - } - //! set netphases for all layers in this network - void set_netphases(net_phase phase) { - for (size_t i = 0; i < num_layers; i++) - layers[i]->set_netphase(phase); - } - //! 
print all layers - void print_layers_info() { - for (size_t i = 0; i < num_layers; i++) - layers[i]->print_layer_info(); - } - + bool bias = false, bool dropout = true); + + // update trainable weights after back-prop + void update_weights(optimizer* opt); + + // forward propagation + acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); + void bprop(); // back propagation + void set_contexts(); // Save the context + void set_netphases(net_phase phase); // current phase: train or test + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks, float_t* preds, diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp new file mode 100644 index 0000000000..75724a134d --- /dev/null +++ b/libdeepgalois/src/Train.cpp @@ -0,0 +1,509 @@ +#include "galois/Galois.h" +#include "deepgalois/Net.h" + +namespace deepgalois { + +Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, + unsigned hidden1, float lr, float dropout, float wd, + bool selfloop, bool single, bool l2norm, bool dense, + unsigned neigh_sz, unsigned subg_sz, int val_itv) : +// globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), +// globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), +// globalValBegin(0), globalValEnd(0), globalValCount(0), +// globalTestBegin(0), globalTestEnd(0), globalTestCount(0), +// globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) {} + is_single_class(single), has_l2norm(l2norm), has_dense(dense), + neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), + h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { + // init some identifiers for this host +#ifndef GALOIS_ENABLE_GPU + this->myID = galois::runtime::getSystemNetworkInterface().ID; +#endif + this->header = "[" + std::to_string(myID) + "] "; + this->seperator = " "; + + assert(n_conv > 0); + + this->num_layers = num_conv_layers + 1; + + // additional layers to add + if (has_l2norm) + this->num_layers++; + if (has_dense) + this->num_layers++; + // initialize feature metadata + feature_dims.resize(num_layers + 1); + print_configs(); + + // initialze global graph context + graphTopologyContext = new deepgalois::Context(); + graphTopologyContext->set_dataset(dataset_str); + // read *entire* graph, get num nodes + globalSamples = graphTopologyContext->read_graph(selfloop); + + // get training and validation sets: this is to create the training + // subgraph in the sampler + globalTrainMasks = new mask_t[globalSamples]; + globalValMasks = new mask_t[globalSamples]; + globalTestMasks = new mask_t[globalSamples]; + std::fill(globalTrainMasks, globalTrainMasks + globalSamples, 0); + std::fill(globalValMasks, globalValMasks + globalSamples, 0); + + // reddit is hard coded + if (dataset_str == "reddit") { + this->globalTrainBegin = 0; + this->globalTrainCount = 153431; + this->globalTrainEnd = this->globalTrainBegin + this->globalTrainCount; + this->globalValBegin = 153431; + this->globalValCount = 23831; + this->globalValEnd = this->globalValBegin + this->globalValCount; + + // TODO do all can be used below + for (size_t i = globalTrainBegin; i < globalTrainEnd; i++) + globalTrainMasks[i] = 1; + for (size_t i = globalValBegin; i < 
globalValEnd; i++) + globalValMasks[i] = 1; + } else { + globalTrainCount = graphTopologyContext->read_masks( + "train", globalSamples, globalTrainBegin, globalTrainEnd, + globalTrainMasks); + globalValCount = graphTopologyContext->read_masks( + "val", globalSamples, globalValBegin, globalValEnd, globalValMasks); + } + // make sure sampel size isn't greater than what we have to train with + assert(subgraph_sample_size <= globalTrainCount); + + layers.resize(num_layers); + // hidden1 level embedding: 16 + for (size_t i = 1; i < num_conv_layers; i++) + feature_dims[i] = this->h1; + + // features are read in distcontext, not this context (this context only + // used for sampling) + if (subgraph_sample_size) + sampler = new deepgalois::Sampler(); +} + +void Net::train(optimizer* opt, bool need_validate) { + double total_train_time = 0.0; + int num_subg_remain = 0; + + if (subgraph_sample_size) { + distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); + allocateSubgraphsMasks(num_subgraphs); + std::cout << header + << "Constructing training vertex set induced graph...\n"; + // auto gg = distContext->getGraphPointer(); + auto gg = + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, + distContext->getGraphPointer()); + } + + //galois::gPrint(header, "Start training...\n"); + + Timer t_epoch; + + // run epochs + for (int curEpoch = 0; curEpoch < num_epochs; curEpoch++) { + t_epoch.Start(); + + //////////////////////////////////////////////////////////////////////////////// + // Sampling + //////////////////////////////////////////////////////////////////////////////// + if (subgraph_sample_size) { + if (num_subg_remain == 0) { + std::cout << header << "Generating " << num_subgraphs + << " subgraph(s)\n"; + // TODO stat timer instead of this timer + Timer t_subgen; + t_subgen.Start(); + + // generate subgraphs + for (int sid = 0; sid < num_subgraphs; sid++) { + VertexSet sampledSet; + sampler->selectVertices(subgraph_sample_size, sampledSet, + curEpoch); // m = 1000 by default + sampler->generateSubgraph(sampledSet, + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); + } + num_subg_remain = num_subgraphs; + t_subgen.Stop(); + // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + } + // count their degrees + for (int i = 0; i < num_subgraphs; i++) { + auto sg_ptr = distContext->getSubgraphPointer(i); + sg_ptr->degree_counting(); + // galois::gPrint("\tsubgraph[", i, "]: num_v ", sg_ptr->size(), " + // num_e ", sg_ptr->sizeEdges(), "\n"); + } + + // choose a subgraph to use + num_subg_remain--; + int sg_id = num_subg_remain; + auto subgraphPointer = distContext->getSubgraphPointer(sg_id); + this->subgraphNumVertices = subgraphPointer->size(); + + //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; + for (size_t i = 0; i < num_layers; i++) { + layers[i]->update_dim_size(this->subgraphNumVertices); + } + + // TODO dist version where i need global degrees + // change normalization constants + distContext->constructNormFactorSub(sg_id); + for (size_t i = 0; i < num_conv_layers; i++) { + layers[i]->set_graph_ptr(subgraphPointer); + layers[i]->set_norm_consts_ptr( + distContext->get_norm_factors_subg_ptr()); + } + + // update labels for subgraph + distContext->constructSubgraphLabels( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[num_layers - 
1]->set_labels_ptr( + distContext->get_labels_subg_ptr()); + + // update features for subgraph + distContext->constructSubgraphFeatures( + this->subgraphNumVertices, subgraphs_masks + sg_id * globalSamples); + layers[0]->set_feats_ptr( + distContext->get_feats_subg_ptr()); // feed input data + + // Graph* testing = distContext->getSubgraphPointer(sg_id); + // for (size_t i = 0; i < testing->size(); i++) { + // for (auto j = testing->edge_begin(i); j < testing->edge_end(i); j++) + // { + // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); + // } + //} + } // end subgraph sample loop + //////////////////////////////////////////////////////////////////////////////// + + // training steps +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; +#else + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); +#endif + set_netphases(net_phase::train); + acc_t train_loss = 0.0, train_acc = 0.0; + + //galois::gPrint(header, "Calling into eval for forward propagation\n"); + // forward: after this phase, layer edges will contain intermediate + // features for use during backprop + double fw_time = evaluate("train", train_loss, train_acc); + //evaluate("train", train_loss, train_acc); + + + //galois::gPrint(header, "Calling into backward propagation\n"); + // backward: use intermediate features + ground truth to update layers + // with feature gradients whcih are then used to calculate weight + // gradients + Net::bprop(); + + //galois::gPrint(header, "Weight update call\n"); + // gradient update: use gradients stored on each layer to update model + // for next epoch + Net::update_weights(opt); // update parameters + + // validation / testing + set_netphases(net_phase::test); + +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_loss " << std::setprecision(3) << std::fixed + << train_loss << " train_acc " << train_acc << " "; +#else + galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, "\n"); +#endif + t_epoch.Stop(); + + double epoch_time = t_epoch.Millisecs(); + total_train_time += epoch_time; + + if (need_validate && curEpoch % val_interval == 0) { + // Validation + acc_t val_loss = 0.0, val_acc = 0.0; + double val_time = evaluate("val", val_loss, val_acc); +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "val_loss " << std::setprecision(3) << std::fixed + << val_loss << " val_acc " << val_acc << " "; + std::cout << header << "time " << std::setprecision(3) << std::fixed + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; +#else + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, "\n"); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); +#endif + } else { +#ifdef GALOIS_ENABLE_GPU + std::cout << header << "train_time " << std::fixed << epoch_time + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; +#else + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); +#endif + } + } // epoch loop + + double avg_train_time = total_train_time / (double)num_epochs; + double throughput = 1000.0 * (double)num_epochs / total_train_time; +#ifdef GALOIS_ENABLE_GPU + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. 
Throughput " << throughput << " epoch/s\n"; +#else + galois::gPrint(header, "Average training time per epoch: ", avg_train_time, + " ms. Throughput: ", throughput, " epoch/s\n"); +#endif +} + +// evaluate, i.e. inference or predict +double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + Timer t_eval; + t_eval.Start(); + size_t gBegin = 0, gEnd = 0, gCount = 0; + mask_t* gMasks = NULL; + + // TODO global here good for dist case? + if (type == "train") { + gBegin = globalTrainBegin; + gEnd = globalTrainEnd; + gCount = globalTrainCount; + gMasks = globalTrainMasks; + if (subgraph_sample_size) { + // update gMasks for subgraph + gMasks = NULL; + gBegin = 0; + gEnd = this->subgraphNumVertices; + gCount = this->subgraphNumVertices; + } + } else if (type == "val") { + gBegin = globalValBegin; + gEnd = globalValEnd; + gCount = globalValCount; + gMasks = globalValMasks; + } else { + gBegin = globalTestBegin; + gEnd = globalTestEnd; + gCount = globalTestCount; + gMasks = globalTestMasks; + } + + // switch to the original graph if not training + if (subgraph_sample_size && type != "train") { + for (size_t i = 0; i < num_layers; i++) + layers[i]->update_dim_size(distNumSamples); + for (size_t i = 0; i < num_conv_layers; i++) { +#ifdef GALOIS_ENABLE_GPU + layers[i]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[i]->set_graph_ptr(distContext->getLGraphPointer()); +#endif + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + } + layers[num_layers - 1]->set_labels_ptr(distContext->get_labels_ptr()); + layers[0]->set_feats_ptr(distContext->get_feats_ptr()); // feed input data + } +#ifdef GALOIS_ENABLE_GPU + if (type == "train") { + gMasks = d_train_masks; + } else if (type == "val") { + gMasks = d_val_masks; + } else { + gMasks = d_test_masks; + } +#endif + + //galois::gPrint(header, "Doing actual forward propagation\n"); + loss = fprop(gBegin, gEnd, gCount, gMasks); + //galois::gPrint(header, + // "Forward propagation donne, going to check accuracy\n"); + float_t* predictions = layers[num_layers - 1]->next()->get_data(); + + // labels will be subgraph labels if applicable + label_t* localLabels; + if (type == "train" && subgraph_sample_size) { + localLabels = distContext->get_labels_subg_ptr(); + } else { + // note this grabs local labels + localLabels = distContext->get_labels_ptr(); + } + + if (is_single_class) { + acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); + } else { + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, + predictions, localLabels); + } + + t_eval.Stop(); + return t_eval.Millisecs(); +} + +void Net::construct_layers() { + // append conv layers + //galois::gPrint(header, "Constructing layers...\n"); + for (size_t i = 0; i < num_conv_layers - 1; i++) { + append_conv_layer(i, true); // conv layers, act=true + } + append_conv_layer(num_conv_layers - 1); // the last hidden layer, act=false + + if (has_l2norm) { + append_l2norm_layer(num_conv_layers); // l2_norm layer + } + if (has_dense) { + append_dense_layer(num_layers - 2); // dense layer + } + append_out_layer(num_layers - 1); // output layer + + // allocate memory for intermediate features and gradients + for (size_t i = 0; i < num_layers; i++) { + layers[i]->add_edge(); + } + for (size_t i = 1; i < num_layers; i++) { + connect(layers[i - 1], layers[i]); + } + for (size_t i = 0; i < num_layers; i++) { + layers[i]->malloc_and_init(); + } + + layers[0]->set_in_data(distContext->get_feats_ptr()); // feed input data + // precompute the 
normalization constant based on graph structure + // context->norm_factor_computing(false); + distContext->constructNormFactor(graphTopologyContext); + for (size_t i = 0; i < num_conv_layers; i++) + layers[i]->set_norm_consts_ptr(distContext->get_norm_factors_ptr()); + set_contexts(); +} + +//! Add an l2_norm layer to the network +void Net::append_l2norm_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new l2_norm_layer(layer_id, in_dims, out_dims); +} + +//! Add an dense layer to the network +void Net::append_dense_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = distNumSamples; + in_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + // layers[layer_id] = new dense_layer(layer_id, in_dims, out_dims); +} + +//! Add an output layer to the network +void Net::append_out_layer(size_t layer_id) { + assert(layer_id > 0); // can not be the first layer + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + + if (is_single_class) + layers[layer_id] = new softmax_loss_layer(layer_id, in_dims, out_dims); + else + layers[layer_id] = new sigmoid_loss_layer(layer_id, in_dims, out_dims); + + layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); +} +//! Add a convolution layer to the network +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, bool dropout) { + assert(dropout_rate < 1.0); + assert(layer_id < num_conv_layers); + std::vector in_dims(2), out_dims(2); + in_dims[0] = out_dims[0] = distNumSamples; + in_dims[1] = get_in_dim(layer_id); + out_dims[1] = get_out_dim(layer_id); + layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, + dropout_rate, in_dims, out_dims); +#ifdef GALOIS_ENABLE_GPU + layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); +#else + layers[layer_id]->set_graph_ptr(distContext->getLGraphPointer()); +#endif +} + +//! forward propagation: [begin, end) is the range of samples used. +//! 
calls "forward" on each layer and returns the loss of the final layer +acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + // set mask for the last layer; globals + // TODO this should be distirbuted sample gBegin->end not global; fix later + // seems to be unused in code right now anyways + //galois::gPrint(header, "fprop: set sample mask\n"); + layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); + + for (size_t i = 0; i < num_layers; i++) { + //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + layers[i]->forward(); + } + + //galois::gPrint(header, "fprop: getting loss\n"); + // prediction error + acc_t loss = layers[num_layers - 1]->get_prediction_loss(); + // Squared Norm Regularization to mitigate overfitting + loss += weight_decay * layers[0]->get_weight_decay_loss(); + return loss; +} + +// back propagation +void Net::bprop() { + for (size_t i = num_layers; i != 0; i--) { + layers[i - 1]->backward(); + } +} + +// update trainable weights after back-propagation +void Net::update_weights(optimizer* opt) { + regularize(); + for (size_t i = 0; i < num_layers; i++) { + if (layers[i]->trainable()) { + layers[i]->update_weight(opt); + } + } +} + +//! Save the context object to all layers of the network +void Net::set_contexts() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_context(distContext); +} + +//! set netphases for all layers in this network +void Net::set_netphases(net_phase phase) { + for (size_t i = 0; i < num_layers; i++) + layers[i]->set_netphase(phase); +} + +//! print all layers +void Net::print_layers_info() { + for (size_t i = 0; i < num_layers; i++) + layers[i]->print_layer_info(); +} + +// print the configurations +void Net::print_configs() { + galois::gPrint(header, "Configuration: num_threads ", num_threads, + ", num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); +} + +} // end namespace From bd6e0fbecf08a4399c70701018d9d15b0d84540f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Tue, 2 Jun 2020 09:16:16 -0500 Subject: [PATCH 319/660] clean Net.h --- libdeepgalois/include/deepgalois/Net.h | 9 +-------- libdeepgalois/src/Train.cpp | 19 +++++++++---------- 2 files changed, 10 insertions(+), 18 deletions(-) diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 6c720f730d..81754f915a 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -22,14 +22,7 @@ namespace deepgalois { // layer 1: features N x D, weights D x 16, out N x 16 (hidden1=16) // layer 2: features N x 16, weights 16 x E, out N x E class Net { -#ifdef GALOIS_ENABLE_GPU - unsigned myID = 0; -#else - unsigned myID = galois::runtime::getSystemNetworkInterface().ID; -#endif - std::string header = "[" + std::to_string(myID) + "] "; - std::string seperator = "\n"; - + std::string header; bool is_single_class; // single-class (one-hot) or multi-class label bool has_l2norm; // whether the net contains an l2_norm layer bool has_dense; // whether the net contains an dense layer diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 75724a134d..7bd0b70385 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -18,14 +18,12 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, h1(hidden1), learning_rate(lr), dropout_rate(dropout), 
weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host -#ifndef GALOIS_ENABLE_GPU - this->myID = galois::runtime::getSystemNetworkInterface().ID; + unsigned myID = 0; +#ifdef GALOIS_ENABLE_DIST + myID = galois::runtime::getSystemNetworkInterface().ID; #endif - this->header = "[" + std::to_string(myID) + "] "; - this->seperator = " "; - + this->header = "[" + std::to_string(myID) + "] "; assert(n_conv > 0); - this->num_layers = num_conv_layers + 1; // additional layers to add @@ -87,6 +85,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, } void Net::train(optimizer* opt, bool need_validate) { + std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -102,7 +101,7 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->getGraphPointer()); } - //galois::gPrint(header, "Start training...\n"); + galois::gPrint(header, "Start training...\n"); Timer t_epoch; @@ -189,7 +188,7 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, "\n"); + galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, separator); #endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -220,7 +219,7 @@ void Net::train(optimizer* opt, bool need_validate) { << train_loss << " train_acc " << train_acc << " "; #else galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, "\n"); + train_loss, " train_acc ", train_acc, separator); #endif t_epoch.Stop(); @@ -239,7 +238,7 @@ void Net::train(optimizer* opt, bool need_validate) { << " val_time " << val_time << ")\n"; #else galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, "\n"); + val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); From bb4179b3df37855b21764eeb5ceb04c76f0e6e16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 14:07:02 -0500 Subject: [PATCH 320/660] timer set 1 --- libdeepgalois/src/Train.cpp | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 7bd0b70385..284044cd4c 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -90,6 +90,8 @@ void Net::train(optimizer* opt, bool need_validate) { int num_subg_remain = 0; if (subgraph_sample_size) { + galois::StatTimer construct_time("SubgraphAllocateTime"); + construct_time.start(); distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); std::cout << header @@ -99,6 +101,7 @@ void Net::train(optimizer* opt, bool need_validate) { graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, distContext->getGraphPointer()); + construct_time.stop(); } galois::gPrint(header, "Start training...\n"); @@ -113,12 +116,13 @@ void Net::train(optimizer* opt, bool need_validate) { // Sampling //////////////////////////////////////////////////////////////////////////////// if (subgraph_sample_size) { + galois::StatTimer sample_time("SubgraphSampleTime"); + sample_time.start(); if (num_subg_remain 
== 0) { std::cout << header << "Generating " << num_subgraphs << " subgraph(s)\n"; - // TODO stat timer instead of this timer - Timer t_subgen; - t_subgen.Start(); + galois::StatTimer t_subgen("SubgraphGenerateTime"); + t_subgen.start(); // generate subgraphs for (int sid = 0; sid < num_subgraphs; sid++) { @@ -130,8 +134,7 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; - t_subgen.Stop(); - // std::cout << "Done, time: " << t_subgen.Millisecs() << "\n"; + t_subgen.stop(); } // count their degrees for (int i = 0; i < num_subgraphs; i++) { @@ -181,6 +184,7 @@ void Net::train(optimizer* opt, bool need_validate) { // galois::gPrint(i, " ", testing->getEdgeDst(j), "\n"); // } //} + sample_time.stop(); } // end subgraph sample loop //////////////////////////////////////////////////////////////////////////////// @@ -267,8 +271,13 @@ void Net::train(optimizer* opt, bool need_validate) { // evaluate, i.e. inference or predict double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { + // TODO get rid of this timer Timer t_eval; t_eval.Start(); + + galois::StatTimer eval_timer("EvaluateTime"); + eval_timer.start(); + size_t gBegin = 0, gEnd = 0, gCount = 0; mask_t* gMasks = NULL; @@ -345,6 +354,9 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { predictions, localLabels); } + eval_timer.stop(); + + // TODO replace with stat timer t_eval.Stop(); return t_eval.Millisecs(); } @@ -442,6 +454,8 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, boo //! forward propagation: [begin, end) is the range of samples used. //! calls "forward" on each layer and returns the loss of the final layer acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { + galois::StatTimer fprop_timer("ForwardPropTime"); + fprop_timer.start(); // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways @@ -458,14 +472,18 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting loss += weight_decay * layers[0]->get_weight_decay_loss(); + fprop_timer.stop(); return loss; } // back propagation void Net::bprop() { + galois::StatTimer bprop_timer("BackPropTime"); + bprop_timer.start(); for (size_t i = num_layers; i != 0; i--) { layers[i - 1]->backward(); } + bprop_timer.stop(); } // update trainable weights after back-propagation From 8f37a6d8754646a4ab0d1bcacb77c375fbf89bc3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 15:29:02 -0500 Subject: [PATCH 321/660] timers set 2 --- libdeepgalois/src/Train.cpp | 3 +++ libdeepgalois/src/layers/graph_conv_layer.cpp | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 284044cd4c..4e363bb1b1 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -85,6 +85,8 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, } void Net::train(optimizer* opt, bool need_validate) { + galois::StatTimer train_timer("Timer_0"); + train_timer.start(); std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -267,6 +269,7 @@ void Net::train(optimizer* opt, bool need_validate) { galois::gPrint(header, "Average training time per epoch: ", 
avg_train_time, " ms. Throughput: ", throughput, " epoch/s\n"); #endif + train_timer.stop(); } // evaluate, i.e. inference or predict diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 941a796a81..2a0eb05d67 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -92,6 +92,8 @@ void graph_conv_layer::malloc_and_init() { // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; @@ -121,17 +123,20 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( - "AggSync"); + "GraphConvForward"); // run relu activation on output if specified if (act_) math::relu_cpu(x * z, out_data, out_data); + conv_timer.stop(); } // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, const float_t* out_data, float_t* out_grad, float_t* in_grad) { + galois::StatTimer conv_timer("GraphConvBackward"); + conv_timer.start(); size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; @@ -167,13 +172,14 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync( - "AggSyncBack"); + "GraphConvBackward"); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); - layer::syncSub->sync("GradientSync"); + layer::syncSub->sync("Gradients"); galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + conv_timer.stop(); } acc_t graph_conv_layer::get_weight_decay_loss() { From fe13b47cc86a83271282f407ff34bb3a85787f30 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Jun 2020 16:48:43 -0500 Subject: [PATCH 322/660] timer set 3 (graph conv breakdown) --- libdeepgalois/src/layers/graph_conv_layer.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 2a0eb05d67..b2fe0784f7 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -34,22 +34,28 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, // aggregate based on graph topology void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); // normalization constant based on graph structure #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); #else update_all(len, g, in, out, norm_, norm_consts); #endif + aggregate_timer.stop(); } // since graph is symmetric, the derivative is the same void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); #ifdef USE_MKL update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #else update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z #endif + aggregate_timer.stop(); } void graph_conv_layer::combine(size_t n, size_t len, const 
float_t* self, @@ -98,6 +104,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::StatTimer drop_timer("GraphConvForwardDropout"); + drop_timer.start(); // input: x*y; W: y*z; output: x*z // if y > z: mult W first to reduce the feature size for aggregation // else: aggregate first then mult W @@ -107,7 +115,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, } else { math::copy_cpu(x * y, in_data, in_temp); } + drop_timer.stop(); + galois::StatTimer compute_timer("GraphConvForwardCompute"); + compute_timer.start(); if (y > z) { math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); @@ -117,6 +128,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp1, &layer::W[0], 0.0, out_data); } + compute_timer.stop(); // TODO sync of out_data required here // TODO how to do this for the sampled case? @@ -126,8 +138,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, "GraphConvForward"); // run relu activation on output if specified + galois::StatTimer relu_timer("GraphConvForwardRelu"); + relu_timer.start(); if (act_) math::relu_cpu(x * z, out_data, out_data); + relu_timer.stop(); + conv_timer.stop(); } @@ -141,10 +157,15 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; // note; assumption here is that out_grad contains 1s or 0s via relu? + galois::StatTimer relu_timer("GraphConvBackwardRelu"); + relu_timer.start(); if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + relu_timer.stop(); // else math::copy_cpu(x * z, out_grad, out_temp); // TODO: avoid copying + galois::StatTimer compute_timer("GraphConvBackwardCompute"); + compute_timer.start(); if (y > z) { d_aggregate(z, *graph_cpu, out_grad, out_temp); // at this point, out_temp has the derivative of data from last step to @@ -167,6 +188,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_grad, 0.0, &layer::weight_grad[0]); } + compute_timer.stop(); // sync agg deepgalois::_syncVectorSize = z; @@ -174,8 +196,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, layer::context->getSyncSubstrate()->sync( "GraphConvBackward"); + galois::StatTimer drop_timer("GraphConvBackwardDropout"); + drop_timer.start(); if (level_ != 0 && dropout_) math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); + drop_timer.stop(); layer::syncSub->sync("Gradients"); galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); From 90cc74711647b4c8fd9bad5b181e8bf2ae234d3e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 10:30:48 -0500 Subject: [PATCH 323/660] clang-format 10 run --- .../include/deepgalois/DistContext.h | 43 ++++--- libdeepgalois/include/deepgalois/GraphTypes.h | 10 +- libdeepgalois/include/deepgalois/Net.h | 24 ++-- libdeepgalois/include/deepgalois/Sampler.h | 26 ++-- .../include/deepgalois/layers/layer.h | 13 +- libdeepgalois/include/deepgalois/lgraph.h | 8 +- libdeepgalois/src/DistContext.cpp | 4 +- libdeepgalois/src/RandomWalk.cpp | 59 +++++---- libdeepgalois/src/Sampler.cpp | 84 +++++++------ libdeepgalois/src/Train.cpp | 114 +++++++++--------- libdeepgalois/src/layers/graph_conv_layer.cpp | 4 +- libdeepgalois/src/lgraph.cpp | 10 +- libdeepgalois/src/reader.cpp | 24 ++-- 
libdeepgalois/src/utils.cpp | 4 +- 14 files changed, 239 insertions(+), 188 deletions(-) diff --git a/libdeepgalois/include/deepgalois/DistContext.h b/libdeepgalois/include/deepgalois/DistContext.h index c614a92ca2..3ecf9ed411 100644 --- a/libdeepgalois/include/deepgalois/DistContext.h +++ b/libdeepgalois/include/deepgalois/DistContext.h @@ -18,13 +18,14 @@ class DistContext { bool is_selfloop_added; // whether selfloop is added to the input graph bool usingSingleClass; std::string dataset; - size_t num_classes; // number of classes: E - size_t feat_len; // input feature length: D - Graph* lGraph; // learning graph version + size_t num_classes; // number of classes: E + size_t feat_len; // input feature length: D + Graph* lGraph; // learning graph version DGraph* partitionedGraph; // the input graph, |V| = N std::vector partitionedSubgraphs; - label_t* h_labels; // labels for classification. Single-class: Nx1, multi-class: NxE - float_t* h_feats; // input features: N x D + label_t* h_labels; // labels for classification. Single-class: Nx1, + // multi-class: NxE + float_t* h_feats; // input features: N x D #ifdef GALOIS_ENABLE_GPU label_t* d_labels; // labels on device label_t* d_labels_subg; // labels for subgraph on device @@ -35,9 +36,10 @@ class DistContext { #else galois::graphs::GluonSubstrate* syncSubstrate; #endif - std::vector h_labels_subg; // labels for subgraph - std::vector h_feats_subg; // input features for subgraph - std::vector normFactors; // normalization constant based on graph structure + std::vector h_labels_subg; // labels for subgraph + std::vector h_feats_subg; // input features for subgraph + std::vector + normFactors; // normalization constant based on graph structure std::vector normFactorsSub; // normalization constant for subgraph Reader reader; @@ -45,10 +47,10 @@ class DistContext { public: // TODO better constructor DistContext(); - DistContext(bool isDevice) : is_device(isDevice), is_selfloop_added(false), - usingSingleClass(true), dataset(""), - num_classes(0), feat_len(0), lGraph(NULL), - partitionedGraph(NULL), h_labels(0), h_feats(0) {} + DistContext(bool isDevice) + : is_device(isDevice), is_selfloop_added(false), usingSingleClass(true), + dataset(""), num_classes(0), feat_len(0), lGraph(NULL), + partitionedGraph(NULL), h_labels(0), h_feats(0) {} ~DistContext(); size_t read_graph(std::string dataset_str, bool selfloop = false); @@ -75,15 +77,20 @@ class DistContext { label_t* get_labels_subg_ptr() { return d_labels_subg; } float_t* get_norm_factors_ptr() { return d_normFactors; } float_t* get_norm_factors_subg_ptr() { return d_normFactorsSub; } - void copy_data_to_device(); // copy labels and input features - static cublasHandle_t cublas_handle_; // used to call cuBLAS - static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE + void copy_data_to_device(); // copy labels and input features + static cublasHandle_t cublas_handle_; // used to call cuBLAS + static cusparseHandle_t cusparse_handle_; // used to call cuSPARSE static cusparseMatDescr_t cusparse_matdescr_; // used to call cuSPARSE - static curandGenerator_t curand_generator_; // used to generate random numbers on GPU + static curandGenerator_t + curand_generator_; // used to generate random numbers on GPU inline static cublasHandle_t cublas_handle() { return cublas_handle_; } inline static cusparseHandle_t cusparse_handle() { return cusparse_handle_; } - inline static cusparseMatDescr_t cusparse_matdescr() { return cusparse_matdescr_; } - inline static curandGenerator_t 
curand_generator() { return curand_generator_; } + inline static cusparseMatDescr_t cusparse_matdescr() { + return cusparse_matdescr_; + } + inline static curandGenerator_t curand_generator() { + return curand_generator_; + } #else void saveDistGraph(DGraph* a); galois::graphs::GluonSubstrate* getSyncSubstrate(); diff --git a/libdeepgalois/include/deepgalois/GraphTypes.h b/libdeepgalois/include/deepgalois/GraphTypes.h index 1528375290..3f613a3039 100644 --- a/libdeepgalois/include/deepgalois/GraphTypes.h +++ b/libdeepgalois/include/deepgalois/GraphTypes.h @@ -17,11 +17,11 @@ namespace deepgalois { using edge_iterator = index_t; using GraphCPU = LearningGraph; #ifdef GALOIS_ENABLE_GPU -using DGraph = CSRGraph; -using Graph = CSRGraph; -using GraphGPU = CSRGraph; +using DGraph = CSRGraph; +using Graph = CSRGraph; +using GraphGPU = CSRGraph; #else -using DGraph = galois::graphs::DistGraph; -using Graph = LearningGraph; +using DGraph = galois::graphs::DistGraph; +using Graph = LearningGraph; #endif } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/Net.h b/libdeepgalois/include/deepgalois/Net.h index 81754f915a..bd33924eee 100644 --- a/libdeepgalois/include/deepgalois/Net.h +++ b/libdeepgalois/include/deepgalois/Net.h @@ -85,20 +85,22 @@ class Net { public: //! Default net constructor - Net() : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, - false, true, false, false, 25, 9000, 1) {} + Net() + : Net("reddit", 1, 2, 200, 16, 0.01, 0.5, 5e-4, false, true, false, false, + 25, 9000, 1) {} //! Net constructor Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv); + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, unsigned subg_sz, + int val_itv); // allocate memory for subgraph masks void allocateSubgraphsMasks(int num_subgraphs); //! Initializes metadata for the partition: loads data, labels, etc - void partitionInit(DGraph* graph, std::string dataset_str, bool isSingleClassLabel); + void partitionInit(DGraph* graph, std::string dataset_str, + bool isSingleClassLabel); size_t get_in_dim(size_t layer_id) { return feature_dims[layer_id]; } size_t get_out_dim(size_t layer_id) { return feature_dims[layer_id + 1]; } void regularize(); // add weight decay @@ -131,12 +133,12 @@ class Net { // forward propagation acc_t fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks); - void bprop(); // back propagation - void set_contexts(); // Save the context + void bprop(); // back propagation + void set_contexts(); // Save the context void set_netphases(net_phase phase); // current phase: train or test - void print_layers_info(); // print layer information - void print_configs(); // print the configurations - + void print_layers_info(); // print layer information + void print_configs(); // print the configurations + // comparing outputs with the ground truth (labels) acc_t masked_accuracy(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks, float_t* preds, diff --git a/libdeepgalois/include/deepgalois/Sampler.h b/libdeepgalois/include/deepgalois/Sampler.h index 1b5754f394..ff1b460b10 100644 --- a/libdeepgalois/include/deepgalois/Sampler.h +++ b/libdeepgalois/include/deepgalois/Sampler.h @@ -34,25 +34,29 @@ class Sampler { //! 
Reindex a graph to only contain those in the vertex set void reindexSubgraph(VertexSet& keptVertices, Graph& g, Graph& reindexed); - //! Given a graph, return a graph with edges to unmasked vertices removed in mg + //! Given a graph, return a graph with edges to unmasked vertices removed in + //! mg template void getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub); //! determine degree of each vertex in a masked graph (given by masks and g) template - void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees); + void getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees); //! Set masks bitset with IDs in the vertices VertexSet - //void createMasks(size_t n, VertexSet vertices, mask_t* masks); - //inline VertexList reindexVertices(size_t n, VertexSet vertex_set); - //void checkGSDB(std::vector& DB0, std::vector& DB1, std::vector& DB2, index_t size); + // void createMasks(size_t n, VertexSet vertices, mask_t* masks); + // inline VertexList reindexVertices(size_t n, VertexSet vertex_set); + // void checkGSDB(std::vector& DB0, std::vector& DB1, + // std::vector& DB2, index_t size); //! convert set of gids to lids VertexSet convertToLID(VertexSet& gidSet); void createMasks(size_t n, VertexSet vertices, mask_t* masks) { std::fill(masks, masks + n, 0); - for (auto v : vertices) masks[v] = 1; + for (auto v : vertices) + masks[v] = 1; } //! helper function to get degree of some vertex given some graph @@ -71,7 +75,7 @@ class Sampler { // helper function for graph saint implementation below void checkGSDB(std::vector& DB0, std::vector& DB1, - std::vector& DB2, index_t size) { + std::vector& DB2, index_t size) { if (DB0.capacity() < size) { DB0.reserve(DB0.capacity() * 2); DB1.reserve(DB1.capacity() * 2); @@ -88,11 +92,12 @@ class Sampler { //! sample a subgraph sg of size n from graph g //! sg is overwritten/is output - void generateSubgraph(VertexSet &vertex_set, mask_t* masks, Graph* sg); + void generateSubgraph(VertexSet& vertex_set, mask_t* masks, Graph* sg); //! API function for user-defined selection strategy // TODO how to expose this? - void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, VertexSet& vertex_set); + void selectVertices(index_t nv, index_t n, Graph* g, VertexList vertices, + VertexSet& vertex_set); virtual void selectVertices(index_t n, VertexSet& vertex_set, unsigned seed); // galois::runtime::iterable > @@ -100,7 +105,8 @@ class Sampler { //! Given a mask, construct the graph with only those vertices ans ave as the //! masked graph in this class for the sampler. 
- void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg); + void initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg); }; } // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 02b5abebb4..874e7d41c6 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -41,11 +41,11 @@ class layer : public deepgalois::node { using ContextType = deepgalois::DistContext; protected: - #ifndef GALOIS_ENABLE_GPU +#ifndef GALOIS_ENABLE_GPU const std::string header = "[" + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + "] "; - #endif +#endif unsigned level_; // layer id: [0, num_layers-1] size_t begin_; // sample begin index size_t end_; // sample end index @@ -92,10 +92,11 @@ class layer : public deepgalois::node { #ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif - std::cout << "[" << myID << "] Layer " << level_ << " type: " << layer_type() - << "input[" << input_dims[0] << "," << input_dims[1] << "] output[" - << output_dims[0] << "," << output_dims[1] << "]\n"; - //galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), + std::cout << "[" << myID << "] Layer " << level_ + << " type: " << layer_type() << "input[" << input_dims[0] << "," + << input_dims[1] << "] output[" << output_dims[0] << "," + << output_dims[1] << "]\n"; + // galois::gPrint("[", myID, "] Layer", level_, " type: ", layer_type(), // "input[", input_dims[0], ",", input_dims[1], "] output[", // output_dims[0], ",", output_dims[1], "]\n"); } diff --git a/libdeepgalois/include/deepgalois/lgraph.h b/libdeepgalois/include/deepgalois/lgraph.h index 2e086ebf88..01b84a60b6 100644 --- a/libdeepgalois/include/deepgalois/lgraph.h +++ b/libdeepgalois/include/deepgalois/lgraph.h @@ -35,8 +35,7 @@ class LearningGraph { public: typedef size_t iterator; LearningGraph(bool use_gpu) - : is_device(use_gpu), max_size_(0), - num_vertices_(0), num_edges_(0), + : is_device(use_gpu), max_size_(0), num_vertices_(0), num_edges_(0), vertex_data_(NULL), edge_data_(NULL) {} LearningGraph() : LearningGraph(false) {} ~LearningGraph() { dealloc(); } @@ -57,7 +56,10 @@ class LearningGraph { void dealloc(); void degree_counting(); void constructNodes() {} - void set_max_size(index_t max) { assert(max>0); max_size_ = max; } + void set_max_size(index_t max) { + assert(max > 0); + max_size_ = max; + } void readGraph(std::string dataset, bool selfloop = false); void fixEndEdge(index_t vid, index_t row_end) { rowptr_[vid + 1] = row_end; } diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index 4a9087b0b3..e9f0ef4214 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -3,9 +3,7 @@ #include "deepgalois/configs.h" namespace deepgalois { -DistContext::DistContext() : DistContext(false) { - syncSubstrate = NULL; -} +DistContext::DistContext() : DistContext(false) { syncSubstrate = NULL; } DistContext::~DistContext() {} diff --git a/libdeepgalois/src/RandomWalk.cpp b/libdeepgalois/src/RandomWalk.cpp index cf2112ca60..23efe124d2 100644 --- a/libdeepgalois/src/RandomWalk.cpp +++ b/libdeepgalois/src/RandomWalk.cpp @@ -7,7 +7,8 @@ namespace deepgalois { -void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DGraph* dg) { +void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, + DGraph* dg) { this->count_ 
= count; // save original graph Sampler::globalGraph = g; @@ -20,18 +21,23 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG std::vector degrees(g->size(), 0); galois::gPrint("graph size: ", g->size(), "\n"); // get degrees of nodes that will be in new graph - //this->getMaskedDegrees(g->size(), masks, g, degrees); - galois::do_all(galois::iterate(size_t(0), g->size()), [&](const auto src) { - if (masks[src] == 1) { - for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { - const auto dst = g->getEdgeDstHost(e); - if (masks[dst] == 1) degrees[src]++; - } - } - } , galois::loopname("update_degrees")); + // this->getMaskedDegrees(g->size(), masks, g, degrees); + galois::do_all( + galois::iterate(size_t(0), g->size()), + [&](const auto src) { + if (masks[src] == 1) { + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) + degrees[src]++; + } + } + }, + galois::loopname("update_degrees")); auto offsets = deepgalois::parallel_prefix_sum(degrees); - auto ne = offsets[g->size()]; + auto ne = offsets[g->size()]; // save ids (of original graph) of training nodes to vector for (size_t i = 0; i < g->size(); i++) { @@ -42,19 +48,23 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG Sampler::globalMaskedGraph->allocateFrom(g->size(), ne); Sampler::globalMaskedGraph->constructNodes(); // same as original graph, except keep only edges involved in masks - galois::do_all(galois::iterate((size_t)0, g->size()), [&](const auto src) { - Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); e++) { - const auto dst = g->getEdgeDstHost(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + galois::do_all( + galois::iterate((size_t)0, g->size()), + [&](const auto src) { + Sampler::globalMaskedGraph->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin_host(src); e != g->edge_end_host(src); + e++) { + const auto dst = g->getEdgeDstHost(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + Sampler::globalMaskedGraph->constructEdge(idx++, dst, 0); + } + } } - } - } - }, galois::loopname("gen_subgraph")); + }, + galois::loopname("gen_subgraph")); Sampler::globalMaskedGraph->degree_counting(); Sampler::avg_deg = globalMaskedGraph->sizeEdges() / globalMaskedGraph->size(); @@ -67,7 +77,8 @@ void Sampler::initializeMaskedGraph(size_t count, mask_t* masks, GraphCPU* g, DG // implementation from GraphSAINT // https://github.com/GraphSAINT/GraphSAINT/blob/master/ipdps19_cpp/sample.cpp void Sampler::selectVertices(index_t n, VertexSet& st, unsigned seed) { - if (n < m) m = n; + if (n < m) + m = n; unsigned myseed = seed; // unsigned myseed = tid; diff --git a/libdeepgalois/src/Sampler.cpp b/libdeepgalois/src/Sampler.cpp index 36b697ecb6..055b5c0a85 100644 --- a/libdeepgalois/src/Sampler.cpp +++ b/libdeepgalois/src/Sampler.cpp @@ -186,7 +186,7 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, vertex_set.insert(frontier.begin(), frontier.end()); // galois::gPrint("vertex_set size: ", vertex_set.size(), "\n"); int* degrees = new int[m]; - //galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { + // 
galois::do_all(galois::iterate(size_t(0), size_t(m)), [&](const auto i) { for (index_t i = 0; i < m; i++) { degrees[i] = (int)getDegree(g, frontier[i]); } //, galois::loopname("compute_degrees")); @@ -217,7 +217,8 @@ void Sampler::selectVertices(index_t nv, index_t n, Graph* g, // Given a subset of vertices and a graph g, generate a subgraph sg from the // graph g -void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& reindexGraph) { +void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, + Graph& reindexGraph) { // auto n = origGraph.size(); // old graph size auto nv = keptVertices.size(); // new graph (subgraph) size VertexList new_ids = this->reindexVertices(globalGraph->size(), keptVertices); @@ -235,7 +236,9 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& VertexList old_ids(keptVertices.begin(), keptVertices.end()); // vertex ID mapping #ifdef PARALLEL_GEN - galois::do_all(galois::iterate(size_t(0), size_t(nv)), [&](const auto i) { + galois::do_all( + galois::iterate(size_t(0), size_t(nv)), + [&](const auto i) { #else for (size_t i = 0; i < nv; i++) { #endif @@ -251,7 +254,8 @@ void Sampler::reindexSubgraph(VertexSet& keptVertices, Graph& origGraph, Graph& } } #ifdef PARALLEL_GEN - , galois::loopname("construct_graph")); + , + galois::loopname("construct_graph")); #endif } @@ -267,53 +271,64 @@ VertexSet Sampler::convertToLID(VertexSet& gidSet) { } template -void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, std::vector& degrees) { -//template <> -//void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, std::vector& degrees) { +void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphTy* g, + std::vector& degrees) { + // template <> + // void Sampler::getMaskedDegrees(size_t n, mask_t* masks, GraphCPU* g, + // std::vector& degrees) { assert(degrees.size() == n); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - //for (size_t src = 0; src < n; src++) { - if (masks[src] == 1) { - for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { - const auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gInfo("Edge ", src, " ", dst); - degrees[src]++; + galois::do_all( + galois::iterate(size_t(0), n), + [&](const auto src) { + // for (size_t src = 0; src < n; src++) { + if (masks[src] == 1) { + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + const auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gInfo("Edge ", src, " ", dst); + degrees[src]++; + } + } } - } - } - } , galois::loopname("update_degrees")); + }, + galois::loopname("update_degrees")); } template -void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, SubgraphTy* sub) { +void Sampler::getMaskedGraph(index_t n, mask_t* masks, GraphTy* g, + SubgraphTy* sub) { std::vector degrees(n, 0); this->getMaskedDegrees(n, masks, g, degrees); // auto offsets = deepgalois::parallel_prefix_sum(degrees); auto offsets = deepgalois::prefix_sum(degrees); size_t ne = offsets[n]; - // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, "\n"); + // galois::gPrint("getMaskedGraph: num_vertices=", n, ", num_edges=", ne, + // "\n"); // note this constructs the full graph's nodes; just trims edges sub->allocateFrom(n, ne); sub->constructNodes(); - galois::do_all(galois::iterate(size_t(0), size_t(n)), [&](const auto src) { - sub->fixEndEdge(src, offsets[src + 1]); - if (masks[src] == 1) { - auto idx = offsets[src]; - for (auto e = 
g->edge_begin(src); e != g->edge_end(src); e++) { - auto dst = g->getEdgeDst(e); - if (masks[dst] == 1) { - // galois::gPrint(src, " ", dst, "\n"); - sub->constructEdge(idx++, dst, 0); + galois::do_all( + galois::iterate(size_t(0), size_t(n)), + [&](const auto src) { + sub->fixEndEdge(src, offsets[src + 1]); + if (masks[src] == 1) { + auto idx = offsets[src]; + for (auto e = g->edge_begin(src); e != g->edge_end(src); e++) { + auto dst = g->getEdgeDst(e); + if (masks[dst] == 1) { + // galois::gPrint(src, " ", dst, "\n"); + sub->constructEdge(idx++, dst, 0); + } + } } - } - } - }, galois::loopname("gen_subgraph")); + }, + galois::loopname("gen_subgraph")); } -void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) { +void Sampler::generateSubgraph(VertexSet& sampledSet, mask_t* masks, + Graph* sg) { // n = 9000 by default // do the sampling of vertices from training set + using masked graph @@ -335,7 +350,8 @@ void Sampler::generateSubgraph(VertexSet &sampledSet, mask_t* masks, Graph* sg) Graph maskedSG; // TODO use partMaskedGraph once constructed later // remove edges whose destination is not masked - this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, &maskedSG); + this->getMaskedGraph(Sampler::partGraph->size(), masks, Sampler::partGraph, + &maskedSG); this->reindexSubgraph(sampledLIDs, maskedSG, *sg); // galois::gPrint("sg num edges is ", sg.sizeEdges(), "\n"); diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 4e363bb1b1..992902e7b6 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -4,20 +4,21 @@ namespace deepgalois { Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, - unsigned hidden1, float lr, float dropout, float wd, - bool selfloop, bool single, bool l2norm, bool dense, - unsigned neigh_sz, unsigned subg_sz, int val_itv) : -// globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), -// globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), -// globalValBegin(0), globalValEnd(0), globalValCount(0), -// globalTestBegin(0), globalTestEnd(0), globalTestCount(0), -// globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) {} + unsigned hidden1, float lr, float dropout, float wd, bool selfloop, + bool single, bool l2norm, bool dense, unsigned neigh_sz, + unsigned subg_sz, int val_itv) + : // globalSamples(0), num_classes(0), num_conv_layers(0), num_layers(0), + // globalTrainBegin(0), globalTrainEnd(0), globalTrainCount(0), + // globalValBegin(0), globalValEnd(0), globalValCount(0), + // globalTestBegin(0), globalTestEnd(0), globalTestCount(0), + // globalTrainMasks(NULL), globalValMasks(NULL), globalTestMasks(NULL) + // {} is_single_class(single), has_l2norm(l2norm), has_dense(dense), neighbor_sample_size(neigh_sz), subgraph_sample_size(subg_sz), - num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), - h1(hidden1), learning_rate(lr), dropout_rate(dropout), weight_decay(wd), + num_threads(nt), num_conv_layers(n_conv), num_epochs(epochs), h1(hidden1), + learning_rate(lr), dropout_rate(dropout), weight_decay(wd), val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { - // init some identifiers for this host + // init some identifiers for this host unsigned myID = 0; #ifdef GALOIS_ENABLE_DIST myID = galois::runtime::getSystemNetworkInterface().ID; @@ -87,7 +88,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, void Net::train(optimizer* opt, bool need_validate) { galois::StatTimer 
train_timer("Timer_0"); train_timer.start(); - std::string separator = "\n"; + std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; @@ -97,12 +98,12 @@ void Net::train(optimizer* opt, bool need_validate) { distContext->allocateSubgraphs(num_subgraphs, subgraph_sample_size); allocateSubgraphsMasks(num_subgraphs); std::cout << header - << "Constructing training vertex set induced graph...\n"; + << "Constructing training vertex set induced graph...\n"; // auto gg = distContext->getGraphPointer(); auto gg = - graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem + graphTopologyContext->getGraphPointer(); // gloabl graph in CPU mem sampler->initializeMaskedGraph(globalTrainCount, globalTrainMasks, gg, - distContext->getGraphPointer()); + distContext->getGraphPointer()); construct_time.stop(); } @@ -122,7 +123,7 @@ void Net::train(optimizer* opt, bool need_validate) { sample_time.start(); if (num_subg_remain == 0) { std::cout << header << "Generating " << num_subgraphs - << " subgraph(s)\n"; + << " subgraph(s)\n"; galois::StatTimer t_subgen("SubgraphGenerateTime"); t_subgen.start(); @@ -130,10 +131,10 @@ void Net::train(optimizer* opt, bool need_validate) { for (int sid = 0; sid < num_subgraphs; sid++) { VertexSet sampledSet; sampler->selectVertices(subgraph_sample_size, sampledSet, - curEpoch); // m = 1000 by default + curEpoch); // m = 1000 by default sampler->generateSubgraph(sampledSet, - subgraphs_masks + sid * globalSamples, - distContext->getSubgraphPointer(sid)); + subgraphs_masks + sid * globalSamples, + distContext->getSubgraphPointer(sid)); } num_subg_remain = num_subgraphs; t_subgen.stop(); @@ -152,7 +153,7 @@ void Net::train(optimizer* opt, bool need_validate) { auto subgraphPointer = distContext->getSubgraphPointer(sg_id); this->subgraphNumVertices = subgraphPointer->size(); - //std::cout << "Subgraph num_vertices: " << subgraphNumVertices + // std::cout << "Subgraph num_vertices: " << subgraphNumVertices // << ", num_edges: " << subgraphPointer->sizeEdges() << "\n"; for (size_t i = 0; i < num_layers; i++) { layers[i]->update_dim_size(this->subgraphNumVertices); @@ -199,20 +200,19 @@ void Net::train(optimizer* opt, bool need_validate) { set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; - //galois::gPrint(header, "Calling into eval for forward propagation\n"); + // galois::gPrint(header, "Calling into eval for forward propagation\n"); // forward: after this phase, layer edges will contain intermediate // features for use during backprop double fw_time = evaluate("train", train_loss, train_acc); - //evaluate("train", train_loss, train_acc); + // evaluate("train", train_loss, train_acc); - - //galois::gPrint(header, "Calling into backward propagation\n"); + // galois::gPrint(header, "Calling into backward propagation\n"); // backward: use intermediate features + ground truth to update layers // with feature gradients whcih are then used to calculate weight // gradients Net::bprop(); - //galois::gPrint(header, "Weight update call\n"); + // galois::gPrint(header, "Weight update call\n"); // gradient update: use gradients stored on each layer to update model // for next epoch Net::update_weights(opt); // update parameters @@ -222,10 +222,10 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "train_loss " << std::setprecision(3) << std::fixed - << train_loss << " train_acc " << train_acc << " "; + << train_loss << " train_acc " << train_acc << " "; #else 
galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, separator); + train_loss, " train_acc ", train_acc, separator); #endif t_epoch.Stop(); @@ -238,24 +238,25 @@ void Net::train(optimizer* opt, bool need_validate) { double val_time = evaluate("val", val_loss, val_acc); #ifdef GALOIS_ENABLE_GPU std::cout << header << "val_loss " << std::setprecision(3) << std::fixed - << val_loss << " val_acc " << val_acc << " "; + << val_loss << " val_acc " << val_acc << " "; std::cout << header << "time " << std::setprecision(3) << std::fixed - << epoch_time + val_time << " ms (train_time " << epoch_time - << " val_time " << val_time << ")\n"; + << epoch_time + val_time << " ms (train_time " << epoch_time + << " val_time " << val_time << ")\n"; #else galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, separator); + val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); #endif } else { #ifdef GALOIS_ENABLE_GPU std::cout << header << "train_time " << std::fixed << epoch_time - << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; + << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time + << ")\n"; #else - galois::gPrint(header, "train_time ", std::fixed, epoch_time, - " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); + galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", + fw_time, ", bw ", epoch_time - fw_time, ")\n"); #endif } } // epoch loop @@ -263,11 +264,11 @@ void Net::train(optimizer* opt, bool need_validate) { double avg_train_time = total_train_time / (double)num_epochs; double throughput = 1000.0 * (double)num_epochs / total_train_time; #ifdef GALOIS_ENABLE_GPU - std::cout << "Average training time per epoch: " << avg_train_time - << "ms. Throughput " << throughput << " epoch/s\n"; + std::cout << "Average training time per epoch: " << avg_train_time + << "ms. Throughput " << throughput << " epoch/s\n"; #else galois::gPrint(header, "Average training time per epoch: ", avg_train_time, - " ms. Throughput: ", throughput, " epoch/s\n"); + " ms. 
Throughput: ", throughput, " epoch/s\n"); #endif train_timer.stop(); } @@ -334,9 +335,9 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { } #endif - //galois::gPrint(header, "Doing actual forward propagation\n"); + // galois::gPrint(header, "Doing actual forward propagation\n"); loss = fprop(gBegin, gEnd, gCount, gMasks); - //galois::gPrint(header, + // galois::gPrint(header, // "Forward propagation donne, going to check accuracy\n"); float_t* predictions = layers[num_layers - 1]->next()->get_data(); @@ -350,11 +351,11 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { } if (is_single_class) { - acc = masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, - localLabels); + acc = + masked_accuracy(gBegin, gEnd, gCount, gMasks, predictions, localLabels); } else { - acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, - predictions, localLabels); + acc = masked_multi_class_accuracy(gBegin, gEnd, gCount, gMasks, predictions, + localLabels); } eval_timer.stop(); @@ -366,7 +367,7 @@ double Net::evaluate(std::string type, acc_t& loss, acc_t& acc) { void Net::construct_layers() { // append conv layers - //galois::gPrint(header, "Constructing layers...\n"); + // galois::gPrint(header, "Constructing layers...\n"); for (size_t i = 0; i < num_conv_layers - 1; i++) { append_conv_layer(i, true); // conv layers, act=true } @@ -438,7 +439,8 @@ void Net::append_out_layer(size_t layer_id) { layers[layer_id]->set_labels_ptr(distContext->get_labels_ptr()); } //! Add a convolution layer to the network -void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, bool dropout) { +void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, + bool dropout) { assert(dropout_rate < 1.0); assert(layer_id < num_conv_layers); std::vector in_dims(2), out_dims(2); @@ -446,7 +448,7 @@ void Net::append_conv_layer(size_t layer_id, bool act, bool norm, bool bias, boo in_dims[1] = get_in_dim(layer_id); out_dims[1] = get_out_dim(layer_id); layers[layer_id] = new graph_conv_layer(layer_id, act, norm, bias, dropout, - dropout_rate, in_dims, out_dims); + dropout_rate, in_dims, out_dims); #ifdef GALOIS_ENABLE_GPU layers[layer_id]->set_graph_ptr(distContext->getGraphPointer()); #else @@ -462,15 +464,15 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { // set mask for the last layer; globals // TODO this should be distirbuted sample gBegin->end not global; fix later // seems to be unused in code right now anyways - //galois::gPrint(header, "fprop: set sample mask\n"); + // galois::gPrint(header, "fprop: set sample mask\n"); layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - //galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + // galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } - //galois::gPrint(header, "fprop: getting loss\n"); + // galois::gPrint(header, "fprop: getting loss\n"); // prediction error acc_t loss = layers[num_layers - 1]->get_prediction_loss(); // Squared Norm Regularization to mitigate overfitting @@ -520,10 +522,10 @@ void Net::print_layers_info() { // print the configurations void Net::print_configs() { galois::gPrint(header, "Configuration: num_threads ", num_threads, - ", num_conv_layers ", num_conv_layers, ", num_epochs ", - num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", - learning_rate, ", dropout_rate ", dropout_rate, - ", weight_decay ", weight_decay, "\n"); + ", 
num_conv_layers ", num_conv_layers, ", num_epochs ", + num_epochs, ", hidden_feat_len ", h1, ", learning_rate ", + learning_rate, ", dropout_rate ", dropout_rate, + ", weight_decay ", weight_decay, "\n"); } -} // end namespace +} // namespace deepgalois diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index b2fe0784f7..4c11086495 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -135,7 +135,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; layer::context->getSyncSubstrate()->sync( - "GraphConvForward"); + "GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); @@ -194,7 +194,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; layer::context->getSyncSubstrate()->sync( - "GraphConvBackward"); + "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); diff --git a/libdeepgalois/src/lgraph.cpp b/libdeepgalois/src/lgraph.cpp index c0c39b4023..31cd353e51 100644 --- a/libdeepgalois/src/lgraph.cpp +++ b/libdeepgalois/src/lgraph.cpp @@ -21,7 +21,8 @@ uint64_t LearningGraph::numMasters() { return 0; } uint64_t LearningGraph::globalSize() { return 0; } void LearningGraph::readGraph(std::string dataset, bool selfloop) { - if (selfloop) std::cout << "selfloop not yet implemented\n"; + if (selfloop) + std::cout << "selfloop not yet implemented\n"; deepgalois::Reader reader(dataset); reader.readGraphFromGRFile(this); } @@ -29,9 +30,10 @@ void LearningGraph::readGraph(std::string dataset, bool selfloop) { void LearningGraph::degree_counting() { // if (degrees_ != NULL) return; // degrees_ = new index_t[num_vertices_]; - galois::do_all(galois::iterate(size_t(0), size_t(num_vertices_)), - [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, - galois::loopname("DegreeCounting")); + galois::do_all( + galois::iterate(size_t(0), size_t(num_vertices_)), + [&](auto v) { degrees_[v] = rowptr_[v + 1] - rowptr_[v]; }, + galois::loopname("DegreeCounting")); } void LearningGraph::dealloc() {} diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index 016a72d26a..bf5792fca4 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -31,14 +31,17 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { size_t m, num_classes; // m: number of samples in >> m >> num_classes >> std::ws; if (is_single_class) { - std::cout << "[" << myID << "] Reader: Using single-class (one-hot) labels\n"; - //galois::gPrint("[", myID, + std::cout << "[" << myID + << "] Reader: Using single-class (one-hot) labels\n"; + // galois::gPrint("[", myID, // "] Reader: Using single-class (one-hot) labels\n"); labels = new label_t[m]; // single-class (one-hot) label for each vertex: N x 1 } else { - //galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) labels\n"); - std::cout << "[" << myID << "] Reader: Using multi-class (one-hot) labels\n"; + // galois::gPrint("[", myID, "] Reader: Using multi-class (one-hot) + // labels\n"); + std::cout << "[" << myID + << "] Reader: Using multi-class (one-hot) labels\n"; labels = new label_t[m * num_classes]; // multi-class label for each vertex: N x E @@ -65,8 +68,8 @@ size_t Reader::read_labels(bool is_single_class, label_t*& labels) { 
// print the number of vertex classes std::cout << "[" << myID << "] Done, unique label counts: " << num_classes << ", time: " << t_read.Millisecs() << " ms\n"; - //galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, - //", time: ", t_read.Millisecs(), " ms\n"); + // galois::gPrint("[", myID, "] Done, unique label counts: ", num_classes, + //", time: ", t_read.Millisecs(), " ms\n"); // for (auto i = 0; i < 10; i ++) std::cout << "labels[" << i << "] = " << // unsigned(labels[i]) << "\n"; return num_classes; @@ -158,10 +161,11 @@ size_t Reader::read_masks(std::string mask_type, size_t n, size_t& begin, } i++; } - std::cout << "Global read " << mask_type << "_mask range: [" << begin - << ", " << end << ") Number of valid samples: " << sample_count - << " (" << (float)sample_count / (float)n * (float)100 << "\%)\n"; - //galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", end, + std::cout << "Global read " << mask_type << "_mask range: [" << begin << ", " + << end << ") Number of valid samples: " << sample_count << " (" + << (float)sample_count / (float)n * (float)100 << "\%)\n"; + // galois::gPrint("Global read ", mask_type, "_mask range: [", begin, ", ", + // end, // ") Number of valid samples: ", sample_count, " (", // (float)sample_count / (float)n * (float)100, "\%)\n"); in.close(); diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 10cd18832c..929881dd25 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -26,7 +26,7 @@ OutTy* parallel_prefix_sum(const std::vector& in) { } bulk_prefix[num_blocks] = total; // TODO do not use new here: difficult to track and free later - OutTy* prefix = new OutTy[in.size() + 1]; + OutTy* prefix = new OutTy[in.size() + 1]; galois::do_all( galois::iterate((size_t)0, num_blocks), [&](const size_t& block) { OutTy local_total = bulk_prefix[block]; @@ -120,7 +120,7 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, #endif std::cout << "[" << myID << "]" << std::setprecision(3) << std::fixed << " (f1_micro:" << f1_micro << ", f1_macro: " << f1_macro << ")\n"; - //galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, + // galois::gPrint("[", myID, "]", std::setprecision(3), std::fixed, // " (f1_micro:", f1_micro, ", f1_macro: ", f1_macro, ")\n"); return f1_micro; From 1af2982fac79a4702155dde306ff0a80b41190b9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 11:39:36 -0500 Subject: [PATCH 324/660] BufferWrapper to wrap memory buffers with a size --- .../layers/GraphConvSyncStructures.h | 16 ++----- libdist/include/galois/BufferWrapper.h | 44 +++++++++++++++++++ 2 files changed, 48 insertions(+), 12 deletions(-) create mode 100644 libdist/include/galois/BufferWrapper.h diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index 95e09b1c0d..b07b672fa1 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -1,23 +1,15 @@ #ifndef GALOIS_ENABLE_GPU #ifndef __GRAPH_CONV_SYNC_STRUCT__ #define __GRAPH_CONV_SYNC_STRUCT__ +#include "galois/BufferWrapper.h" struct GraphConvSync { - using ValTy = std::vector; + using ValTy = galois::BufferWrapper; //! 
return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { - // TODO figure out how to avoid copy from C array to vector; best - // way is if original data is in a vector probably, but that has the - // issue of not being able to directly call BLAS - ValTy vecToReturn; - // allocate space - vecToReturn.resize(deepgalois::_syncVectorSize); - // copy the node's data to vector to serialize/send - for (unsigned i = 0; i < deepgalois::_syncVectorSize; i++) { - vecToReturn[i] = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + i]; - } + ValTy vecToReturn(&deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); // move constructor should kick in here to avoid return copy return vecToReturn; } diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h new file mode 100644 index 0000000000..eeebd7f747 --- /dev/null +++ b/libdist/include/galois/BufferWrapper.h @@ -0,0 +1,44 @@ +#ifndef GALOIS_BUFFER_WRAPPER +#define GALOIS_BUFFER_WRAPPER +#include + +namespace galois { + +//! Wraps a pointer representing an array with the number of elements the +//! array contains (or that we want to handle with this class) +//! Used to avoid copying of memory into a vector for +//! serialization/deserialization purpose +//! @todo give this a better name +template +class BufferWrapper { + //! Raw memory kept by this class + ElementType* raw_memory; + //! Number of elements that can be accessed from the raw_memory pointer + size_t num_elements; +public: + //! Default constructor doesn't exist: must provide pointer and size + BufferWrapper() = delete; + //! Save a pointer and the number of elements in that array that this can access + BufferWrapper(ElementType* pointer, size_t num_elements_) : raw_memory(pointer), + num_elements(num_elements_) {}; + + //! Returns element at some specified index of the array + ElementType& operator[](size_t index) { + assert(index < num_elements); + return raw_memory[index]; + } + + //! Returns element at some specified index of the array; const i.e. not modifiable + const ElementType& operator[](size_t index) const { + assert(index < num_elements); + return raw_memory[index]; + } + + //! Return number of elements in the array + size_t size() const { + return this->num_elements; + } +}; + +} // end namespace +#endif From 1f000d8e9855b21ecc88ad1d930d9208ae370700 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 14:33:42 -0500 Subject: [PATCH 325/660] bufferwrapper functionality + serialization of it --- libdist/include/galois/BufferWrapper.h | 67 ++++++++++++++++++---- libdist/include/galois/runtime/Serialize.h | 42 ++++++++++++++ 2 files changed, 97 insertions(+), 12 deletions(-) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h index eeebd7f747..4c5854d5ad 100644 --- a/libdist/include/galois/BufferWrapper.h +++ b/libdist/include/galois/BufferWrapper.h @@ -1,26 +1,52 @@ #ifndef GALOIS_BUFFER_WRAPPER #define GALOIS_BUFFER_WRAPPER +#include "galois/gstl.h" #include namespace galois { //! Wraps a pointer representing an array with the number of elements the //! array contains (or that we want to handle with this class) +//! //! Used to avoid copying of memory into a vector for //! serialization/deserialization purpose //! @todo give this a better name -template +template class BufferWrapper { - //! Raw memory kept by this class +public: + using size_type = size_t; + using value_type = ElementType; + +private: + //! 
This vector is allocated when creating a buffer wrapper from scratch + //! (i.e. during deserialization into one) + galois::gstl::Vector dummy; + //! Raw memory kept by this class; either points to existing memory or to the + //! vector memory held by this class ElementType* raw_memory; //! Number of elements that can be accessed from the raw_memory pointer - size_t num_elements; + size_type num_elements; + public: - //! Default constructor doesn't exist: must provide pointer and size - BufferWrapper() = delete; - //! Save a pointer and the number of elements in that array that this can access - BufferWrapper(ElementType* pointer, size_t num_elements_) : raw_memory(pointer), - num_elements(num_elements_) {}; + //! Default constructor 0s everything + BufferWrapper() { + this->raw_memory = 0; + this->num_elements = 0; + } + + //! frees dummy vector + ~BufferWrapper() { + // explicit vector clear; regular destructor probably frees it, but + // doing it for safetey + if (dummy.size()) { + dummy.clear(); + } + } + + //! Save a pointer and the number of elements in that array that this can + //! access + BufferWrapper(ElementType* pointer, size_t num_elements_) + : raw_memory(pointer), num_elements(num_elements_){}; //! Returns element at some specified index of the array ElementType& operator[](size_t index) { @@ -28,17 +54,34 @@ class BufferWrapper { return raw_memory[index]; } - //! Returns element at some specified index of the array; const i.e. not modifiable + //! Returns element at some specified index of the array; const i.e. not + //! modifiable const ElementType& operator[](size_t index) const { assert(index < num_elements); return raw_memory[index]; } //! Return number of elements in the array - size_t size() const { - return this->num_elements; + size_t size() const { return this->num_elements; } + + //! return unmodifiable pointer to raw_memory + const ElementType* data() const { return raw_memory; } + //! return pointer to raw_memory + ElementType* data() { return raw_memory; } + + //! Allocates memory in the underlying vector; should only be used for + //! deserialization into this class during communication + void resize(size_t new_size) { + if (!this->raw_memory) { + this->dummy.resize(new_size); + this->raw_memory = this->dummy.data(); + this->num_elements = this->dummy.size(); + } else { + GALOIS_DIE("calling resize when there is already raw memory " + "allocated"); + } } }; -} // end namespace +} // namespace galois #endif diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 688e4be59d..40cd4f4b7e 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -43,6 +43,7 @@ #include #include #include "galois/CopyableTuple.h" +#include "galois/BufferWrapper.h" #include "galois/Bag.h" namespace galois { @@ -305,6 +306,12 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +//! Size of BufferWrapper is size + number of things in it +template +inline size_t gSizedObj(const galois::BufferWrapper& data) { + return sizeof(size_t) + data.size() * sizeof(T); +} + /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. @@ -561,6 +568,11 @@ template inline void gSerializeObj(SerializeBuffer& buf, const std::vector& data); +// Forward declaration of buff serialize +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data); + /** * Serialize a sequence type into a buffer. 
* @@ -608,6 +620,18 @@ inline void gSerializeObj(SerializeBuffer& buf, gSerializeSeq(buf, data); } +//! Serialize BufferWrapper similarly to vector +template +inline void gSerializeObj(SerializeBuffer& buf, + const galois::BufferWrapper& data) { + if (is_memory_copyable::value) { + gSerializeLinearSeq(buf, data); + } else { + GALOIS_DIE("have not implemented support for serializing nonPOD buffer " + "wrapper"); + } +} + /** * Serialize a PODResizeableArray into a buffer, choosing to do a memcopy or * to serialize each element individually depending on data. @@ -919,6 +943,10 @@ gDeserializeObj(DeSerializeBuffer& buf, template void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data); +// Forward declaration of buff wrapper deserialize +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& data); + /** * Deserialize into a sequence object * @@ -986,6 +1014,20 @@ void gDeserializeObj(DeSerializeBuffer& buf, std::vector& data) { gDeserializeSeq(buf, data); } +//! deserialize into buf wrapper +template +void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { + if (is_memory_copyable::value) { + // manual deserialization here + size_t buffer_size; + gDeserializeObj(buf, buffer_size); + bf.resize(buffer_size); + buf.extract((uint8_t*)bf.data(), buffer_size * sizeof(T)); + } else { + GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); + } +} + /** * Deserialize into a PODResizeableArray * From 46a052effcea16c251db6427c554a16c7773d117 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 18:03:00 -0500 Subject: [PATCH 326/660] fixed bufferwrapper vector access Originally the raw_mem pointer is initialized to the same data as the dummy vector. However, the dummy vector's data pointer is updated at some point, causing the old initialization to be useless. Changed behavior such that if the dummy vector is initialized, then access is always from it. TODO how much overhead is the check? --- libdist/include/galois/BufferWrapper.h | 50 +++++++++++++++++----- libdist/include/galois/runtime/Serialize.h | 2 +- 2 files changed, 40 insertions(+), 12 deletions(-) diff --git a/libdist/include/galois/BufferWrapper.h b/libdist/include/galois/BufferWrapper.h index 4c5854d5ad..8066f3a25e 100644 --- a/libdist/include/galois/BufferWrapper.h +++ b/libdist/include/galois/BufferWrapper.h @@ -21,8 +21,9 @@ class BufferWrapper { //! This vector is allocated when creating a buffer wrapper from scratch //! (i.e. during deserialization into one) galois::gstl::Vector dummy; - //! Raw memory kept by this class; either points to existing memory or to the - //! vector memory held by this class + //! Raw memory kept by this class; either points to existing memory or is + //! empty (vector.data changes when this object is copied, causes issues + //! with correctness) ElementType* raw_memory; //! Number of elements that can be accessed from the raw_memory pointer size_type num_elements; @@ -30,6 +31,7 @@ class BufferWrapper { public: //! Default constructor 0s everything BufferWrapper() { + dummy.clear(); this->raw_memory = 0; this->num_elements = 0; } @@ -50,37 +52,63 @@ class BufferWrapper { //! Returns element at some specified index of the array ElementType& operator[](size_t index) { - assert(index < num_elements); - return raw_memory[index]; + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } } //! Returns element at some specified index of the array; const i.e. not //!
modifiable const ElementType& operator[](size_t index) const { - assert(index < num_elements); - return raw_memory[index]; + assert(index < this->num_elements); + if (dummy.size()) { + return dummy[index]; + } else { + return raw_memory[index]; + } } //! Return number of elements in the array size_t size() const { return this->num_elements; } //! return unmodifiable pointer to raw_memory - const ElementType* data() const { return raw_memory; } + const ElementType* data() const { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } + //! return pointer to raw_memory - ElementType* data() { return raw_memory; } + ElementType* data() { + if (dummy.size()) { + return dummy.data(); + } else { + return raw_memory; + } + } //! Allocates memory in the underlying vector; should only be used for //! deserialization into this class during communication + //! This also means you shouldn't use raw_data void resize(size_t new_size) { - if (!this->raw_memory) { + if (!this->dummy.size()) { this->dummy.resize(new_size); - this->raw_memory = this->dummy.data(); this->num_elements = this->dummy.size(); } else { - GALOIS_DIE("calling resize when there is already raw memory " + GALOIS_DIE("calling resize when there is already memory " "allocated"); } } + + ElementType* get_vec_data() { + assert(this->dummy.size()); + return dummy.data(); + } }; } // namespace galois diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 40cd4f4b7e..489676928b 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -1022,7 +1022,7 @@ void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { size_t buffer_size; gDeserializeObj(buf, buffer_size); bf.resize(buffer_size); - buf.extract((uint8_t*)bf.data(), buffer_size * sizeof(T)); + buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); } else { GALOIS_DIE("deserialize for buf wrapper not implemented for nonpod"); } From b1fba8e18afc05311a8dba8b5ab388ec776364a7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 11 Jul 2020 19:21:50 -0500 Subject: [PATCH 327/660] thread local rng for dropout_cpu --- libdeepgalois/include/deepgalois/utils.h | 16 ++++ libdeepgalois/src/math_functions.cpp | 112 +++-------------------- 2 files changed, 30 insertions(+), 98 deletions(-) diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index 91ccc94b83..ca93b9da62 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -65,6 +65,22 @@ class Timer { struct timeval elapsed_time_; }; +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + class random_generator { public: static random_generator& get_instance() { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 3b96341c66..11f99e15e0 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -38,15 +38,20 @@ void rng_bernoulli(size_t n, const float_t p, uint8_t* r) { } */ -std::default_random_engine generator; -std::uniform_real_distribution distribution(0.0, 1.0); +// anon namespace so these things don't leak elsewhere +namespace { +static deepgalois::PerThreadRNG* per_thread_rng = nullptr; +} namespace deepgalois { namespace math { inline uint8_t bernoulli(float_t p) { - return distribution(generator) > p ? 1 : 0; + if (!per_thread_rng) { + per_thread_rng = new PerThreadRNG(); + } + return per_thread_rng->get_number() > p ? 1 : 0; } //! wrapper function to call cblas_sgemm @@ -116,80 +121,7 @@ void mvmul(const CBLAS_TRANSPOSE TransA, const int M, const int N, cblas_sgemv(CblasRowMajor, TransA, M, N, alpha, A, N, x, 1, beta, y, 1); } -inline void rng_uniform_cpu(size_t n, float_t* r) { -#ifdef USE_MKL - VSLStreamStatePtr stream; - // Initializing the streams - vslNewStream(&stream, VSL_BRNG_SOBOL, 1); - // Generating - vsRngUniform(VSL_RNG_METHOD_UNIFORM_STD, stream, n, r, 0.0f, 1.0f); - // Deleting the streams - vslDeleteStream(&stream); -#else - for (size_t i = 0; i < n; ++i) { - r[i] = distribution(generator); - } - // galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - // unsigned short xi[3]; - // r[i] = erand48(xi); - //}, galois::loopname("randomMaskGen")); -#endif -} - const size_t vec_len = 8; // for 32-bit floating point in AVX2; TODO AVX512 -/* -// vector add -void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* out) { -#ifdef __AVX2__ - const size_t alignedN = n - n % vec_len; - for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), -_mm256_loadu_ps(&b[i]))); for (size_t i = alignedN; i < n; ++i) out[i] = a[i] + -b[i]; #else for (size_t i = 0; i < n; ++i) out[i] = a[i] + b[i]; #endif -} - -#if defined(__AVX__) || defined(__AVX2__) -void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) -{ const size_t alignedN = n - n % vec_len; const __m256 scal = -_mm256_set1_ps(alpha); for (size_t i = 0; i < alignedN; i += vec_len) - _mm256_storeu_ps(&out[i], _mm256_mul_ps(_mm256_loadu_ps(&in[i]), scal)); - for (size_t i = alignedN; i < n; ++i) out[i] = alpha * in[i]; -} - -// SAXPY stands for โ€œSingle-precision A*X Plus Y" -void axpy(size_t n, const float_t a, float_t *x, float_t *y) { - const size_t alignedN = n - n % vec_len; - const __m256 alpha = _mm256_set1_ps(a); - for (size_t i = 0; i < alignedN; i += vec_len) { - __m256 product = _mm256_mul_ps(_mm256_loadu_ps(&x[i]), alpha); - _mm256_storeu_ps(&y[i], _mm256_add_ps(_mm256_loadu_ps(&y[i]), product)); - } - for (size_t i = alignedN; i < n; ++i) y[i] = a * x[i] + y[i]; -} - -float_t l2_norm(size_t n, const float_t* in) { - const size_t alignedN = n - n % vec_len; - __m256 vsum = _mm256_set1_ps(0.0); - for (size_t i = 0; i < alignedN; i += vec_len) { - __m256 a = _mm256_loadu_ps(&in[i]); - vsum = _mm256_add_ps(vsum, _mm256_mul_ps(a, a)); - } - __m256 sum = _mm256_hadd_ps(vsum, vsum); - return (((float_t*)&sum)[0] + ((float_t*)&sum)[2]) / 2.0; -} -#else -// vector multiply scalar 
-void mul_scalar(size_t n, const float_t alpha, const float_t* in, float_t* out) -{ for (size_t i = 0; i < n; ++i) out[i] = alpha * in[i]; -} - -float_t l2_norm(size_t n, const float_t* a) { - float_t sum = 0.0; - for (size_t i = 0; i < n; ++i) sum += a[i] * a[i]; - return sum / 2.0; -} -#endif -*/ void vadd_cpu(size_t n, const float_t* a, const float_t* b, float_t* y) { #ifdef USE_MKL @@ -259,28 +191,12 @@ void dropout(size_t m, float scale, float dropout_rate, const float_t* in, void dropout_cpu(size_t n, size_t m, float scale, float dropout_rate, const float_t* in, mask_t* masks, float_t* out) { size_t len = n * m; - /* - #ifdef USE_MKL - vec_t rands(len); - rng_uniform_cpu(len, &rands[0]); - galois::do_all(galois::iterate((size_t)0, len), [&](const auto& i) { - masks[i] = rands[i] > dropout_rate ? 1 : 0; - }, galois::loopname("randomMaskGen")); - */ - /* - galois::do_all(galois::iterate((size_t)0, n), [&](const auto& i) { - auto idx = i * m; - vec_t rands(m); - rng_uniform_cpu(m, &rands[0]); - for (size_t j = 0; j < m; ++j) - masks[idx+j] = rands[j] > dropout_rate ? 1 : 0; - }, galois::loopname("dropout")); - #else - */ - for (size_t i = 0; i < len; ++i) { - masks[i] = bernoulli(dropout_rate); - } - //#endif + + galois::do_all( + galois::iterate((size_t)0, len), + [&](size_t i) { masks[i] = bernoulli(dropout_rate); }, + galois::loopname("dropout RNG")); + galois::do_all( galois::iterate((size_t)0, len), [&](const auto& i) { out[i] = in[i] * (float_t)masks[i] * scale; }, From b2f1c6f82d9e114aa462ce58186d05ebe6d6487e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 15 Jul 2020 09:58:00 -0500 Subject: [PATCH 328/660] fix gpu compilation --- libdeepgalois/include/deepgalois/random.h | 53 +++++++++++++++++++++++ libdeepgalois/include/deepgalois/utils.h | 47 +------------------- libdeepgalois/src/math_functions.cpp | 1 + libdeepgalois/src/reader.cpp | 4 ++ libdeepgalois/src/utils.cpp | 3 ++ 5 files changed, 62 insertions(+), 46 deletions(-) create mode 100644 libdeepgalois/include/deepgalois/random.h diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h new file mode 100644 index 0000000000..bf1648bc2a --- /dev/null +++ b/libdeepgalois/include/deepgalois/random.h @@ -0,0 +1,53 @@ +// random number generators for CPU +#pragma once + +#include +#include "galois/Galois.h" +#include "deepgalois/GraphTypes.h" + +namespace deepgalois { + +class PerThreadRNG { + galois::substrate::PerThreadStorage engine; + galois::substrate::PerThreadStorage> + distribution; + +public: + //! init distribution + PerThreadRNG() : distribution{0.0, 1.0} {}; + + //! 
thread local RNG float from 0 to 1 + float_t get_number() { + float_t num = (*distribution.getLocal())(*engine.getLocal()); + return num; + } +}; + +class random_generator { +public: + static random_generator& get_instance() { + static random_generator instance; + return instance; + } + std::mt19937& operator()() { return gen_; } + void set_seed(unsigned int seed) { gen_.seed(seed); } + +private: + random_generator() : gen_(1) {} + std::mt19937 gen_; +}; + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_int_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} + +template +inline typename std::enable_if::value, T>::type +uniform_rand(T min, T max) { + std::uniform_real_distribution dst(min, max); + return dst(random_generator::get_instance()()); +} +} //end of namespace diff --git a/libdeepgalois/include/deepgalois/utils.h b/libdeepgalois/include/deepgalois/utils.h index ca93b9da62..bf74aad196 100644 --- a/libdeepgalois/include/deepgalois/utils.h +++ b/libdeepgalois/include/deepgalois/utils.h @@ -7,8 +7,7 @@ #include #include #include -#include "deepgalois/GraphTypes.h" -//#include "galois/DistGalois.h" +#include "deepgalois/types.h" namespace deepgalois { @@ -65,50 +64,6 @@ class Timer { struct timeval elapsed_time_; }; -class PerThreadRNG { - galois::substrate::PerThreadStorage engine; - galois::substrate::PerThreadStorage> - distribution; - -public: - //! init distribution - PerThreadRNG() : distribution{0.0, 1.0} {}; - - //! thread local RNG float from 0 to 1 - float_t get_number() { - float_t num = (*distribution.getLocal())(*engine.getLocal()); - return num; - } -}; - -class random_generator { -public: - static random_generator& get_instance() { - static random_generator instance; - return instance; - } - std::mt19937& operator()() { return gen_; } - void set_seed(unsigned int seed) { gen_.seed(seed); } - -private: - random_generator() : gen_(1) {} - std::mt19937 gen_; -}; - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_int_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - -template -inline typename std::enable_if::value, T>::type -uniform_rand(T min, T max) { - std::uniform_real_distribution dst(min, max); - return dst(random_generator::get_instance()()); -} - // sequential prefix sum template inline std::vector prefix_sum(const std::vector& in) { diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 11f99e15e0..6d5b13df78 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -6,6 +6,7 @@ #include "galois/Timer.h" #include "galois/Galois.h" #include "deepgalois/utils.h" +#include "deepgalois/random.h" #include "deepgalois/math_functions.hh" #ifdef USE_MKL diff --git a/libdeepgalois/src/reader.cpp b/libdeepgalois/src/reader.cpp index bf5792fca4..d7e1bcf44b 100644 --- a/libdeepgalois/src/reader.cpp +++ b/libdeepgalois/src/reader.cpp @@ -1,6 +1,7 @@ #include "deepgalois/reader.h" #include "deepgalois/utils.h" #include "deepgalois/configs.h" +#include "galois/Galois.h" #include #include #include @@ -8,6 +9,9 @@ #include /* For open(), creat() */ #include #include +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif namespace deepgalois { diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 929881dd25..61ff3a2e58 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -1,5 
+1,8 @@ #include "galois/Galois.h" #include "deepgalois/utils.h" +#ifndef GALOIS_ENABLE_GPU +#include "galois/DistGalois.h" +#endif namespace deepgalois { From 33d2b542d9e2aea7091b503537160bab4e783652 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Wed, 15 Jul 2020 11:24:57 -0500 Subject: [PATCH 329/660] add gat_fw --- .../include/deepgalois/layers/layer.h | 1 + .../include/deepgalois/math_functions.hh | 3 + libdeepgalois/src/layers/gat_fw.h | 66 +++++++++++++++++++ libdeepgalois/src/layers/graph_conv_layer.cpp | 6 ++ libdeepgalois/src/math_functions.cpp | 6 ++ 5 files changed, 82 insertions(+) create mode 100644 libdeepgalois/src/layers/gat_fw.h diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 874e7d41c6..7ac5b8b649 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -60,6 +60,7 @@ class layer : public deepgalois::node { vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex u, i.e. v's neighbors, layer0: D x // 16, layer1: 16 x E + vec_t alpha; // parameters to learn (H x 1), only used for GAT vec_t weight_grad; // weight gradient for updating parameters float_t* d_W; float_t* d_weight_grad; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 6c002e2ffb..8f5cc25d37 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -48,6 +48,9 @@ void mul_scalar(size_t n, const float_t alpha, const float_t* x, float_t* y); //! do dot product of 2 vectors float_t dot(size_t n, const float_t* x, const float_t* y); +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z); + // SAXPY stands for โ€œSingle-precision A*X Plus Y" void axpy(size_t n, const float_t a, float_t* x, float_t* y); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h new file mode 100644 index 0000000000..3e77ebc797 --- /dev/null +++ b/libdeepgalois/src/layers/gat_fw.h @@ -0,0 +1,66 @@ +//#define USE_GAT +#ifdef USE_GAT +void graph_conv_layer::forward_propagation(const float_t* in_data, + float_t* out_data) { + galois::StatTimer conv_timer("GraphConvForward"); + conv_timer.start(); + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + + // (1) dropout + if (dropout_ && phase_ == net_phase::train) { + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, + dropout_mask, in_temp); + } else { + math::copy_cpu(x * y, in_data, in_temp); + } + + // (2) linear transformation + math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, + &layer::W[0], 0.0, out_temp); + + auto &g = *graph_cpu; + size_t n = g.size(); + size_t len = z; + float_t* in = out_temp; + float_t* out = out_data; + + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // (3) concatenation, dot product, LeakyReLU + int i = 0; + vec_t scores(deg_src); + //for (auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + vec_t concat_vec(2*z); + math::concat(z, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // alpha: learnable weight vector + scores[i++] = math::dot(2*z, &alpha[0], &concat_vec[0]); + } + + // (4) softmax to normalize the 
attention scores on each vertexโ€™s incoming edges + vec_t normalized_scores(deg_src, 0); + math::softmax(deg_src, &scores[0], &normalized_scores[0]); // normalize using softmax + math::clear_cpu(len, &out[src_idx]); + + // (5) aggregation: scaled by the attention scores + //for (auto e : g.edges(src)) { + for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = normalized_scores[dst]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); + + // (6) ReLU + if (act_) math::relu_cpu(x * z, out_data, out_data); +} +#endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 4c11086495..a6c49f615b 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -3,6 +3,7 @@ #include "deepgalois/utils.h" namespace deepgalois { +#include "gat_fw.h" //! Set this to let sync struct know where to get data from float_t* _dataToSync = nullptr; @@ -86,6 +87,9 @@ void graph_conv_layer::malloc_and_init() { // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); + // alpha is only used for GAT + rand_init_matrix(2*z, 1, alpha, 1); + if (dropout_) dropout_mask = new mask_t[x * y]; in_temp = new float_t[x * y]; @@ -95,6 +99,7 @@ void graph_conv_layer::malloc_and_init() { in_temp1 = new float_t[x * y]; } +#ifndef USE_GAT // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -146,6 +151,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, conv_timer.stop(); } +#endif // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index 6d5b13df78..a5e6b50eec 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -175,6 +175,12 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { return cblas_sdot(n, x, 1, y, 1); } +// concatenation of two vectors into one +void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { + copy_cpu(n, x, z); + copy_cpu(n, y, z+n); +} + void clear_cpu(size_t n, float_t* in) { // for (size_t i = 0; i < n; i++) in[i] = 0; std::fill(in, in + n, 0); From 234f31c56eb0492cb5b7e0db8efd5819a39285ec Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 12:52:29 -0500 Subject: [PATCH 330/660] bitset for forward sync for graphconv --- .../layers/GraphConvSyncStructures.h | 17 ++++++++- .../deepgalois/layers/graph_conv_layer.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 37 +++++++++++++++++-- 3 files changed, 49 insertions(+), 6 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index b07b672fa1..bc88656bec 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -8,8 +8,9 @@ struct GraphConvSync { //! 
return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { - ValTy vecToReturn(&deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], - deepgalois::_syncVectorSize); + ValTy vecToReturn( + &deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize], + deepgalois::_syncVectorSize); // move constructor should kick in here to avoid return copy return vecToReturn; } @@ -54,5 +55,17 @@ struct GraphConvSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; +struct Bitset_gradient { + static constexpr bool is_vector_bitset() { return false; } + + static constexpr bool is_valid() { return true; } + + static galois::DynamicBitSet& get() { return bitset_gradient; } + + static void reset_range(size_t begin, size_t end) { + bitset_gradient.reset(begin, end); + } +}; + #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index ad954215fc..f1501d39d2 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -1,7 +1,6 @@ #pragma once #include "layer.h" #include "deepgalois/layers/aggregator.h" -#include "deepgalois/layers/GraphConvSyncStructures.h" /** * GraphConv Layer; based on DGL implementation + follows TinyDNN layer diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index a6c49f615b..791b9c9a51 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,6 +2,9 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" +static galois::DynamicBitSet bitset_gradient; +#include "deepgalois/layers/GraphConvSyncStructures.h" + namespace deepgalois { #include "gat_fw.h" @@ -73,6 +76,9 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::gInfo("bitset size is going to be ", x); + bitset_gradient.resize(x); + // setup gluon layer::gradientGraph = new deepgalois::GluonGradients(layer::weight_grad, y * z); @@ -88,7 +94,7 @@ void graph_conv_layer::malloc_and_init() { zero_init_matrix(y, z, layer::weight_grad); // alpha is only used for GAT - rand_init_matrix(2*z, 1, alpha, 1); + rand_init_matrix(2 * z, 1, alpha, 1); if (dropout_) dropout_mask = new mask_t[x * y]; @@ -139,8 +145,33 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO how to do this for the sampled case? 
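  // Context for the sync below: _syncVectorSize tells the sync structure how
  // many floats each vertex contributes (one z-length row of out_data), and
  // _dataToSync points it at the matrix being synchronized; the bitset filled
  // in by this patch restricts that communication to rows that contain at
  // least one non-zero entry.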
deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - layer::context->getSyncSubstrate()->sync( - "GraphConvForward"); + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_gradient.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_gradient.set(node_id); + } + }, + galois::loopname("BitsetGraphConvForward"), galois::no_stats()); + galois::gPrint(bitset_gradient.count(), " out of ", bitset_gradient.size(), + "\n"); + layer::context->getSyncSubstrate() + ->sync( + "GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); From 1451f8550f46df2436d731f1adc7b83b2d67c645 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 13:09:43 -0500 Subject: [PATCH 331/660] backward graphconv sync bitset --- libdeepgalois/src/layers/graph_conv_layer.cpp | 29 +++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 791b9c9a51..3ba451ebf3 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -230,8 +230,33 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - layer::context->getSyncSubstrate()->sync( - "GraphConvBackward"); + galois::do_all( + galois::iterate((size_t)0, bitset_gradient.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_gradient.set(node_id); + } + }, + galois::loopname("BitsetGraphConvBackward"), galois::no_stats()); + galois::gPrint("backward ", bitset_gradient.count(), " out of ", + bitset_gradient.size(), "\n"); + + layer::context->getSyncSubstrate() + ->sync( + "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); From 374b2725a8a9c499e350fea2dae0fe202b7bc73d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 15:58:26 -0500 Subject: [PATCH 332/660] conv bitset namechange, cleanup/modularity --- .../deepgalois/layers/GradientSyncStructs.h | 3 +- .../layers/GraphConvSyncStructures.h | 13 +-- .../include/deepgalois/layers/layer.h | 1 - libdeepgalois/src/layers/graph_conv_layer.cpp | 95 +++++++++---------- 4 files changed, 45 insertions(+), 67 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 9b325311b7..26420aa30d 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -42,7 +42,6 @@ struct GradientSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; -// TODO bitset; might have to do it manually -// GALOIS_SYNC_STRUCTURE_BITSET(TODOTHIS?); +// no 
bitset; everything is sent anyways #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h index bc88656bec..570aa56d2b 100644 --- a/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h +++ b/libdeepgalois/include/deepgalois/layers/GraphConvSyncStructures.h @@ -55,17 +55,6 @@ struct GraphConvSync { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; -struct Bitset_gradient { - static constexpr bool is_vector_bitset() { return false; } - - static constexpr bool is_valid() { return true; } - - static galois::DynamicBitSet& get() { return bitset_gradient; } - - static void reset_range(size_t begin, size_t end) { - bitset_gradient.reset(begin, end); - } -}; - +GALOIS_SYNC_STRUCTURE_BITSET(conv); #endif #endif diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 7ac5b8b649..5d4aae6023 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -18,7 +18,6 @@ #ifndef GALOIS_ENABLE_GPU #include "galois/graphs/GluonSubstrate.h" #include "deepgalois/layers/GluonGradients.h" -#include "deepgalois/layers/GradientSyncStructs.h" #endif namespace deepgalois { diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 3ba451ebf3..1d543f0a78 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -2,8 +2,10 @@ #include "deepgalois/math_functions.hh" #include "deepgalois/utils.h" -static galois::DynamicBitSet bitset_gradient; +static galois::DynamicBitSet bitset_conv; + #include "deepgalois/layers/GraphConvSyncStructures.h" +#include "deepgalois/layers/GradientSyncStructs.h" namespace deepgalois { #include "gat_fw.h" @@ -76,8 +78,8 @@ void graph_conv_layer::malloc_and_init() { size_t y = input_dims[1]; size_t z = output_dims[1]; - galois::gInfo("bitset size is going to be ", x); - bitset_gradient.resize(x); + galois::gInfo("conv bitset size is going to be ", x); + bitset_conv.resize(x); // setup gluon layer::gradientGraph = @@ -86,6 +88,7 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); + galois::gInfo("gradient bitset size is going to be ", y * z); // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); @@ -105,6 +108,34 @@ void graph_conv_layer::malloc_and_init() { in_temp1 = new float_t[x * y]; } +namespace { +void set_conv_bitset() { + // bitset setting + galois::do_all( + galois::iterate((size_t)0, bitset_conv.size()), + [&](size_t node_id) { + bool set_true = false; + // check for non-zeros; the moment one is found, set true becomes true + // and we break out of the loop + for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { + auto val = + deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + + i]; + if (val != 0) { + set_true = true; + break; + } + } + + if (set_true) { + bitset_conv.set(node_id); + } + }, + galois::loopname("BitsetGraphConv"), galois::no_stats()); +} + +} // end anonymous namespace + #ifndef USE_GAT // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) void graph_conv_layer::forward_propagation(const float_t* in_data, @@ -145,33 +176,12 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // TODO how to do this for the 
sampled case? deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; - // bitset setting - galois::do_all( - galois::iterate((size_t)0, bitset_gradient.size()), - [&](size_t node_id) { - bool set_true = false; - // check for non-zeros; the moment one is found, set true becomes true - // and we break out of the loop - for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { - auto val = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + - i]; - if (val != 0) { - set_true = true; - break; - } - } + set_conv_bitset(); + galois::gPrint("forward ", bitset_conv.count(), " out of ", + bitset_conv.size(), "\n"); - if (set_true) { - bitset_gradient.set(node_id); - } - }, - galois::loopname("BitsetGraphConvForward"), galois::no_stats()); - galois::gPrint(bitset_gradient.count(), " out of ", bitset_gradient.size(), - "\n"); layer::context->getSyncSubstrate() - ->sync( - "GraphConvForward"); + ->sync("GraphConvForward"); // run relu activation on output if specified galois::StatTimer relu_timer("GraphConvForwardRelu"); @@ -230,32 +240,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // sync agg deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; - galois::do_all( - galois::iterate((size_t)0, bitset_gradient.size()), - [&](size_t node_id) { - bool set_true = false; - // check for non-zeros; the moment one is found, set true becomes true - // and we break out of the loop - for (size_t i = 0; i < deepgalois::_syncVectorSize; i++) { - auto val = - deepgalois::_dataToSync[node_id * deepgalois::_syncVectorSize + - i]; - if (val != 0) { - set_true = true; - break; - } - } - - if (set_true) { - bitset_gradient.set(node_id); - } - }, - galois::loopname("BitsetGraphConvBackward"), galois::no_stats()); - galois::gPrint("backward ", bitset_gradient.count(), " out of ", - bitset_gradient.size(), "\n"); + set_conv_bitset(); + galois::gPrint("backward ", bitset_conv.count(), " out of ", + bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() - ->sync( + ->sync( + //->sync( "GraphConvBackward"); galois::StatTimer drop_timer("GraphConvBackwardDropout"); From ac136dbf443c7e6091edef4db54f27f8cdb5a8db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 18 Jul 2020 17:12:16 -0500 Subject: [PATCH 333/660] gnn-cvc --- .../galois/graphs/GenericPartitioners.h | 88 +++++++++++++++++++ lonestar/gnn/include/DistributedGraphLoader.h | 8 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 15 ++-- 3 files changed, 103 insertions(+), 8 deletions(-) diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index f1a0809f37..3f0d30e212 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -956,4 +956,92 @@ class GnnOEC : public galois::graphs::CustomMasterAssignment { } }; +class GnnCVC : public galois::graphs::CustomMasterAssignment { + unsigned numRowHosts; + unsigned numColumnHosts; + unsigned _h_offset; + + void factorizeHosts() { + numColumnHosts = sqrt(_numHosts); + + while ((_numHosts % numColumnHosts) != 0) + numColumnHosts--; + + numRowHosts = _numHosts / numColumnHosts; + assert(numRowHosts >= numColumnHosts); + + if (_hostID == 0) { + galois::gPrint("Cartesian grid: ", numRowHosts, " x ", numColumnHosts, + "\n"); + } + } + + //! Returns the grid row ID of this host + unsigned gridRowID() const { return (_hostID / numColumnHosts); } + //! 
Returns the grid row ID of the specified host + unsigned gridRowID(unsigned id) const { return (id / numColumnHosts); } + //! Returns the grid column ID of this host + unsigned gridColumnID() const { return (_hostID % numColumnHosts); } + //! Returns the grid column ID of the specified host + unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + + //! Find the column of a particular node + unsigned getColumnOfNode(uint64_t gid) const { + return gridColumnID(retrieveMaster(gid)); + } + +public: + GnnCVC(uint32_t hostID, uint32_t numHosts, uint64_t numNodes, + uint64_t numEdges) + : galois::graphs::CustomMasterAssignment(hostID, numHosts, numNodes, + numEdges) { + factorizeHosts(); + _h_offset = gridRowID() * numColumnHosts; + }; + + template + uint32_t getMaster(uint32_t src, galois::graphs::BufferedGraph&, + const std::vector&, + std::unordered_map&, + const std::vector&, + std::vector>&, + const std::vector&, + std::vector>&) { + // this is expected to be set + return _globalHostMap[src]; + } + + uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } + + uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { + int i = getColumnOfNode(dst); + return _h_offset + i; + } + + bool noCommunication() { return false; } + bool isVertexCut() const { + if ((numRowHosts == 1) || (numColumnHosts == 1)) + return false; + return true; + } + + void serializePartition(boost::archive::binary_oarchive&) {} + void deserializePartition(boost::archive::binary_iarchive&) {} + std::pair cartesianGrid() { + return std::make_pair(numRowHosts, numColumnHosts); + } + + bool predeterminedMapping(std::vector& mappings) { + if (mappings.size() != _numNodes) { + GALOIS_DIE("predetermined mapping size not equal to num nodes"); + } + _globalHostMap.resize(_numNodes); + + galois::do_all(galois::iterate((size_t)0, mappings.size()), + [&](size_t n) { _globalHostMap[n] = mappings[n]; }); + + return true; + } +}; + #endif diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index f3755a886f..65104a6031 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -53,7 +53,8 @@ enum PARTITIONING_SCHEME { FENNEL_O, //!< Fennel, oec FENNEL_I, //!< Fennel, iec SUGAR_O, //!< Sugar, oec - GNN_OEC //!< gnn, oec + GNN_OEC, //!< gnn, oec + GNN_CVC //!< gnn, cvc }; /** @@ -88,6 +89,8 @@ inline const char* EnumToString(PARTITIONING_SCHEME e) { return "sugar-oec"; case GNN_OEC: return "gnn-oec"; + case GNN_CVC: + return "gnn-cvc"; default: GALOIS_DIE("Unsupported partition"); } @@ -145,6 +148,9 @@ DistGraph* constructSymmetricGraph(std::vector&) { case GNN_OEC: return cuspPartitionGraph( inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case GNN_CVC: + return cuspPartitionGraph( + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 71953ea53e..e18340fe82 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -32,11 +32,12 @@ namespace cll = llvm::cl; cll::opt partitionScheme( "partition", cll::desc("Type of partitioning."), - cll::values(clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), - clEnumValN(IEC, "iec", "Incoming Edge-Cut"), - clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), - 
clEnumValN(CART_VCUT_IEC, "cvc-iec", - "Cartesian Vertex-Cut of iec"), - clEnumValN(GNN_OEC, "g-oec", - "gnn oec: train nodes evenly distributed")), + cll::values( + clEnumValN(OEC, "oec", "Outgoing Edge-Cut (default)"), + clEnumValN(IEC, "iec", "Incoming Edge-Cut"), + clEnumValN(CART_VCUT, "cvc", "Cartesian Vertex-Cut of oec"), + clEnumValN(CART_VCUT_IEC, "cvc-iec", "Cartesian Vertex-Cut of iec"), + clEnumValN(GNN_OEC, "g-oec", "gnn oec: train nodes evenly distributed"), + clEnumValN(GNN_CVC, "g-cvc", + "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); From 33661539fc9276f1f0e6f21a57006ed7156bad8e Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sun, 19 Jul 2020 10:37:46 -0500 Subject: [PATCH 334/660] update gat --- .../deepgalois/layers/graph_conv_layer.h | 1 + .../include/deepgalois/layers/layer.h | 21 ++- .../include/deepgalois/math_functions.hh | 2 + libdeepgalois/src/layers/gat_fw.h | 171 +++++++++++++----- libdeepgalois/src/layers/graph_conv_layer.cpp | 92 ++++++---- libdeepgalois/src/math_functions.cpp | 8 + 6 files changed, 208 insertions(+), 87 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index f1501d39d2..d112ddf785 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -74,6 +74,7 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x mask_t* dropout_mask; // x*y + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 // Glorot & Bengio (AISTATS 2010) inline void rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 5d4aae6023..534d99b821 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -57,18 +57,25 @@ class layer : public deepgalois::node { bool trainable_; // is this layer trainable bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E - vec_t Q; // parameters to learn, for vertex u, i.e. 
v's neighbors, layer0: D x - // 16, layer1: 16 x E - vec_t alpha; // parameters to learn (H x 1), only used for GAT + vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; - float_t* d_weight_grad; + float_t* d_W; // parameters to learn on device (GPU) + float_t* d_weight_grad; // weight gradient on device (GPU) + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; + mask_t* d_masks_; // masks on device (GPU) float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; - float_t* norm_consts; + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores + vec_t norm_scores_grad; // gradients of normalized scores // TODO #ifdef GALOIS_ENABLE_GPU GraphGPU* graph_gpu; diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 8f5cc25d37..38f461620a 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -81,6 +81,8 @@ void relu_cpu(size_t n, const float_t* in, float_t* out); void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU +void leaky_relu(float_t epsilon, float_t in, float_t &out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out); void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h index 3e77ebc797..e9a7bada37 100644 --- a/libdeepgalois/src/layers/gat_fw.h +++ b/libdeepgalois/src/layers/gat_fw.h @@ -1,5 +1,108 @@ //#define USE_GAT #ifdef USE_GAT +// `Graph Attention Network ` +// NOTE: GAT paper uses "first concatenation then linear projection" +// to compute attention scores, while ours is "first projection then +// addition", the two approaches are mathematically equivalent: +// We decompose the weight vector a mentioned in the paper into +// [a_l || a_r], then a^T [Wh_i || Wh_j] = a_l Wh_i + a_r Wh_j +// Our implementation is much efficient because we do not need to +// save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, +// addition could be optimized with DGL's built-in function u_add_v, +// which further speeds up computation and saves memory footprint. 
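+// As an illustrative sketch (the numbers below are made up, not taken from
+// the code): with a_l = [1, 2], a_r = [3, 4], Wh_i = [5, 6], Wh_j = [7, 8],
+//   a^T [Wh_i || Wh_j] = 1*5 + 2*6 + 3*7 + 4*8 = 70, and
+//   a_l . Wh_i + a_r . Wh_j = (5 + 12) + (21 + 32) = 70,
+// so the two formulations agree without ever materializing [Wh_i || Wh_j].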
+ +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + size_t n = g.size(); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto deg_src = g.get_degree(src); + + // concatenation, dot product, LeakyReLU + //int i = 0; + //vec_t scores(deg_src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + // alpha: learnable weight vector (shared by all vertices) + float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + //vec_t concat_vec(2*len); + //math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + //float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); + temp_scores[e] = src_score + dst_score; + math::leaky_relu(epsilon, temp_scores[e], scores[e]); + } + + // softmax to normalize the attention scores on each vertexโ€™s incoming edges + //vec_t normalized_scores(deg_src, 0); + //math::softmax(deg_src, &scores[0], &normalized_scores[0]); + math::softmax(deg_src, &scores[begin], &norm_scores[begin]); + + // aggregation: scaled by the attention scores + math::clear_cpu(len, &out[src_idx]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto score = norm_scores[e]; + vec_t neighbor(len); + math::scale(len, score, &in[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); + } + }); +} + +void graph_conv_layer::d_compute_scores(size_t len, Graph& g, + const float_t* in_data, + const float_t *out_data, + const float_t* in_grad) { + size_t n = g.size(); + + // compute gradients for the learnable vector `alpha` + //vec_t temp_grad(n*n); + //math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // in_grad, 0.0, temp_grad); + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); + auto deg_src = g.get_degree(src); + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + &scores_grad[begin], &norm_scores_grad[begin]); + for (auto e = begin; e != end; e++) { + auto dst = g.getEdgeDst(e); + // use norm_scores_grad as temp_scores_grad since its data is useless already + math::d_leaky_relu(epsilon, &scores_grad[e], + &temp_scores[e], &norm_scores_grad[e]); + math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); + math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); + } + }); +} + +void graph_conv_layer::d_aggregate(size_t len, Graph& g, + const float_t* in_grad, float_t* out_grad) { + size_t n = g.size(); + + // aggregation: the derivative is transposed; + // the graph is undirected (structurally symmetric), + // but values are not the same for the symmetric positions + galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { + auto src_idx = src * len; + auto src_begin = g.edge_begin(src); + for (auto e = src_begin; e != g.edge_end(src); e++) { + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; + auto dst_begin = g.edge_begin(dst); + auto score = norm_scores[dst_begin+e-src_begin]; // transposed + vec_t neighbor(len); + math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); + math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); + } + }); +} + void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { 
galois::StatTimer conv_timer("GraphConvForward"); @@ -8,7 +111,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t y = input_dims[1]; size_t z = output_dims[1]; - // (1) dropout + // dropout if (dropout_ && phase_ == net_phase::train) { math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, in_temp); @@ -16,51 +119,37 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, math::copy_cpu(x * y, in_data, in_temp); } - // (2) linear transformation + // linear transformation math::sgemm_cpu(CblasNoTrans, CblasNoTrans, x, z, y, 1.0, in_temp, &layer::W[0], 0.0, out_temp); - auto &g = *graph_cpu; - size_t n = g.size(); - size_t len = z; - float_t* in = out_temp; - float_t* out = out_data; + // aggregation + aggregate(z, *graph_cpu, out_temp, out_data); - galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto src_idx = src * len; - auto deg_src = g.get_degree(src); - - // (3) concatenation, dot product, LeakyReLU - int i = 0; - vec_t scores(deg_src); - //for (auto e : g.edges(src)) { - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; - vec_t concat_vec(2*z); - math::concat(z, &in[src_idx], &in[dst_idx], &concat_vec[0]); - // alpha: learnable weight vector - scores[i++] = math::dot(2*z, &alpha[0], &concat_vec[0]); - } + // ReLU + if (act_) math::relu_cpu(x * z, out_data, out_data); +} - // (4) softmax to normalize the attention scores on each vertexโ€™s incoming edges - vec_t normalized_scores(deg_src, 0); - math::softmax(deg_src, &scores[0], &normalized_scores[0]); // normalize using softmax - math::clear_cpu(len, &out[src_idx]); +void graph_conv_layer::back_propagation(const float_t* in_data, + const float_t* out_data, + float_t* out_grad, float_t* in_grad) { + size_t x = input_dims[0]; + size_t y = input_dims[1]; + size_t z = output_dims[1]; + if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); - // (5) aggregation: scaled by the attention scores - //for (auto e : g.edges(src)) { - for (auto e = g.edge_begin(src); e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; - auto score = normalized_scores[dst]; - vec_t neighbor(len); - math::scale(len, score, &in[dst_idx], &neighbor[0]); - math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); - } - }); - - // (6) ReLU - if (act_) math::relu_cpu(x * z, out_data, out_data); + // compute gradients for alpha (alpha is a learnable vector) + d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); + // compute gradients for feature vectors + d_aggregate(z, *graph_cpu, out_grad, out_temp); + if (level_ != 0) { + math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], + 0.0, in_grad); // x*z; z*y -> x*y + math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, + 0.0, &layer::weight_grad[0]); // y*x; x*z; y*z + } + if (level_ != 0 && dropout_) + math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); } + #endif diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 1d543f0a78..a17f6527bc 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -37,42 +37,6 @@ inline void graph_conv_layer::zero_init_matrix(size_t dim_x, size_t dim_y, } } -// aggregate based on graph topology -void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { - galois::StatTimer 
aggregate_timer("AggregateTime"); - aggregate_timer.start(); - // normalization constant based on graph structure -#ifdef USE_MKL - update_all_csrmm(len, g, in, out, norm_, norm_consts); -#else - update_all(len, g, in, out, norm_, norm_consts); -#endif - aggregate_timer.stop(); -} - -// since graph is symmetric, the derivative is the same -void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, - float_t* out) { - galois::StatTimer aggregate_timer("AggregateDerivativeTime"); - aggregate_timer.start(); -#ifdef USE_MKL - update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z -#else - update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z -#endif - aggregate_timer.stop(); -} - -void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, - const float_t* neighbors, float_t* out) { - float_t* a = new float_t[len]; - float_t* b = new float_t[len]; - math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); - math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); - math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors -} - void graph_conv_layer::malloc_and_init() { size_t x = input_dims[0]; size_t y = input_dims[1]; @@ -92,12 +56,26 @@ void graph_conv_layer::malloc_and_init() { // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); + //rand_init_matrix(y, z, Q, 1); // for GraphSAGE - // rand_init_matrix(y, z, Q); zero_init_matrix(y, z, layer::weight_grad); +#ifdef USE_GAT // alpha is only used for GAT - rand_init_matrix(2 * z, 1, alpha, 1); + rand_init_matrix(z, 1, alpha_l, 1); + rand_init_matrix(z, 1, alpha_r, 1); + alpha_lgrad.resize(2*z); + alpha_rgrad.resize(2*z); + std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); + std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); + auto ne = graph_cpu->sizeEdges(); // number of edges + scores.resize(ne); // a score for each edge + temp_scores.resize(ne); + scores_grad.resize(ne); + norm_scores.resize(ne); + norm_scores_grad.resize(ne); + epsilon = 0.2; // LeakyReLU angle of negative slope +#endif if (dropout_) dropout_mask = new mask_t[x * y]; @@ -136,7 +114,43 @@ void set_conv_bitset() { } // end anonymous namespace +void graph_conv_layer::combine(size_t n, size_t len, const float_t* self, + const float_t* neighbors, float_t* out) { + float_t* a = new float_t[len]; + float_t* b = new float_t[len]; + math::mvmul(CblasNoTrans, n, len, 1.0, &Q[0], self, 0.0, a); + math::mvmul(CblasNoTrans, n, len, 1.0, &W[0], neighbors, 0.0, b); + math::vadd_cpu(len, a, b, out); // out = W*self + Q*neighbors +} + #ifndef USE_GAT +// aggregate based on graph topology +void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateTime"); + aggregate_timer.start(); + // normalization constant based on graph structure +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); +#else + update_all(len, g, in, out, norm_, norm_consts); +#endif + aggregate_timer.stop(); +} + +// since graph is symmetric, the derivative is the same +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in, + float_t* out) { + galois::StatTimer aggregate_timer("AggregateDerivativeTime"); + aggregate_timer.start(); +#ifdef USE_MKL + update_all_csrmm(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#else + update_all(len, g, in, out, norm_, norm_consts); // x*x; x*z -> x*z +#endif + aggregate_timer.stop(); +} + // ๐’‰[๐‘™] = ฯƒ(๐‘Š * ฮฃ(๐’‰[๐‘™-1])) 
void graph_conv_layer::forward_propagation(const float_t* in_data, float_t* out_data) { @@ -192,7 +206,6 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, conv_timer.stop(); } -#endif // ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™โˆ’1] = ๐œ•๐ธ / ๐œ•๐‘ฆ[๐‘™] โˆ— ๐‘Š ^๐‘‡ void graph_conv_layer::back_propagation(const float_t* in_data, @@ -259,6 +272,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } +#endif acc_t graph_conv_layer::get_weight_decay_loss() { return math::l2_norm(input_dims[1] * output_dims[1], &layer::W[0]); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index a5e6b50eec..aed0ac79b9 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -244,6 +244,14 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, galois::chunk_size<64>(), galois::loopname("d_relu")); } +void leaky_relu(float_t epsilon, float_t in, float_t &out) { + out = in > 0.0 ? in : epsilon * in; +} + +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out) { + out = in * (data > 0.0 ? 1.0 : epsilon); +} + void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out) { // TODO: vectorize From 2eeea3c20fd85be382106d43318a1682d2c84780 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 25 Jul 2020 12:49:02 -0500 Subject: [PATCH 335/660] print statement fixing Reducing debug prints, reducing amount of times test/train acc is printed in distributed execution (cleans up stdout). --- libdeepgalois/src/Train.cpp | 19 +++++++++++++++---- libdeepgalois/src/layers/graph_conv_layer.cpp | 6 +----- lonestar/gnn/include/engine.h | 9 ++++++++- 3 files changed, 24 insertions(+), 10 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 992902e7b6..25b0c47d6f 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -20,7 +20,7 @@ Net::Net(std::string dataset_str, int nt, unsigned n_conv, int epochs, val_interval(val_itv), num_subgraphs(1), is_selfloop(selfloop) { // init some identifiers for this host unsigned myID = 0; -#ifdef GALOIS_ENABLE_DIST +#ifndef GALOIS_ENABLE_GPU myID = galois::runtime::getSystemNetworkInterface().ID; #endif this->header = "[" + std::to_string(myID) + "] "; @@ -91,6 +91,9 @@ void Net::train(optimizer* opt, bool need_validate) { std::string separator = "\n"; double total_train_time = 0.0; int num_subg_remain = 0; +#ifndef GALOIS_ENABLE_GPU + unsigned hostID = galois::runtime::getSystemNetworkInterface().ID; +#endif if (subgraph_sample_size) { galois::StatTimer construct_time("SubgraphAllocateTime"); @@ -195,7 +198,9 @@ void Net::train(optimizer* opt, bool need_validate) { #ifdef GALOIS_ENABLE_GPU std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else - galois::gPrint(header, "Epoch ", std::setw(3), curEpoch, separator); + if (hostID == 0) { + galois::gPrint("Epoch ", std::setw(3), curEpoch, separator); + } #endif set_netphases(net_phase::train); acc_t train_loss = 0.0, train_acc = 0.0; @@ -224,8 +229,10 @@ void Net::train(optimizer* opt, bool need_validate) { std::cout << header << "train_loss " << std::setprecision(3) << std::fixed << train_loss << " train_acc " << train_acc << " "; #else - galois::gPrint(header, "train_loss ", std::setprecision(3), std::fixed, - train_loss, " train_acc ", train_acc, separator); + if (hostID == 0) { + galois::gPrint(header, "train_loss ", 
std::setprecision(3), std::fixed, + train_loss, " train_acc ", train_acc, separator); + } #endif t_epoch.Stop(); @@ -243,11 +250,13 @@ void Net::train(optimizer* opt, bool need_validate) { << epoch_time + val_time << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; #else + if (hostID == 0) { galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, val_loss, " val_acc ", val_acc, separator); galois::gPrint(header, "time ", std::setprecision(3), std::fixed, epoch_time + val_time, " ms (train_time ", epoch_time, " val_time ", val_time, ")\n"); + } #endif } else { #ifdef GALOIS_ENABLE_GPU @@ -255,8 +264,10 @@ void Net::train(optimizer* opt, bool need_validate) { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else + if (hostID == 0) { galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, ")\n"); + } #endif } } // epoch loop diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index a17f6527bc..58da90e9ad 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -191,8 +191,6 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_data; set_conv_bitset(); - galois::gPrint("forward ", bitset_conv.count(), " out of ", - bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() ->sync("GraphConvForward"); @@ -254,8 +252,6 @@ void graph_conv_layer::back_propagation(const float_t* in_data, deepgalois::_syncVectorSize = z; deepgalois::_dataToSync = out_temp; set_conv_bitset(); - galois::gPrint("backward ", bitset_conv.count(), " out of ", - bitset_conv.size(), "\n"); layer::context->getSyncSubstrate() ->sync( @@ -269,7 +265,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, drop_timer.stop(); layer::syncSub->sync("Gradients"); - galois::gInfo("[", layer::gradientGraph->myHostID(), "] Sync done"); + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } #endif diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 25644c720d..f4bbf8e5b5 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -105,10 +105,17 @@ int main(int argc, char** argv) { Ttest.start(); acc_t test_loss = 0.0, test_acc = 0.0; double test_time = network.evaluate("test", test_loss, test_acc); +#ifndef GALOIS_ENABLE_GPU + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gPrint("test_loss = ", test_loss, " test_acc = ", test_acc, + " test_time = ", test_time, "\n"); + } +#else galois::gPrint("Testing: test_loss = ", test_loss, " test_acc = ", test_acc, " test_time = ", test_time, "\n"); +#endif Ttest.stop(); } - galois::gPrint("\n", rm.get_peak_memory(), "\n\n"); + galois::gInfo(rm.get_peak_memory()); return 0; } From 09141e09719f3693e2173abb7a88bab748e58460 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 25 Jul 2020 13:54:27 -0500 Subject: [PATCH 336/660] report test epoch accuracy to time elapsed used to get accuracy gain to time plots --- libdeepgalois/src/Train.cpp | 38 ++++++++++++++++++++++++------------- 1 file changed, 25 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 25b0c47d6f..67533bc55d 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -222,6 +222,8 @@ void Net::train(optimizer* opt, bool need_validate) 
{ // for next epoch Net::update_weights(opt); // update parameters + t_epoch.Stop(); + // validation / testing set_netphases(net_phase::test); @@ -234,12 +236,21 @@ void Net::train(optimizer* opt, bool need_validate) { train_loss, " train_acc ", train_acc, separator); } #endif - t_epoch.Stop(); double epoch_time = t_epoch.Millisecs(); total_train_time += epoch_time; - if (need_validate && curEpoch % val_interval == 0) { + // report current total time + accuracy as a stat +#ifndef GALOIS_ENABLE_GPU + if (hostID == 0) { + galois::runtime::reportParam( + std::string("GNN"), + "Epoch" + std::to_string(curEpoch) + "TestAccuracyAndTime", + std::to_string(train_acc) + ";" + std::to_string(total_train_time)); + } +#endif + + if (need_validate && (curEpoch % val_interval == 0)) { // Validation acc_t val_loss = 0.0, val_acc = 0.0; double val_time = evaluate("val", val_loss, val_acc); @@ -250,13 +261,13 @@ void Net::train(optimizer* opt, bool need_validate) { << epoch_time + val_time << " ms (train_time " << epoch_time << " val_time " << val_time << ")\n"; #else - if (hostID == 0) { - galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, - val_loss, " val_acc ", val_acc, separator); - galois::gPrint(header, "time ", std::setprecision(3), std::fixed, - epoch_time + val_time, " ms (train_time ", epoch_time, - " val_time ", val_time, ")\n"); - } + if (hostID == 0) { + galois::gPrint(header, "val_loss ", std::setprecision(3), std::fixed, + val_loss, " val_acc ", val_acc, separator); + galois::gPrint(header, "time ", std::setprecision(3), std::fixed, + epoch_time + val_time, " ms (train_time ", epoch_time, + " val_time ", val_time, ")\n"); + } #endif } else { #ifdef GALOIS_ENABLE_GPU @@ -264,10 +275,11 @@ void Net::train(optimizer* opt, bool need_validate) { << " ms (fw " << fw_time << ", bw " << epoch_time - fw_time << ")\n"; #else - if (hostID == 0) { - galois::gPrint(header, "train_time ", std::fixed, epoch_time, " ms (fw ", - fw_time, ", bw ", epoch_time - fw_time, ")\n"); - } + if (hostID == 0) { + galois::gPrint(header, "train_time ", std::fixed, epoch_time, + " ms (fw ", fw_time, ", bw ", epoch_time - fw_time, + ")\n"); + } #endif } } // epoch loop From f6ba338c58784528b54939194cc21ace40164356 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 1 Aug 2020 12:47:54 -0500 Subject: [PATCH 337/660] very minor print change --- libdeepgalois/src/Train.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 67533bc55d..3a1b1c397e 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -199,7 +199,7 @@ void Net::train(optimizer* opt, bool need_validate) { std::cout << header << "Epoch " << std::setw(3) << curEpoch << " "; #else if (hostID == 0) { - galois::gPrint("Epoch ", std::setw(3), curEpoch, separator); + galois::gInfo("Epoch ", std::setw(3), curEpoch); } #endif set_netphases(net_phase::train); From b07f050ede97905984db7b49a73c76c99ecd6126 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Sep 2020 18:08:45 -0500 Subject: [PATCH 338/660] sync only if not last layer (backward) --- .../deepgalois/layers/GradientSyncStructs.h | 2 +- libdeepgalois/src/Train.cpp | 2 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 26 +++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 26420aa30d..2c32f13be2 100644 --- 
a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -9,7 +9,7 @@ struct GradientSync { static ValTy extract(uint32_t, float_t& weight) { return weight; } - static bool reduce(uint32_t, float_t& weight, ValTy y) { + static bool reduce(uint32_t, float_t&, ValTy) { // TODO merge function here // for now make sure the weights are close enough // if (std::abs(weight - y) > 0.00001) { diff --git a/libdeepgalois/src/Train.cpp b/libdeepgalois/src/Train.cpp index 3a1b1c397e..4275232baa 100644 --- a/libdeepgalois/src/Train.cpp +++ b/libdeepgalois/src/Train.cpp @@ -491,7 +491,7 @@ acc_t Net::fprop(size_t gBegin, size_t gEnd, size_t gCount, mask_t* gMasks) { layers[num_layers - 1]->set_sample_mask(gBegin, gEnd, gCount, gMasks); for (size_t i = 0; i < num_layers; i++) { - // galois::gPrint(header, "fprop: layer ", i, " forward call\n"); + galois::gPrint(header, "fprop: layer ", i, " forward call\n"); layers[i]->forward(); } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 58da90e9ad..9320ade39c 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -52,7 +52,7 @@ void graph_conv_layer::malloc_and_init() { new galois::graphs::GluonSubstrate( *layer::gradientGraph, layer::gradientGraph->myHostID(), layer::gradientGraph->numHosts(), false); - galois::gInfo("gradient bitset size is going to be ", y * z); + galois::gInfo("gradient bitset size is going to be ", y * z, " ", y, " ", z); // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); @@ -159,6 +159,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; + galois::gPrint("forward ", x, " ", y, " ", z, "\n"); galois::StatTimer drop_timer("GraphConvForwardDropout"); drop_timer.start(); @@ -192,6 +193,7 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, deepgalois::_dataToSync = out_data; set_conv_bitset(); + galois::gPrint("forward ", x, " ", y, " ", z, " sync calling\n"); layer::context->getSyncSubstrate() ->sync("GraphConvForward"); @@ -229,10 +231,11 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions - if (level_ != 0) // no need to calculate in_grad for the first layer + if (level_ != 0) {// no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y + } // calculate weight gradients using input data; multiplied by gradients from // last back prop step math::sgemm_cpu(CblasTrans, CblasNoTrans, y, z, x, 1.0, in_data, out_temp, @@ -249,15 +252,16 @@ void graph_conv_layer::back_propagation(const float_t* in_data, compute_timer.stop(); // sync agg - deepgalois::_syncVectorSize = z; - deepgalois::_dataToSync = out_temp; - set_conv_bitset(); - - layer::context->getSyncSubstrate() - ->sync( - //->sync( - "GraphConvBackward"); - + //galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + if (level_ != 0) { + deepgalois::_syncVectorSize = y; + deepgalois::_dataToSync = in_grad; + set_conv_bitset(); + layer::context->getSyncSubstrate() + 
->sync( + //->sync( + "GraphConvBackward"); + } galois::StatTimer drop_timer("GraphConvBackwardDropout"); drop_timer.start(); if (level_ != 0 && dropout_) From 57d1596e5f3397f571591d81d8e5be5f471eac16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 14 Sep 2020 16:38:09 -0500 Subject: [PATCH 339/660] GNNGraph initial load implementation This commit adds the GNNGraph class with the implementation of the load features done: partitions, loads features, labels, and masks, and initializes the sync substrate. Some thought has to be put in on how to access the feature array being synchronized, however. Before it was done via a global... --- CMakeLists.txt | 1 + libgnn/CMakeLists.txt | 34 +++ libgnn/include/galois/GNNTypes.h | 15 ++ libgnn/include/galois/graphs/GNNGraph.h | 143 +++++++++++++ libgnn/src/GNNGraph.cpp | 265 ++++++++++++++++++++++++ libgnn/test/CMakeLists.txt | 5 + libgnn/test/gnngraph-test.cpp | 33 +++ 7 files changed, 496 insertions(+) create mode 100644 libgnn/CMakeLists.txt create mode 100644 libgnn/include/galois/GNNTypes.h create mode 100644 libgnn/include/galois/graphs/GNNGraph.h create mode 100644 libgnn/src/GNNGraph.cpp create mode 100644 libgnn/test/CMakeLists.txt create mode 100644 libgnn/test/gnngraph-test.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index d82f802c97..fc01f4a1ef 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -238,6 +238,7 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + add_subdirectory(libgnn) endif() # TODO(loc) prefix with GALOIS diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt new file mode 100644 index 0000000000..a44b94f427 --- /dev/null +++ b/libgnn/CMakeLists.txt @@ -0,0 +1,34 @@ +set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") +SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) +set(BLAS_LIB "-lopenblas -lpthread") +if(USE_MKL_BLAS) + link_directories(${INTEL_LIBS_DIR}) + message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") + SET(BLAS_INC_DIR ${MKL_ROOT}/include) + SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) + set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") +endif() + +# blas library +include_directories(${BLAS_INC_DIR}) +link_directories(${BLAS_LIB_DIR}) + +set(sources + src/GNNGraph.cpp +) + +add_library(galois_gnn STATIC ${sources}) +target_link_libraries(galois_gnn galois_shmem) +target_link_libraries(galois_gnn ${MPI_CXX_LIBRARIES}) +target_link_libraries(galois_gnn ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) +target_include_directories(galois_gnn PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include +) + +set_target_properties(galois_gnn PROPERTIES EXPORT_NAME gluon) + +add_subdirectory(test) diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h new file mode 100644 index 0000000000..aaabaa15e0 --- /dev/null +++ b/libgnn/include/galois/GNNTypes.h @@ -0,0 +1,15 @@ +#pragma once +//! @file GNNTypes.h +//! Typedefs used by the Galois GNN code + +#include + +namespace galois { +//! Floating point type to use throughout GNN compute; typedef'd so it's easier +//! to flip later +using GNNFloat = float; +//! Type of the labels for a vertex +using GNNLabel = uint8_t; +//! 
Type of a feature on vertices +using GNNFeature = float; +} // end namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h new file mode 100644 index 0000000000..8bba9609fc --- /dev/null +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -0,0 +1,143 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "galois/graphs/GluonSubstrate.h" + +namespace galois { + +// TODO remove the need to hardcode this path +//! Path to location of all gnn files +static const std::string gnn_dataset_path = + "/net/ohm/export/iss/inputs/Learning/"; + +//! Helper struct to maintain start/end/size of any particular range. Mostly +//! used for mask ranges. +struct GNNRange { + size_t begin{0}; + size_t end{0}; + size_t size{0}; +}; + +namespace graphs { + +//! Possible partitioning schemes for the GNN graph +enum class GNNPartitionScheme { kOEC, kCVC }; + +//! XXX +class GNNGraph { +public: + // using LocalGraphType = LearningGraph; + using GNNDistGraph = galois::graphs::DistGraph; + + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) + GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, + bool has_single_class_label); + +private: + //! In a multi-host setting, this variable stores the host id that the graph + //! is currently running on + unsigned host_id_; + //! Number of classes for a single vertex label + size_t num_label_classes_{1}; + //! Length of a feature node + size_t node_feature_length_{0}; + //! Partitioned graph + std::unique_ptr partitioned_graph_; + // XXX is this necessary + //! Copy of underlying topology of the distributed graph + // std::unique_ptr local_graph_; + //! Sync substrate for the partitioned graph + std::unique_ptr> sync_substrate_; + //! Ground truth label for nodes in the partitioned graph; Nx1 if single + //! class, N x num classes if multi-class label + std::unique_ptr local_ground_truth_labels_; + //! Feature vectors for nodes in partitioned graph + std::unique_ptr local_node_features_; + + // TODO maybe revisit this and use an actual bitset + //! Bitset indicating which nodes are training nodes + std::unique_ptr local_training_mask_; + //! Bitset indicating which nodes are validation nodes + std::unique_ptr local_validation_mask_; + //! Bitset indicating which nodes are testing nodes + std::unique_ptr local_testing_mask_; + + //! Global mask range for training nodes; must convert to LIDs when using + //! in this class + GNNRange global_training_mask_range_; + //! Global mask range for validation nodes; must convert to LIDs when using + //! in this class + GNNRange global_validation_mask_range_; + //! Global mask range for testing nodes; must convert to LIDs when using + //! in this class + GNNRange global_testing_mask_range_; + + // XXX figure out what this is really used for + //! Normalization constant based on structure of the graph + std::vector norm_factors_; + + // TODO vars for subgraphs as necessary + + //! Read labels of local nodes only + void ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label); + //! Read features of local nodes only + void ReadLocalFeatures(const std::string& dataset_str); + //! Helper function to read masks from file into the appropriate structures + //! given a name, mask type, and arrays to save into + size_t ReadLocalMasksFromFile(const std::string& dataset_name, + const std::string& mask_type, + GNNRange* mask_range, GNNLabel* masks); + //! 
Read masks of local nodes only for training, validation, and testing + void ReadLocalMasks(const std::string& dataset_name); + + // public: + // + // DGraph* getGraphPointer() { return partitionedGraph; } + // Graph* getLGraphPointer() { return lGraph; } + // Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; + // + // void initializeSyncSubstrate(); + // + // void saveDistGraph(DGraph* a); + // galois::graphs::GluonSubstrate* getSyncSubstrate(); + // float_t* get_feats_ptr() { return h_feats; } + // float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } + // label_t* get_labels_ptr() { return h_labels; } + // label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } + // float_t* get_norm_factors_ptr() { return normFactors.data(); } + // float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } + // + // //! allocate the norm factor vector + // void allocNormFactor(); + // void allocNormFactorSub(int subID); + // //! construct norm factor vector by using data from global graph + // void constructNormFactor(deepgalois::Context* globalContext); + // void constructNormFactorSub(int subgraphID); + // + // void constructSubgraphLabels(size_t m, const mask_t* masks); + // void constructSubgraphFeatures(size_t m, const mask_t* masks); + // + // //! return label for some node + // //! NOTE: this is LID, not GID + // label_t get_label(size_t lid) { return h_labels[lid]; } + // + // //! returns pointer to the features of each local node + // float_t* get_in_ptr(); + // + // //! allocate memory for subgraphs (don't actually build them) + // void allocateSubgraphs(int num_subgraphs, unsigned max_size); + // + // //! return if a vertex is owned by the partitioned graph this context + // contains bool isOwned(unsigned gid); + // //! return if part graph has provided vertex for given gid locally + // bool isLocal(unsigned gid); + // //! get GID of an lid for a vertex + // unsigned getGID(unsigned lid); + // //! get local id of a vertex given a global id for that vertex + // unsigned getLID(unsigned gid); +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp new file mode 100644 index 0000000000..5a39ed4d25 --- /dev/null +++ b/libgnn/src/GNNGraph.cpp @@ -0,0 +1,265 @@ +// XXX include net interface if necessary +#include "galois/graphs/GNNGraph.h" +#include "galois/Logging.h" + +namespace { +//! 
Partitions a particular dataset given some partitioning scheme +std::unique_ptr +LoadPartition(const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme) { + // XXX input path + std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + GALOIS_LOG_VERBOSE("File to read is {}", input_file); + + // load partition + switch (partition_scheme) { + case galois::graphs::GNNPartitionScheme::kOEC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + case galois::graphs::GNNPartitionScheme::kCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + default: + GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); + return nullptr; + } +} + +} // end namespace + +galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, + GNNPartitionScheme partition_scheme, + bool has_single_class_label) { + // save host id + host_id_ = galois::runtime::getSystemNetworkInterface().ID; + // load partition + partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); + + // read additional graph data + ReadLocalLabels(dataset_name, has_single_class_label); + ReadLocalFeatures(dataset_name); + ReadLocalMasks(dataset_name); + + // init gluon from the partitioned graph + sync_substrate_ = + std::make_unique>( + *partitioned_graph_, host_id_, + galois::runtime::getSystemNetworkInterface().Num, false); +} + +void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + std::string filename = + galois::gnn_dataset_path + dataset_name + "-labels.txt"; + // read file header, save num label classes while at it + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + + // allocate memory for labels + if (has_single_class_label) { + // single-class (one-hot) label for each vertex: N x 1 + local_ground_truth_labels_ = + std::make_unique(partitioned_graph_->size()); + } else { + // multi-class label for each vertex: N x num classes + local_ground_truth_labels_ = std::make_unique( + partitioned_graph_->size() * num_label_classes_); + } + + size_t cur_gid = 0; + size_t found_local_vertices = 0; + // each line contains a set of 0s and 1s + std::string read_line; + + // loop through all labels of the graph + while (std::getline(file_stream, read_line)) { + // only process label if this node is local + if (partitioned_graph_->isLocal(cur_gid)) { + uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); + // read line as bitset of 0s and 1s + std::istringstream label_stream(read_line); + unsigned cur_bit; + // bitset size is # of label classes + for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { + // read a bit + label_stream >> cur_bit; + + if (has_single_class_label) { + // in single class, only 1 bit is set in bitset; that represents the + // class to take + if (cur_bit != 0) { + // set class and break (assumption is that's the only bit that is + // set) + local_ground_truth_labels_[cur_lid] = cur_class; + break; + } + } else { + // else the entire bitset needs to be copied over to the label array + // TODO this can possibly be saved all at once rather than bit by bit? 
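+          // (multi-class labels are stored row-major as N x num_label_classes:
+          //  the local vertex id picks the row, the class index picks the column)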
+ local_ground_truth_labels_[cur_lid * num_label_classes_ + cur_class] = + cur_bit; + } + } + found_local_vertices++; + } + // always increment cur_gid + cur_gid++; + } + + file_stream.close(); + + GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); +} + +void galois::graphs::GNNGraph::ReadLocalFeatures( + const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); + + // read in dimensions of features, specifically node feature length + size_t num_vertices; + + std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> num_vertices >> node_feature_length_; + ifs.close(); + + GALOIS_LOG_ASSERT(num_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_vertices, + node_feature_length_); + + // memory for all features of all nodes in graph + // TODO read features without loading entire feature file into memory; this + // is quite inefficient + std::unique_ptr full_feature_set = + std::make_unique(num_vertices * node_feature_length_); + + // read in all features + std::ifstream file_stream; + std::string feature_file = + galois::gnn_dataset_path + dataset_name + "-feats.bin"; + file_stream.open(feature_file, std::ios::binary | std::ios::in); + file_stream.read((char*)full_feature_set.get(), + sizeof(GNNFloat) * num_vertices * node_feature_length_); + file_stream.close(); + + // allocate memory for local features + local_node_features_ = std::make_unique( + partitioned_graph_->size() * node_feature_length_); + + // copy over features for local nodes only + size_t local_vertex = 0; + for (size_t i = 0; i < num_vertices; i++) { + if (partitioned_graph_->isLocal(i)) { + // copy over feature vector + std::copy(full_feature_set.get() + i * node_feature_length_, + full_feature_set.get() + (i + 1) * node_feature_length_, + &local_node_features_[local_vertex * node_feature_length_]); + local_vertex++; + } + } + full_feature_set.reset(); + GALOIS_LOG_ASSERT(local_vertex++ == partitioned_graph_->size()); +} + +//! Helper function to read masks from file into the appropriate structures +//! 
given a name, mask type, and arrays to save into +size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( + const std::string& dataset_name, const std::string& mask_type, + GNNRange* mask_range, GNNLabel* masks) { + size_t range_begin; + size_t range_end; + + // read mask range + std::string mask_filename = + galois::gnn_dataset_path + dataset_name + "-" + mask_type + "_mask.txt"; + std::ifstream mask_stream; + mask_stream.open(mask_filename, std::ios::in); + mask_stream >> range_begin >> range_end >> std::ws; + GALOIS_LOG_ASSERT(range_begin <= range_end); + + // set the range object + mask_range->begin = range_begin; + mask_range->end = range_end; + mask_range->size = range_end - range_begin; + + size_t cur_line_num = 0; + size_t local_sample_count = 0; + std::string line; + // each line is a number signifying if mask is set for the vertex + while (std::getline(mask_stream, line)) { + std::istringstream mask_stream(line); + // only examine vertices/lines in range + if (cur_line_num >= range_begin && cur_line_num < range_end) { + // only bother if node is local + if (partitioned_graph_->isLocal(cur_line_num)) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + masks[partitioned_graph_->getLID(cur_line_num)] = 1; + local_sample_count++; + } + } + } + cur_line_num++; + } + mask_stream.close(); + + return local_sample_count; +} + +void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { + // allocate the memory for the local masks + local_training_mask_ = + std::make_unique(partitioned_graph_->size()); + local_validation_mask_ = + std::make_unique(partitioned_graph_->size()); + local_testing_mask_ = + std::make_unique(partitioned_graph_->size()); + + if (dataset_name == "reddit") { + // TODO reddit is hardcode handled at the moment; better way to not do + // this? 
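+    // Hardcoded reddit split: train = [0, 153431), val = [153431, 177262),
+    // test = [177262, 232965); sizes 153431 / 23831 / 55703 respectively.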
+ global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; + global_validation_mask_range_ = { + .begin = 153431, .end = 153431 + 23831, .size = 23831}; + global_testing_mask_range_ = { + .begin = 177262, .end = 177262 + 55703, .size = 55703}; + + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } else { + // XXX i can get local sample counts from here if i need it + ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, + local_training_mask_.get()); + ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, + local_validation_mask_.get()); + ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, + local_testing_mask_.get()); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt new file mode 100644 index 0000000000..83c6164eac --- /dev/null +++ b/libgnn/test/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(gnngraph-test gnngraph-test.cpp) +target_link_libraries(gnngraph-test galois_gnn) +add_test(NAME gnngraph-test COMMAND gnngraph-test) + +# TODO multi host tests diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp new file mode 100644 index 0000000000..78b6804513 --- /dev/null +++ b/libgnn/test/gnngraph-test.cpp @@ -0,0 +1,33 @@ +//! @file gnngraph-test.cpp +//! Test loads a few graphs. Better if you run with multiple hosts. +//! Doesn't really do much besides that. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + GALOIS_LOG_VERBOSE("reddit with multilabel, oec"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, + false); + GALOIS_LOG_VERBOSE("reddit with single label, oec"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, + true); + GALOIS_LOG_VERBOSE("reddit with multilabel, cvc"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, + false); + GALOIS_LOG_VERBOSE("reddit with single label, cvc"); + galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, + true); + + // TODO fix citeseer and goec + // galois::graphs::GNNGraph("citeseer", + // galois::graphs::GNNPartitionScheme::kOEC, false); +} From 964bc732b561bb364a6cfb4002e42e4a15e373ab Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 28 Sep 2020 17:54:18 -0500 Subject: [PATCH 340/660] GraphConvolutionalLayer Adds an initial graph convolution layer implementation with a few optimizations still pending. Also adds various things required to make this work, including calls into a matrix multiply library, per thread RNG, etc. 
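For context, the "update" half of the layer boils down to a single dense row-major
SGEMM of the aggregated features against the layer's weight matrix, and the weight
gradient computation in the backward pass reuses the same call with the first
operand transposed. A minimal sketch of the update step, assuming an OpenBLAS-style
cblas interface (the function and variable names below are illustrative only, not
the patch's actual symbols):

    #include <cblas.h>

    // out (rows x out_cols) = aggregated (rows x in_cols) * weights (in_cols x out_cols)
    void UpdateEmbeddingsSketch(int rows, int in_cols, int out_cols,
                                const float* aggregated, const float* weights,
                                float* out) {
      cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, rows, out_cols,
                  in_cols, 1.0f, aggregated, in_cols, weights, out_cols, 0.0f,
                  out, out_cols);
    }
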
Also adds a test to make sure the conv layer by itself is sane + runs as expected/without crashing. Note this commit will not build because it does not include graph structure changes. Those will come in the next commit (in the spirit of keeping commits separate). --- libgnn/CMakeLists.txt | 3 + libgnn/include/galois/GNNMath.h | 18 ++ libgnn/include/galois/GNNTypes.h | 1 + libgnn/include/galois/PerThreadRNG.h | 32 +++ libgnn/include/galois/layers/GNNLayer.h | 133 +++++++++ .../galois/layers/GraphConvolutionalLayer.h | 58 ++++ libgnn/src/GNNLayer.cpp | 100 +++++++ libgnn/src/GNNMath.cpp | 42 +++ libgnn/src/GraphConvolutionalLayer.cpp | 164 +++++++++++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/convlayer-test.cpp | 258 ++++++++++++++++++ libgnn/test/gnngraph-test.cpp | 1 + 12 files changed, 814 insertions(+) create mode 100644 libgnn/include/galois/GNNMath.h create mode 100644 libgnn/include/galois/PerThreadRNG.h create mode 100644 libgnn/include/galois/layers/GNNLayer.h create mode 100644 libgnn/include/galois/layers/GraphConvolutionalLayer.h create mode 100644 libgnn/src/GNNLayer.cpp create mode 100644 libgnn/src/GNNMath.cpp create mode 100644 libgnn/src/GraphConvolutionalLayer.cpp create mode 100644 libgnn/test/convlayer-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index a44b94f427..28e8dc8630 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -18,6 +18,9 @@ link_directories(${BLAS_LIB_DIR}) set(sources src/GNNGraph.cpp + src/GNNLayer.cpp + src/GNNMath.cpp + src/GraphConvolutionalLayer.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h new file mode 100644 index 0000000000..755d281752 --- /dev/null +++ b/libgnn/include/galois/GNNMath.h @@ -0,0 +1,18 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include + +namespace galois { + +//! Given 2 float array pointers, do element wise addition of length elements +//! Can be called in parallel sections as its sigle threaded code +void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + GNNFloat* output); + +//! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output); + +} // namespace galois diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index aaabaa15e0..56eed101f8 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -3,6 +3,7 @@ //! Typedefs used by the Galois GNN code #include +#include namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h new file mode 100644 index 0000000000..80f8d11f0a --- /dev/null +++ b/libgnn/include/galois/PerThreadRNG.h @@ -0,0 +1,32 @@ +#pragma once +#include +#include "galois/substrate/PerThreadStorage.h" +#include "galois/GNNTypes.h" + +namespace galois { + +//! Per thread RNG object for generating numbers in parallel +class PerThreadRNG { +public: + //! Default seed 0, default distribution 0 to 1 + PerThreadRNG() : distribution_{0.0, 1.0} {}; + //! User specified range + PerThreadRNG(float begin, float end) : distribution_{begin, end} {}; + //! 
Returns a random number between numbers specified during init + GNNFloat GetRandomNumber() { + return (*distribution_.getLocal())(*engine_.getLocal()); + } + //! Return true or false based on some dropout rate + bool DoBernoulli(float dropout_rate) { + return (GetRandomNumber() > dropout_rate) ? 1 : 0; + } + +private: + //! Per thread generator of random + galois::substrate::PerThreadStorage engine_; + //! Per thread distribution of random + galois::substrate::PerThreadStorage> + distribution_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h new file mode 100644 index 0000000000..7df88d2ce7 --- /dev/null +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -0,0 +1,133 @@ +#pragma once + +#include "galois/PerThreadRNG.h" +#include "galois/graphs/GNNGraph.h" + +namespace galois { + +//! Struct holding the dimensions of a layer. Assumption is that a layer takes +//! a matrix and outputs another matrix with a different # of columns (e.g. +//! matrix multiply with a set of weights) +struct GNNLayerDimensions { + //! Number of rows in input and output of this layer + size_t input_rows; + //! Number of columns in input of this layer + size_t input_columns; + //! Number of columns output of this layer + size_t output_columns; +}; + +//! Config options for operations that can occur in a layer +struct GNNConfig { + //! True if dropout is to be done at beginning of forward phase + bool do_dropout{false}; + //! Rate at which to drop things if dropout is on + float dropout_rate{0.5}; + //! True if some activation function is to be called done at end of forward + //! phase + bool do_activation{false}; + //! True if normalization is to occur during multiplies + bool do_normalization{false}; + // TODO activation type; for now default is softmax +}; + +// Tried to avoid inheritance, but keeping track of heterogeneous layers +// becomes a mess if there isn't a base class I can create the container on. +//! Base class for layers in a graph neural network +class GNNLayer { +public: + GNNLayer() = delete; + //! Creation of a layer needs the # of the layer, the graph to train on, and + //! the input/output dimensions of the MxM that occurs in the layer; config + //! as well + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNConfig& config); + + //! Uses a default config + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + + //! Initializes all layer weights to 1. This is used as a debug function for + //! testing. + void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } + + //! Conducts the forward phase given the input to this layer which + //! ultimately leads to an output (classfication of node labels) at the end + //! of the GNN. + //! @returns Output of the forward phase (i.e. input to next layer) + virtual const std::vector& + ForwardPhase(const std::vector& input_embeddings) = 0; + //! Conducts the backward phase given the input to this layer; the backward + //! phase calculates the gradients to update the weights of trainable + //! parts of the layer (e.g., weights, trainable params for aggregate, etc.). + //! @param prev_layer_input The input that was given to this layer in the + //! forward phase + //! @param input_gradient gradient from the backward phase layer before this + //! 
one; takes a pointer to save space by writing intermediate results to it + //! @returns Output of the backward phase (i.e. input to previous layer); note + //! it's a pointer because layer can mess with it + virtual std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) = 0; + + const std::vector& GetLayerWeightGradients() { + return layer_weight_gradients_; + } + + //! Returns dimensions of this layer + // XXX may not be needed + const GNNLayerDimensions& GetLayerDimensions() { return layer_dimensions_; } + +protected: + //! Layer order (starts from 0); used in backward to shortcut output as layer + //! 0 does not need to do some things that other layers need to do + // XXX be more specific + size_t layer_number_; + //! Pointer to the graph being trained by this layer. + //! This is owned by the creator of this layer, so no need to free it when + //! this layer is destroyed. + const galois::graphs::GNNGraph& graph_; + //! Dimensions (input/output sizes) of this layer + GNNLayerDimensions layer_dimensions_; + //! Config object for certain parameters for layer + GNNConfig config_; + //! Weights used by this layer. Dimensions: input columns by output columns + std::vector layer_weights_; + //! Gradients used to update the weights of this layer + std::vector layer_weight_gradients_; + // There is a forward and a backward as their sizes will differ and we only + // want to allocate memory once to avoid runtime memory allocation. + //! The output of the forward phase for this layer. + std::vector forward_output_matrix_; + //! The output of the backward phase for this layer. + std::vector backward_output_matrix_; + //! RNG for matrix initialization + PerThreadRNG random_init_rng_{-5.0, 5.0}; + //! RNG for dropout + PerThreadRNG dropout_rng_; + //! Indicates which fields of the weight matrix are dropped if dropout is + //! used + std::vector dropout_mask_; + + ////////////////////////////////////////////////////////////////////////////// + + //! Randomly init a float vector using the class's random init RNG + void RandomInitVector(std::vector* vector_to_init); + + //! Choose a set of weights from this layer's weights to keep and save to + //! the output matrix + apply some scaling to the kept weights based on + //! dropout rate + void DoDropout(std::vector* output_matrix); + //! Apply the derivative of dropout to the backward phase output + void DoDropoutDerivative(); + + //! Does some activation function based on configuration on forward output + //! matrix + void Activation(); + //! Calculate derivative of activation function based on config on the matrix + // XXX + void ActivationDerivative(std::vector* matrix); +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h new file mode 100644 index 0000000000..6a99682b8a --- /dev/null +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -0,0 +1,58 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +class GraphConvolutionalLayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! 
weight matrix + GraphConvolutionalLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNConfig& config); + + GraphConvolutionalLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GraphConvolutionalLayer(layer_num, graph, dimensions, GNNConfig()) {} + + // Parent functions + const std::vector& + ForwardPhase(const std::vector& input_embeddings) final; + std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. + galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! Performs aggregation for all nodes of the graph given the length of the + //! vector to aggregate, the features themselves, an output array, and per + //! thread storage for the intermediate scaling via norm factor + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! 
Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); +}; + +} // namespace galois diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp new file mode 100644 index 0000000000..0f4eaeb36b --- /dev/null +++ b/libgnn/src/GNNLayer.cpp @@ -0,0 +1,100 @@ +#include "galois/layers/GNNLayer.h" + +galois::GNNLayer::GNNLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNConfig& config) + : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), + config_(config) { + // TODO some of this does not need alloc if not used + // dropout allocation; dropout is same as input + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_.resize(num_weight_elements); + layer_weight_gradients_.resize(num_weight_elements, 0); + // init weights randomly with a parallel loop + RandomInitVector(&layer_weights_); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + forward_output_matrix_.resize(num_output_elements, 0); + backward_output_matrix_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); +} + +void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { + galois::do_all( + galois::iterate(static_cast(0), vector_to_init->size()), + [&](size_t i) { + // pull from the class's per thread RNG + (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); + }, + galois::loopname("RandomInitVector")); +} + +void galois::GNNLayer::DoDropout(std::vector* output_matrix) { + // XXX fix droptout, should use inputs not weights + size_t num_weights = layer_weights_.size(); + // determine which weights to drop + galois::do_all( + galois::iterate(static_cast(0), num_weights), + [&](size_t i) { + dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); + }, + galois::loopname("LayerDropoutRNG")); + + // create new matrix with non-dropped weights + some scaling + // TODO scaling? + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_weights), + [&](size_t i) { + (*output_matrix)[i] = + layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropout")); +} + +void galois::GNNLayer::DoDropoutDerivative() { + GNNFloat scale = 1. / (1. 
- config_.dropout_rate); + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), backward_output_matrix_.size()), + [&](size_t i) { + backward_output_matrix_[i] = backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("LayerDropoutDerivative")); +} + +void galois::GNNLayer::Activation() { + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + galois::do_all( + galois::iterate(static_cast(0), forward_output_matrix_.size()), + [&](size_t i) { + forward_output_matrix_[i] = + std::max(forward_output_matrix_.at(i), static_cast(0)); + }, + galois::loopname("ReLU")); +} + +void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + // XXX + // keep gradient if the original output is greater than 0 + galois::do_all( + galois::iterate(static_cast(0), gradient->size()), + [&](size_t i) { + (*gradient)[i] = + (forward_output_matrix_.at(i) > static_cast(0)) + ? (*gradient)[i] + : static_cast(0); + }, + galois::loopname("ReLU-Derivative")); +} diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp new file mode 100644 index 0000000000..5ba0fdcf64 --- /dev/null +++ b/libgnn/src/GNNMath.cpp @@ -0,0 +1,42 @@ +#include +#include "galois/GNNMath.h" + +void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { +#ifdef __AVX2__ + constexpr size_t vectorization_length = + 8; // for 32-bit floating point in AVX2; TODO AVX512 + // can only do up to a particular multiple due to alignment + const size_t aligned_end = length - length % vectorization_length; + // do add via vector ops + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm256_storeu_ps(&output[i], _mm256_add_ps(_mm256_loadu_ps(&a[i]), + _mm256_loadu_ps(&b[i]))); + } + + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i]; + } +#else + // no vector -> trivial loop add + for (size_t i = 0; i < length; ++i) { + output[i] = a[i] + b[i]; + } +#endif +} + +void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? 
output_columns : input_columns; + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, + output_columns); +} diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp new file mode 100644 index 0000000000..bb00b83d61 --- /dev/null +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -0,0 +1,164 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +galois::GraphConvolutionalLayer::GraphConvolutionalLayer( + size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNConfig& config) + : galois::GNNLayer::GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + // TODO temp2 does not need to be initialized in all circumstances + in_temp_2_.resize(num_input_elements, 0); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + out_temp_.resize(num_output_elements, 0); +} + +const std::vector& +galois::GraphConvolutionalLayer::ForwardPhase( + const std::vector& input_embeddings) { + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(in_temp_1_.size() == input_embeddings.size()); + assert(in_temp_2_.size() == input_embeddings.size()); + assert(forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + // TODO only dropout if in training apparently + if (config_.do_dropout) { + GALOIS_LOG_VERBOSE("Doing dropout"); + DoDropout(&in_temp_1_); + input_data = in_temp_1_.data(); + } + + GALOIS_LOG_VERBOSE("Doing aggregate"); + // aggregation and update (or vice versa) + AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), + &input_column_intermediates_); + GALOIS_LOG_VERBOSE("Doing embedding update"); + // TODO synchronization of aggregation functions + UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + + // TODO if input columns > output columns do update first then aggregate for + // efficiency + + if (config_.do_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return forward_output_matrix_; +} + +std::vector* galois::GraphConvolutionalLayer::BackwardPhase( + const std::vector& prev_layer_input, + std::vector* input_gradient) { + // derivative of activation + if (config_.do_activation) { + ActivationDerivative(input_gradient); + } + + // derivative of aggregation/update + // TODO do optimized cased like the forward + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), + backward_output_matrix_.data(), &input_column_intermediates_); + } + // TODO sync agg/update + + // weight gradient 
calculation + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), layer_weight_gradients_.data()); + // TODO sync weights + + if (config_.do_dropout) { + DoDropoutDerivative(); + } + + return &backward_output_matrix_; +} + +void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + size_t num_nodes = graph_.size(); + // galois::gPrint(pts, "\n"); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + // TODO can init to self as well + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + GNNFloat source_norm = 0.0; + if (config_.do_normalization) { + source_norm = graph_.NormFactor(src); + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + size_t dst = graph_.EdgeDestination(e); + size_t index_to_dst_feature = dst * column_length; + + if (config_.do_normalization) { + GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); + // scale the value on the destination by the combined norm term + assert(pts->getLocal()->size() == column_length); + GNNFloat* intermediate = pts->getLocal()->data(); + for (size_t i = 0; i < column_length; i++) { + intermediate[i] = + norm_scale * node_embeddings[index_to_dst_feature + i]; + } + // add intermediate instead of original feature + galois::VectorAdd( + column_length, &aggregate_output[index_to_src_feature], + intermediate, &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::steal(), galois::loopname("ConvolutionalAggregateAll")); +} + +void galois::GraphConvolutionalLayer::UpdateEmbeddings( + const GNNFloat* node_embeddings, GNNFloat* output) { + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +} + +void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 83c6164eac..9469f86aba 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,4 +2,8 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) +add_executable(convlayer-test convlayer-test.cpp) +target_link_libraries(convlayer-test galois_gnn) +add_test(NAME convlayer-test COMMAND convlayer-test) + # TODO multi host tests diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp new file mode 100644 index 0000000000..2d98875a0d --- 
/dev/null +++ b/libgnn/test/convlayer-test.cpp @@ -0,0 +1,258 @@ +//! @file convlayer-test.cpp +//! Conv layer test with a test graph + +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + // size_t num_threads = galois::setActiveThreads( + // 56 / galois::runtime::getSystemNetworkInterface().Num); + size_t num_threads = galois::setActiveThreads(1); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector feats = test_graph.GetLocalFeatures(); + ////////////////////////////////////////////////////////////////////////////// + // doubles as a test for reading as well + GALOIS_LOG_ASSERT(21 == feats.size()); + GALOIS_LOG_ASSERT(0.0 == feats[0]); + GALOIS_LOG_ASSERT(0.0 == feats[1]); + GALOIS_LOG_ASSERT(0.0 == feats[2]); + GALOIS_LOG_ASSERT(1.0 == feats[3]); + GALOIS_LOG_ASSERT(1.0 == feats[4]); + GALOIS_LOG_ASSERT(1.0 == feats[5]); + GALOIS_LOG_ASSERT(2.0 == feats[6]); + GALOIS_LOG_ASSERT(2.0 == feats[7]); + GALOIS_LOG_ASSERT(2.0 == feats[8]); + GALOIS_LOG_ASSERT(3.0 == feats[9]); + GALOIS_LOG_ASSERT(3.0 == feats[10]); + GALOIS_LOG_ASSERT(3.0 == feats[11]); + GALOIS_LOG_ASSERT(4.0 == feats[12]); + GALOIS_LOG_ASSERT(4.0 == feats[13]); + GALOIS_LOG_ASSERT(4.0 == feats[14]); + GALOIS_LOG_ASSERT(5.0 == feats[15]); + GALOIS_LOG_ASSERT(5.0 == feats[16]); + GALOIS_LOG_ASSERT(5.0 == feats[17]); + GALOIS_LOG_ASSERT(6.0 == feats[18]); + GALOIS_LOG_ASSERT(6.0 == feats[19]); + GALOIS_LOG_ASSERT(6.0 == feats[20]); + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerDimensions dimension_0{ + .input_rows = 7, .input_columns = 3, .output_columns = 2}; + + // create the layer, no norm factor + // note layer number is 1 so that it does something in backward phase + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, + dimension_0); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones(14, 1); + + // backward pass checking + // layer 0 means that an empty weight matrix is 
returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + + const std::vector layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + 
dummy_ones.assign(14, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + + const std::vector layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNConfig config = { + .do_dropout = true, .do_activation = true, .do_normalization = true}; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(1, test_graph, + dimension_0, config); + const std::vector l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + std::vector* l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo->size() == 21); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); + 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 78b6804513..7db24081f5 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -30,4 +30,5 @@ int main() { // TODO fix citeseer and goec // galois::graphs::GNNGraph("citeseer", // galois::graphs::GNNPartitionScheme::kOEC, false); + return 0; } From 90a1447763d0aa4bde0a9b14a25579aa5f1ce9e6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 28 Sep 2020 18:00:10 -0500 Subject: [PATCH 341/660] GNNGraph graph accessors, whole graph loading Adds a few access functions to the GNN Graph such as edge iterators, array accessors, etc. Also adds whole graph loading for the purposes of norm calculation (and other things that may require the entire graph down the line). Also hardcodes the tester graph's test/train/val boundaries because the partitioner requires it. --- libcusp/include/galois/graphs/NewGeneric.h | 8 +- libgnn/include/galois/graphs/GNNGraph.h | 69 ++++++++++++++---- libgnn/src/GNNGraph.cpp | 85 +++++++++++++++++----- 3 files changed, 129 insertions(+), 33 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index a5eb13ee5b..3af95db9dd 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -76,7 +76,7 @@ class NewDistGraphGeneric : public DistGraph { uint32_t nodesToReceive; std::vector getGNNBreakpoints(std::string filename) { - // contains 2 numbers: begin and end of test + // contains 2 numbers: begin and end of train // everything else can be split evenly among hosts as they are not // performance critical std::vector bps; @@ -91,6 +91,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("ppi") != std::string::npos) { bps.push_back(0); bps.push_back(9716); + } else if (filename.find("tester") != std::string::npos) { + bps.push_back(0); + bps.push_back(5); + } else { + GALOIS_DIE("invalid input for gnn partitioning ", filename, + " hardcode needed"); } // TODO hardcode the rest diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 8bba9609fc..81d94f1948 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -27,13 +27,45 @@ enum class GNNPartitionScheme { kOEC, kCVC }; //! XXX class GNNGraph { public: - // using LocalGraphType = LearningGraph; using GNNDistGraph = galois::graphs::DistGraph; + using WholeGraph = galois::graphs::LC_CSR_Graph; + using GraphNode = GNNDistGraph::GraphNode; + using EdgeIterator = GNNDistGraph::edge_iterator; //! Loads a graph and all relevant metadata (labels, features, masks, etc.) 
GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Return # of nodes in the partitioned graph + size_t size() const { return partitioned_graph_->size(); } + + // All following functions take a local id + EdgeIterator EdgeBegin(GraphNode n) const { + return partitioned_graph_->edge_begin(n); + }; + EdgeIterator EdgeEnd(GraphNode n) const { + return partitioned_graph_->edge_end(n); + }; + GraphNode EdgeDestination(EdgeIterator ei) const { + return partitioned_graph_->getEdgeDst(ei); + }; + GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + + const std::vector& GetLocalFeatures() const { + return local_node_features_; + } + + //! Returns a pointer to the CSR indices where the first element starts at + //! 0 (used with MKL) + const uint32_t* GetZeroBasedRowPointer() { + return zero_start_graph_indices_.data(); + } + + //! Return pointer to all edge destinations; used with MKL + const uint32_t* GetEdgeDestPointer() { + return partitioned_graph_->edge_dst_ptr(); + } + private: //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on @@ -44,6 +76,12 @@ class GNNGraph { size_t node_feature_length_{0}; //! Partitioned graph std::unique_ptr partitioned_graph_; + //! The entire topology of the dataset: used for things like norm factor + //! calculation or sampling + WholeGraph whole_graph_; + //! The indices pointer from the partitioned graph except with a 0 + //! prepended to it; needed for MKL calls + std::vector zero_start_graph_indices_; // XXX is this necessary //! Copy of underlying topology of the distributed graph // std::unique_ptr local_graph_; @@ -51,17 +89,17 @@ class GNNGraph { std::unique_ptr> sync_substrate_; //! Ground truth label for nodes in the partitioned graph; Nx1 if single //! class, N x num classes if multi-class label - std::unique_ptr local_ground_truth_labels_; + std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph - std::unique_ptr local_node_features_; + std::vector local_node_features_; // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::unique_ptr local_training_mask_; + std::vector local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::unique_ptr local_validation_mask_; + std::vector local_validation_mask_; //! Bitset indicating which nodes are testing nodes - std::unique_ptr local_testing_mask_; + std::vector local_testing_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -73,8 +111,7 @@ class GNNGraph { //! in this class GNNRange global_testing_mask_range_; - // XXX figure out what this is really used for - //! Normalization constant based on structure of the graph + //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; // TODO vars for subgraphs as necessary @@ -91,15 +128,17 @@ class GNNGraph { GNNRange* mask_range, GNNLabel* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); + //! Init the node start indices that have a 0 at the beginning; straight + //! copy of the array from the partitioned graph save for the 0 at the + //! first element. + void InitZeroStartGraphIndices(); + //! Reads the entire graph topology in (but nothing else) + void ReadWholeGraph(const std::string& dataset_name); + //! 
Initializes the norm factors using the entire graph's topology for global + //! degree access + void InitNormFactor(); // public: - // - // DGraph* getGraphPointer() { return partitionedGraph; } - // Graph* getLGraphPointer() { return lGraph; } - // Graph* getSubgraphPointer(int id) { return partitionedSubgraphs[id]; }; - // - // void initializeSyncSubstrate(); - // // void saveDistGraph(DGraph* a); // galois::graphs::GluonSubstrate* getSyncSubstrate(); // float_t* get_feats_ptr() { return h_feats; } diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 5a39ed4d25..78ff5d828c 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -1,6 +1,7 @@ // XXX include net interface if necessary -#include "galois/graphs/GNNGraph.h" #include "galois/Logging.h" +#include "galois/graphs/ReadGraph.h" +#include "galois/graphs/GNNGraph.h" namespace { //! Partitions a particular dataset given some partitioning scheme @@ -9,7 +10,7 @@ LoadPartition(const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("File to read is {}", input_file); + GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition switch (partition_scheme) { @@ -30,6 +31,8 @@ LoadPartition(const std::string& dataset_name, galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, + dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; // load partition @@ -45,6 +48,13 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, std::make_unique>( *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false); + + // create the 0 based row indices for MKL use + InitZeroStartGraphIndices(); + // read in entire graph topology + ReadWholeGraph(dataset_name); + // init norm factors using the whole graph topology + InitNormFactor(); } void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, @@ -62,12 +72,11 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, // allocate memory for labels if (has_single_class_label) { // single-class (one-hot) label for each vertex: N x 1 - local_ground_truth_labels_ = - std::make_unique(partitioned_graph_->size()); + local_ground_truth_labels_.resize(partitioned_graph_->size()); } else { // multi-class label for each vertex: N x num classes - local_ground_truth_labels_ = std::make_unique( - partitioned_graph_->size() * num_label_classes_); + local_ground_truth_labels_.resize(partitioned_graph_->size() * + num_label_classes_); } size_t cur_gid = 0; @@ -148,8 +157,8 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( file_stream.close(); // allocate memory for local features - local_node_features_ = std::make_unique( - partitioned_graph_->size() * node_feature_length_); + local_node_features_.resize(partitioned_graph_->size() * + node_feature_length_); // copy over features for local nodes only size_t local_vertex = 0; @@ -214,12 +223,9 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // allocate the memory for the local masks - local_training_mask_ = - std::make_unique(partitioned_graph_->size()); - local_validation_mask_ = - 
std::make_unique(partitioned_graph_->size()); - local_testing_mask_ = - std::make_unique(partitioned_graph_->size()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); if (dataset_name == "reddit") { // TODO reddit is hardcode handled at the moment; better way to not do @@ -256,10 +262,55 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } else { // XXX i can get local sample counts from here if i need it ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.get()); + local_training_mask_.data()); ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, - local_validation_mask_.get()); + local_validation_mask_.data()); ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.get()); + local_testing_mask_.data()); } } + +void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { + GALOIS_LOG_VERBOSE("[{}] Initializing node indices with 0 prepended", + host_id_); + // size is num nodes + 1 + zero_start_graph_indices_.resize(partitioned_graph_->size() + 1); + // first element is zero + zero_start_graph_indices_[0] = 0; + // the rest is a straight copy from partitioned graph (use edge_end to access + // it) + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t i) { + zero_start_graph_indices_[i + 1] = *(partitioned_graph_->edge_end(i)); + }, + galois::loopname("InitZeroStartGraphIndices")); +} + +void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { + std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, + input_file); + galois::graphs::readGraph(whole_graph_, input_file); +} + +void galois::graphs::GNNGraph::InitNormFactor() { + GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); + norm_factors_.resize(partitioned_graph_->size(), 0.0); + + // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t local_id) { + // translate lid into gid to get global degree + size_t global_id = partitioned_graph_->getGID(local_id); + size_t global_degree = whole_graph_.edge_end(global_id) - + whole_graph_.edge_begin(global_id); + // only set if non-zero + if (global_degree != 0) { + norm_factors_[local_id] = + 1.0 / std::sqrt(static_cast(global_degree)); + } + }, + galois::loopname("InitNormFactor")); +} From f6d097d301d968f6c40aa52f9c958fbbf851d36f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:32:55 -0500 Subject: [PATCH 342/660] Layer phases + graph accessors Added more accessors to the graph like node iterators and LID-GID functions. More importantly, added the concept of a layer phase (test, validate, train) which causes training/masking to occur differently. Added a function to the graph object to check if an LID is part of a particular group of nodes to train as well. 
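
As a usage sketch only (not part of this patch; gnn_graph and the loop are
stand-ins), the intent is that a layer restricts its per-node work to the
nodes that are valid for its current phase:

    // assumes gnn_graph is a constructed galois::graphs::GNNGraph
    galois::GNNPhase phase = galois::GNNPhase::kTrain;
    for (unsigned lid = 0; lid < gnn_graph.size(); ++lid) {
      if (!gnn_graph.IsValidForPhase(lid, phase)) {
        continue; // node is masked out for this phase
      }
      // per-node work (loss, accuracy accumulation, etc.) goes here
    }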
--- libgnn/include/galois/GNNTypes.h | 4 +++ libgnn/include/galois/graphs/GNNGraph.h | 34 ++++++++++++++++++++---- libgnn/src/GNNGraph.cpp | 35 +++++++++++++++++++++++++ 3 files changed, 68 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 56eed101f8..a04fa14687 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -13,4 +13,8 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; + +//! Phase of GNN computation +enum class GNNPhase { kTrain, kValidate, kTest }; + } // end namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 81d94f1948..cefc505992 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -30,6 +30,8 @@ class GNNGraph { using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; + // defined as such because dist graph range objects used long unsigned + using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; //! Loads a graph and all relevant metadata (labels, features, masks, etc.) @@ -39,6 +41,29 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + //! Node begin for all local nodes + NodeIterator begin() const { + return partitioned_graph_->allNodesRange().begin(); + } + //! Node end for all local nodes + NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } + //! Return GID of some local node + size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } + //! Given an LID and the current phase of GNN computation, determine if the + //! lid in question is valid for the current phase (i.e., it is part of + //! a training, validation, or test phase mask) + bool IsValidForPhase(const unsigned lid, + const galois::GNNPhase current_phase) const; + //! Returns the label of some local id assuming labels are single class + //! labels. + GNNFloat GetSingleClassLabel(const unsigned lid) const { + assert(using_single_class_labels_); + return local_ground_truth_labels_[lid]; + } + + //! Return the number of label classes + size_t GetNumLabelClasses() const { return num_label_classes_; }; + // All following functions take a local id EdgeIterator EdgeBegin(GraphNode n) const { return partitioned_graph_->edge_begin(n); @@ -57,12 +82,12 @@ class GNNGraph { //! Returns a pointer to the CSR indices where the first element starts at //! 0 (used with MKL) - const uint32_t* GetZeroBasedRowPointer() { + const uint32_t* GetZeroBasedRowPointer() const { return zero_start_graph_indices_.data(); } //! Return pointer to all edge destinations; used with MKL - const uint32_t* GetEdgeDestPointer() { + const uint32_t* GetEdgeDestPointer() const { return partitioned_graph_->edge_dst_ptr(); } @@ -82,11 +107,10 @@ class GNNGraph { //! The indices pointer from the partitioned graph except with a 0 //! prepended to it; needed for MKL calls std::vector zero_start_graph_indices_; - // XXX is this necessary - //! Copy of underlying topology of the distributed graph - // std::unique_ptr local_graph_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; + //! True if labels are single class + bool using_single_class_labels_; //! 
Ground truth label for nodes in the partitioned graph; Nx1 if single //! class, N x num classes if multi-class label std::vector local_ground_truth_labels_; diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 78ff5d828c..38a78d68dc 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -57,6 +57,39 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, InitNormFactor(); } +bool galois::graphs::GNNGraph::IsValidForPhase( + const unsigned lid, const galois::GNNPhase current_phase) const { + // convert to gid first + size_t gid = partitioned_graph_->getGID(lid); + + // select range to use based on phase + const GNNRange* range_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + range_to_use = &global_training_mask_range_; + break; + case GNNPhase::kValidate: + range_to_use = &global_validation_mask_range_; + break; + case GNNPhase::kTest: + range_to_use = &global_testing_mask_range_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + range_to_use = nullptr; + } + + // if within range, it is valid + // TODO there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is safer + // (but less cache efficient) + if (range_to_use->begin <= gid && gid < range_to_use->end) { + return true; + } else { + return false; + } +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -72,9 +105,11 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, // allocate memory for labels if (has_single_class_label) { // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; local_ground_truth_labels_.resize(partitioned_graph_->size()); } else { // multi-class label for each vertex: N x num classes + using_single_class_labels_ = false; local_ground_truth_labels_.resize(partitioned_graph_->size() * num_label_classes_); } From 5e5214a7adf750babade5d5f78b3c832cfa1669f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:37:01 -0500 Subject: [PATCH 343/660] GNNLayers: layer phases and allocation disabling Added the concept of layer phases to the GNN layer base class and added a config option to disable allocation of weights (e.g., output layers do not need weights to be allocated). --- libgnn/include/galois/layers/GNNLayer.h | 7 +++++++ libgnn/src/GNNLayer.cpp | 24 +++++++++++++----------- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 7df88d2ce7..e5d83678f1 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -19,6 +19,8 @@ struct GNNLayerDimensions { //! Config options for operations that can occur in a layer struct GNNConfig { + //! True if weights should be allocated + bool allocate_weights{true}; //! True if dropout is to be done at beginning of forward phase bool do_dropout{false}; //! Rate at which to drop things if dropout is on @@ -48,6 +50,9 @@ class GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + //! Changes this layer's phase + void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } + //! Initializes all layer weights to 1. This is used as a debug function for //! testing. 
void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } @@ -109,6 +114,8 @@ class GNNLayer { //! Indicates which fields of the weight matrix are dropped if dropout is //! used std::vector dropout_mask_; + //! Phase of GNN computation that this layer is currently in + galois::GNNPhase layer_phase_{galois::GNNPhase::kTrain}; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 0f4eaeb36b..d14a5d1b05 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -6,17 +6,19 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const GNNConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { - // TODO some of this does not need alloc if not used - // dropout allocation; dropout is same as input - dropout_mask_.resize(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); - // allocate memory based on layer dimensions - size_t num_weight_elements = - layer_dimensions_.input_columns * layer_dimensions_.output_columns; - layer_weights_.resize(num_weight_elements); - layer_weight_gradients_.resize(num_weight_elements, 0); - // init weights randomly with a parallel loop - RandomInitVector(&layer_weights_); + if (config_.allocate_weights) { + // TODO some of this does not need alloc if not used + // dropout allocation; dropout is same as input + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_.resize(num_weight_elements); + layer_weight_gradients_.resize(num_weight_elements, 0); + // init weights randomly with a parallel loop + RandomInitVector(&layer_weights_); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; From 8d19a26de4cbd4f16b4b3d4c3e9e617ed306da92 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:38:33 -0500 Subject: [PATCH 344/660] GNNMath: softmax, cross entropy, max selector Added softmax, cross entropy (and their deriviatives) for use in output layers. Also added a utility function that selects the max element in a list of elements. --- libgnn/include/galois/GNNMath.h | 22 +++++++++ libgnn/src/GNNMath.cpp | 79 +++++++++++++++++++++++++++++++++ 2 files changed, 101 insertions(+) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 755d281752..2cf913d5de 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -5,11 +5,33 @@ namespace galois { +//! Find max index in a vector of some length +size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Given 2 float array pointers, do element wise addition of length elements //! Can be called in parallel sections as its sigle threaded code void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Does a softmax operation on the input vector and saves result to output +//! vector; single threaded so it can be called in a parallel section +void GNNSoftmax(const size_t vector_length, const GNNFloat* input, + GNNFloat* output); +//! Get derivative of softmax given the forward pass's input, the derivative +//! from loss calculation, and a temp vector to store intermediate results. +//! Everything is the same size. 
+void GNNSoftmaxDerivative(const size_t vector_length, + const GNNFloat* prev_output, + const GNNFloat* prev_output_derivative, + GNNFloat* temp_vector, GNNFloat* output); +//! Performs cross entropy given a ground truth and input and returns the loss +//! value. +galois::GNNFloat GNNCrossEntropy(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input); +//! Derivative of cross entropy; gradients saved into an output vector. +void GNNCrossEntropyDerivative(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input, GNNFloat* gradients); //! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 5ba0fdcf64..303e872e2a 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -1,5 +1,22 @@ +#include +#include #include #include "galois/GNNMath.h" +#include "galois/Logging.h" + +size_t galois::MaxIndex(const size_t length, const GNNFloat* vector) { + size_t index = 0; + GNNFloat cur_max = vector[0]; + + for (size_t i = 1; i < length; i++) { + if (vector[i] > cur_max) { + index = i; + cur_max = vector[i]; + } + } + + return index; +} void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { @@ -26,6 +43,68 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, #endif } +void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, + GNNFloat* output) { + const GNNFloat max_element = + *(std::max_element(input, input + vector_length)); + GNNFloat denominator = 0; + // normalize all elements using exponentional of max element + for (size_t i = 0; i < vector_length; i++) { + output[i] = std::exp(input[i] - max_element); + denominator += output[i]; + } + // divide all by total to get a distribution + for (size_t i = 0; i < vector_length; i++) { + output[i] /= denominator; + } +} + +void galois::GNNSoftmaxDerivative(const size_t vector_length, + const GNNFloat* prev_output, + const GNNFloat* prev_output_derivative, + GNNFloat* temp_vector, GNNFloat* output) { + for (size_t i = 0; i < vector_length; i++) { + for (size_t j = 0; j < vector_length; j++) { + temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) + : -prev_output[j] * prev_output[i]; + } + // TODO is sdot using threads? 
if so this is a nested parallelism problem + output[i] = + cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); + } +} + +galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input) { + GNNFloat loss = 0.0; + + for (size_t i = 0; i < vector_length; i++) { + if (ground_truth[i] == 0.0) { + continue; + } + + GALOIS_LOG_VERBOSE("Truth {} input {}", ground_truth[i], input[i]); + + if (input[i] == 0.0) { + loss -= ground_truth[i] * std::log(static_cast(1e-10)); + } else { + loss -= ground_truth[i] * std::log(input[i]); + } + } + + return loss; +} + +void galois::GNNCrossEntropyDerivative(const size_t vector_length, + const GNNFloat* ground_truth, + const GNNFloat* input, + GNNFloat* gradients) { + for (size_t i = 0; i < vector_length; i++) { + gradients[i] = -(ground_truth[i]) / (input[i] + 1e-10); + } +} + void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From ac2ce73e13b8ea52a108f640ade306317ca44b3c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:41:31 -0500 Subject: [PATCH 345/660] GNN softmax loss output layer This commit adds an implementation of the Softmax Loss output layer. The layer creates a probability distribution of the rows of a passed in matrix. The backward phase returns gradients on how to best move computation towards a distribution that favors the ground truth. Also includes some minor cleanup of the GCN layer. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/layers/SoftmaxLayer.h | 51 +++++++++++++ libgnn/src/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/SoftmaxLayer.cpp | 83 +++++++++++++++++++++ 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/layers/SoftmaxLayer.h create mode 100644 libgnn/src/SoftmaxLayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 28e8dc8630..24f1f0e33d 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -21,6 +21,7 @@ set(sources src/GNNLayer.cpp src/GNNMath.cpp src/GraphConvolutionalLayer.cpp + src/SoftmaxLayer.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h new file mode 100644 index 0000000000..45116f1b62 --- /dev/null +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -0,0 +1,51 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//! Softmax layer: takes each row of the input matrix and creates a probability +//! distribution based on the magnitude of elements in each row. +//! Currently this only works with **single class* labels and is coded as such. +class SoftmaxLayer : public GNNLayer { +public: + SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, + GNNConfig{.allocate_weights = false}), + input_loss_(dimensions.input_rows), + ground_truth_vectors_(dimensions.input_columns), + norm_gradient_vectors_(dimensions.input_columns), + softmax_temp_vectors_(dimensions.input_columns) { + // input/output columns must be equivalent in a softmax + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + } + + //! Creates probability distribution of each row of input + const std::vector& + ForwardPhase(const std::vector& input_embeddings) final; + + //! 
Get gradients to fix distribution such that it leans more towards single + //! class ground truth. + std::vector* + BackwardPhase(const std::vector& prev_layer_input, + std::vector* input_gradient) final; + + // TODO prediction loss function? +private: + //! Loss for each row of the input + std::vector input_loss_; + //! Each thread gets storage to allocate the ground truth vector in during + //! calculation; each vector is the size of a feature vector + galois::substrate::PerThreadStorage> + ground_truth_vectors_; + //! Each thread gets storage to allocate the gradients during backward + //! prop; each is the size of a feature vector + galois::substrate::PerThreadStorage> + norm_gradient_vectors_; + //! Each thread gets storage for a temporary vector used during softmax + //! derivative calculation; each is the size of a feature vector + galois::substrate::PerThreadStorage> + softmax_temp_vectors_; +}; + +} // namespace galois diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index bb00b83d61..fecea27d17 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -5,7 +5,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, const GNNConfig& config) - : galois::GNNLayer::GNNLayer(layer_num, graph, dimensions, config), + : GNNLayer(layer_num, graph, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = @@ -98,7 +98,6 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts) { size_t num_nodes = graph_.size(); - // galois::gPrint(pts, "\n"); galois::do_all( galois::iterate(static_cast(0), num_nodes), diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp new file mode 100644 index 0000000000..1c7073e560 --- /dev/null +++ b/libgnn/src/SoftmaxLayer.cpp @@ -0,0 +1,83 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +const std::vector& galois::SoftmaxLayer::ForwardPhase( + const std::vector& input_embeddings) { + input_loss_.assign(input_loss_.size(), 0.0); + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + + const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(graph_.begin(), graph_.end()), + [&](const unsigned i) { + if (graph_.IsValidForPhase(i, layer_phase_)) { + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &forward_output_matrix_[feature_length * i]); + + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + GALOIS_LOG_VERBOSE("Label for LID {} is {}", i, + graph_.GetSingleClassLabel(i)); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; + + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = + GNNCrossEntropy(feature_length, ground_truth_vec->data(), + &forward_output_matrix_[feature_length * i]); + GALOIS_LOG_VERBOSE("Loss for LID {} is {}", i, input_loss_[i]); + } + }, + // TODO chunk size? 
+ // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("SoftmaxForward")); + + return forward_output_matrix_; +} + +std::vector* +galois::SoftmaxLayer::BackwardPhase(const std::vector&, + std::vector*) { + const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(graph_.begin(), graph_.end()), + [&](const unsigned i) { + if (graph_.IsValidForPhase(i, layer_phase_)) { + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; + + // derivative cross entropy into norm grad + std::vector* norm_gradient = + norm_gradient_vectors_.getLocal(); + GNNCrossEntropyDerivative(feature_length, ground_truth_vec->data(), + forward_output_matrix_.data(), + norm_gradient->data()); + + // use norm grad with softmax deritave, save and return + std::vector* softmax_temp = + softmax_temp_vectors_.getLocal(); + GNNSoftmaxDerivative(feature_length, forward_output_matrix_.data(), + norm_gradient->data(), softmax_temp->data(), + backward_output_matrix_.data()); + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("SoftmaxBackward")); + + return &backward_output_matrix_; +} + +// TODO function for getting loss From d7645e2b61f1afd6fdfcc14868dc844aa23c8ce1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Sep 2020 18:43:52 -0500 Subject: [PATCH 346/660] Softmax layer test Adds a test that runs the forward and backward phase of the softmax layer. Verification only included for the forward pass at this point: it checks to make sure the probability distribution is as expected from the test input. The backward phase is just run without checking its output for sanity purposes. 
--- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/convlayer-test.cpp | 5 +- libgnn/test/softmaxlayer-test.cpp | 107 ++++++++++++++++++++++++++++++ 3 files changed, 113 insertions(+), 3 deletions(-) create mode 100644 libgnn/test/softmaxlayer-test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9469f86aba..c604dd59e2 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -6,4 +6,8 @@ add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) +add_executable(softmaxlayer-test softmaxlayer-test.cpp) +target_link_libraries(softmaxlayer-test galois_gnn) +add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + # TODO multi host tests diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 2d98875a0d..3c127b0ad0 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -7,9 +7,8 @@ int main() { galois::DistMemSys G; - // size_t num_threads = galois::setActiveThreads( - // 56 / galois::runtime::getSystemNetworkInterface().Num); - size_t num_threads = galois::setActiveThreads(1); + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp new file mode 100644 index 0000000000..bd3cd8c5e3 --- /dev/null +++ b/libgnn/test/softmaxlayer-test.cpp @@ -0,0 +1,107 @@ +//! @file convlayer-test.cpp +//! Softmax layer test with a test graph + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0{ + .input_rows = 7, + .input_columns = test_graph.GetNumLabelClasses(), + .output_columns = test_graph.GetNumLabelClasses()}; + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + const std::vector& prediction_distribution = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + + // assert that predictions are as expected + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == + i); + } + // train mode means last 2 vertices should be empty + for (size_t i = 5; i < 7; i++) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + 
GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + + // validation mode + output_layer->SetLayerPhase(galois::GNNPhase::kValidate); + const std::vector& pd2 = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + // validate vertex is index 5 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + for (size_t i = 6; i < 7; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + const std::vector& pd3 = + output_layer->ForwardPhase(softmax_input); + output_layer->BackwardPhase(softmax_input, nullptr); + // validate vertex is index 6 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); + // all but last are empty distributions + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(pd3[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); + } + + // TODO in future maybe: add better test for backward phase besides just + // running it +} From b9cc256c7cda6bd59be4b68c48189e68b4afc2a6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 22 Sep 2020 17:17:30 -0500 Subject: [PATCH 347/660] refixed gradient sync, added post process average --- .../include/deepgalois/layers/GradientSyncStructs.h | 6 +++++- libdeepgalois/src/layers/graph_conv_layer.cpp | 13 +++++++++++++ lonestar/gnn/include/engine.h | 2 +- 3 files changed, 19 insertions(+), 2 deletions(-) diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 26420aa30d..6f600b40a8 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -15,8 +15,12 @@ struct GradientSync { // if (std::abs(weight - y) > 0.00001) { // galois::gInfo("weight ", node_id, " not consistent with one received"); //} + if (y == 0) { + galois::gPrint("nothing important\n"); + } weight += y; - weight /= 2; + // need a post process divide all step + //weight /= 2; return true; } diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index 9320ade39c..f6741f4b6d 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -268,7 +268,20 @@ void graph_conv_layer::back_propagation(const float_t* in_data, math::d_dropout_cpu(x, y, scale_, in_grad, dropout_mask, in_grad); drop_timer.stop(); + deepgalois::_syncVectorSize = z; + deepgalois::_dataToSync = &layer::weight_grad[0]; + unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; layer::syncSub->sync("Gradients"); + galois::do_all( + galois::iterate((size_t)0, 
(size_t)z), + [&] (size_t i) { + //galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + //galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process") + ); + galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); } diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index 016ac80831..f9afb28a4c 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adam(); + deepgalois::optimizer* opt = new deepgalois::adagrad(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From 7b71274a02c1834ef95a8b3492fc861a56723a2c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Sep 2020 18:27:44 -0500 Subject: [PATCH 348/660] Add accessors and type to GNNLayers Adds a few more accessors to GNNLayer classes as well as a type field which can be used to determine what kind of layer an object is. Note the separation between an output layer and an intermediate layer; this may be merged later into a single field, but the current design is that an intermediate and output layer are considered differently throughout the codebase. --- libgnn/include/galois/layers/GNNLayer.h | 32 +++++++++++++++++++-- libgnn/include/galois/layers/SoftmaxLayer.h | 3 ++ libgnn/src/GraphConvolutionalLayer.cpp | 1 + 3 files changed, 33 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e5d83678f1..4144dbfead 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -5,6 +5,19 @@ namespace galois { +//! Supported layer types in the GNN +enum class GNNLayerType { + //! Invalid placeholder + kInvalid, + //! GCN + kGraphConvolutional + // TODO SAGE and GAT +}; + +// TODO Sigmoid +//! Supported output layer types in the GNN +enum class GNNOutputLayerType { kInvalid, kSoftmax }; + //! Struct holding the dimensions of a layer. Assumption is that a layer takes //! a matrix and outputs another matrix with a different # of columns (e.g. //! matrix multiply with a set of weights) @@ -50,6 +63,7 @@ class GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } @@ -76,13 +90,20 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; - const std::vector& GetLayerWeightGradients() { + //! Returns the weight gradients + const std::vector& GetLayerWeightGradients() const { return layer_weight_gradients_; } //! Returns dimensions of this layer - // XXX may not be needed - const GNNLayerDimensions& GetLayerDimensions() { return layer_dimensions_; } + const GNNLayerDimensions& GetLayerDimensions() const { + return layer_dimensions_; + } + + galois::GNNLayerType layer_type() const { return layer_type_; } + galois::GNNOutputLayerType output_layer_type() const { + return output_layer_type_; + } protected: //! 
Layer order (starts from 0); used in backward to shortcut output as layer @@ -116,6 +137,11 @@ class GNNLayer { std::vector dropout_mask_; //! Phase of GNN computation that this layer is currently in galois::GNNPhase layer_phase_{galois::GNNPhase::kTrain}; + //! Layer type (invalid if output layer) + galois::GNNLayerType layer_type_{galois::GNNLayerType::kInvalid}; + //! Output layer type (remains invalid if not an output layer) + galois::GNNOutputLayerType output_layer_type_{ + galois::GNNOutputLayerType::kInvalid}; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 45116f1b62..3052429b8b 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -16,8 +16,11 @@ class SoftmaxLayer : public GNNLayer { ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), softmax_temp_vectors_(dimensions.input_columns) { + output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + // output needs to match number of possible classes + GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); } //! Creates probability distribution of each row of input diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index fecea27d17..a5abe1d0ef 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -18,6 +18,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( layer_dimensions_.input_rows * layer_dimensions_.output_columns; GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); + layer_type_ = galois::GNNLayerType::kGraphConvolutional; } const std::vector& From b585f2b280a63cce4ff3904f4b07756a077aea62 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Sep 2020 18:42:34 -0500 Subject: [PATCH 349/660] Graph neural network construction + unit test Adds initial implementation of a graph neural network's constructor which can construct an arbitrary GNN given some config object and a few other details. Adds a unit test which checks the structure of the GNN to make sure that it is sane. Also adds a feature length accessor to GNNGraph. Next step is the implementation of forward/backward phases in the GNN. 
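
As a construction sketch (this mirrors the new gnnconstruct-test below;
graph_ptr and num_classes are placeholders, and the last two column sizes
must both equal the number of label classes because softmax keeps input
and output widths identical):

    // two GCN layers followed by a softmax output layer
    std::vector<galois::GNNLayerType> layer_types = {
        galois::GNNLayerType::kGraphConvolutional,
        galois::GNNLayerType::kGraphConvolutional};
    // hidden size, then last-layer/output sizes (= # label classes)
    std::vector<size_t> layer_sizes = {16, num_classes, num_classes};
    galois::GraphNeuralNetworkConfig config(
        2, layer_types, layer_sizes, galois::GNNOutputLayerType::kSoftmax);
    galois::GraphNeuralNetwork gnn(std::move(graph_ptr), std::move(config));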
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GraphNeuralNetwork.h | 128 +++++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 2 + libgnn/src/GraphNeuralNetwork.cpp | 57 +++++++++ libgnn/test/CMakeLists.txt | 6 +- libgnn/test/gnnconstruct-test.cpp | 64 +++++++++++ 6 files changed, 257 insertions(+), 1 deletion(-) create mode 100644 libgnn/include/galois/GraphNeuralNetwork.h create mode 100644 libgnn/src/GraphNeuralNetwork.cpp create mode 100644 libgnn/test/gnnconstruct-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 24f1f0e33d..ce6e6f990f 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -22,6 +22,7 @@ set(sources src/GNNMath.cpp src/GraphConvolutionalLayer.cpp src/SoftmaxLayer.cpp + src/GraphNeuralNetwork.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h new file mode 100644 index 0000000000..a4eb19f375 --- /dev/null +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -0,0 +1,128 @@ +#pragma once +//! @file GraphNeuralNetwork.h +//! +//! Defines the graph neural network class that is used to classify graphs as +//! well as helper enums/classes involved with the GNN. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//////////////////////////////////////////////////////////////////////////////// + +//! Configuration object passed into constructor of a GraphNeuralNetwork to +//! determine how the network gets constructed. +class GraphNeuralNetworkConfig { +public: + // default move, no copy + GraphNeuralNetworkConfig() = delete; + GraphNeuralNetworkConfig(const GraphNeuralNetworkConfig&) = delete; + GraphNeuralNetworkConfig& operator=(const GraphNeuralNetworkConfig&) = delete; + GraphNeuralNetworkConfig(GraphNeuralNetworkConfig&&) = default; + GraphNeuralNetworkConfig& operator=(GraphNeuralNetworkConfig&&) = default; + + //! Construction without a config for layers specified; uses a default + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, + GNNConfig{.do_dropout = true, + .dropout_rate = 0.3, + .do_normalization = true}) {} + + //! Construction with a specified config for layers + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + const GNNConfig& default_layer_config) + : num_intermediate_layers_(num_layers), layer_types_(layer_types), + layer_column_sizes_(layer_column_sizes), + output_layer_type_(output_layer_type), + default_layer_config_(default_layer_config) { + // Do sanity checks on inputs + // should have a type for each layer + GALOIS_LOG_ASSERT(num_intermediate_layers_ == layer_types_.size()); + // For now, should be at least 1 intermediate layer + GALOIS_LOG_ASSERT(num_intermediate_layers_ >= 1); + // + 1 because it includes output layer + GALOIS_LOG_ASSERT((num_intermediate_layers_ + 1) == + layer_column_sizes_.size()); + } + + //! # layers NOT including output layer + size_t num_intermediate_layers() { return num_intermediate_layers_; } + //! Get intermediate layer i + GNNLayerType intermediate_layer_type(size_t i) { + assert(i < num_intermediate_layers_); + return layer_types_[i]; + } + //! 
Get intermediate layer i's size + size_t intermediate_layer_size(size_t i) { + assert(i < num_intermediate_layers_); + return layer_column_sizes_[i]; + } + //! Type of output layer + GNNOutputLayerType output_layer_type() { return output_layer_type_; } + //! Size of output layer is last element of layer column sizes + size_t output_layer_size() { + return layer_column_sizes_[num_intermediate_layers_]; + } + //! Get the default layer config of layers in this GNN + const GNNConfig& default_layer_config() { return default_layer_config_; } + +private: + //! Number of layers to construct in the GNN not including the output + //! layer + size_t num_intermediate_layers_; + //! Layers to construct for the GNN going from left to right; size should + //! match num_layers setting + std::vector layer_types_; + //! Size (in columns) of each non-output layer; size should match num_layers + //! + 1 (+1 is for the output layer) + std::vector layer_column_sizes_; + //! Output layer type + GNNOutputLayerType output_layer_type_; + //! Default config to use for layers + GNNConfig default_layer_config_; +}; + +//////////////////////////////////////////////////////////////////////////////// + +//! Class representing the graph neural network: contains the graph to train as +//! well as all the layers that comprise it +class GraphNeuralNetwork { +public: + //! Construct the graph neural network given the graph to train on as well as + //! a configuration object + GraphNeuralNetwork(std::unique_ptr graph, + GraphNeuralNetworkConfig&& config); + + //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) + size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } + + //! Returns pointer to intermediate layer i + const galois::GNNLayer* GetIntermediateLayer(size_t i) { + if (i < gnn_layers_.size() - 1) { + return gnn_layers_[i].get(); + } else { + GALOIS_LOG_FATAL("Accessing out of bounds intermediate layer {}", i); + } + } + //! Returns the output layer + const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + +private: + //! Underlying graph to train + std::unique_ptr graph_; + //! Configuration object used to construct this GNN + GraphNeuralNetworkConfig config_; + //! 
GNN layers including the output + std::vector> gnn_layers_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cefc505992..fa06453df9 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -76,6 +76,8 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + size_t node_feature_length() const { return node_feature_length_; } + const std::vector& GetLocalFeatures() const { return local_node_features_; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp new file mode 100644 index 0000000000..9b6a4ad708 --- /dev/null +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -0,0 +1,57 @@ +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SoftmaxLayer.h" + +galois::GraphNeuralNetwork::GraphNeuralNetwork( + std::unique_ptr graph, + galois::GraphNeuralNetworkConfig&& config) + : graph_(std::move(graph)), config_(std::move(config)) { + // max number of rows that can be passed as inputs; allocate space for it as + // this will be the # of rows for each layer + size_t max_rows = graph_->size(); + + // create the intermediate layers + for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { + GNNLayerType layer_type = config_.intermediate_layer_type(i); + size_t prev_layer_columns; + + if (i != 0) { + // grab previous layer's size + prev_layer_columns = config_.intermediate_layer_size(i - 1); + } else { + // first layer means the input columns are # features in graph + prev_layer_columns = graph_->node_feature_length(); + } + + GNNLayerDimensions layer_dims = {.input_rows = max_rows, + .input_columns = prev_layer_columns, + .output_columns = + config_.intermediate_layer_size(i)}; + + switch (layer_type) { + case GNNLayerType::kGraphConvolutional: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + } + + // create the output layer + GNNLayerDimensions output_dims = { + .input_rows = max_rows, + // get last intermediate layer column size + .input_columns = config_.intermediate_layer_size( + config_.num_intermediate_layers() - 1), + .output_columns = config_.output_layer_size()}; + + switch (config_.output_layer_type()) { + case (GNNOutputLayerType::kSoftmax): + gnn_layers_.push_back(std::move(std::make_unique( + config_.num_intermediate_layers(), *graph_, output_dims))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index c604dd59e2..e7a04b5b5f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -10,4 +10,8 @@ add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) -# TODO multi host tests +add_executable(gnnconstruct-test gnnconstruct-test.cpp) +target_link_libraries(gnnconstruct-test galois_gnn) +add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) + +# TODO multi host tests? 
diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp new file mode 100644 index 0000000000..9265eb6b8b --- /dev/null +++ b/libgnn/test/gnnconstruct-test.cpp @@ -0,0 +1,64 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +//! @file gnnconstruct-test.cpp +//! Test to make sure construction works as expected + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // 2 layer test with softmax + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + // note this includes the output; last 2 must be same because softmax + std::vector layer_output_sizes = {4, 7, 7}; + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(gnn_config)); + + // note this does not include output layer + GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); + // assert layer types + GALOIS_LOG_ASSERT(galois::GNNLayerType::kGraphConvolutional == + gnn.GetIntermediateLayer(0)->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kInvalid == + gnn.GetIntermediateLayer(0)->output_layer_type()); + GALOIS_LOG_ASSERT(galois::GNNLayerType::kGraphConvolutional == + gnn.GetIntermediateLayer(1)->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kInvalid == + gnn.GetIntermediateLayer(1)->output_layer_type()); + GALOIS_LOG_ASSERT(galois::GNNLayerType::kInvalid == + gnn.GetOutputLayer()->layer_type()); + GALOIS_LOG_ASSERT(galois::GNNOutputLayerType::kSoftmax == + gnn.GetOutputLayer()->output_layer_type()); + + // assert dimensions are what we expect + const galois::GNNLayerDimensions& layer0_dims = + gnn.GetIntermediateLayer(0)->GetLayerDimensions(); + GALOIS_LOG_ASSERT(layer0_dims.input_rows == 7); + // remember tester has features of length 3 + GALOIS_LOG_ASSERT(layer0_dims.input_columns == 3); + GALOIS_LOG_ASSERT(layer0_dims.output_columns == 4); + + const galois::GNNLayerDimensions& layer1_dims = + gnn.GetIntermediateLayer(1)->GetLayerDimensions(); + GALOIS_LOG_ASSERT(layer1_dims.input_rows == 7); + GALOIS_LOG_ASSERT(layer1_dims.input_columns == 4); + GALOIS_LOG_ASSERT(layer1_dims.output_columns == 7); + + const galois::GNNLayerDimensions& output_dims = + gnn.GetOutputLayer()->GetLayerDimensions(); + GALOIS_LOG_ASSERT(output_dims.input_rows == 7); + GALOIS_LOG_ASSERT(output_dims.input_columns == 7); + GALOIS_LOG_ASSERT(output_dims.output_columns == 7); +} From 53b97018f78ea6acc3f005cfdaac1f49feb11a29 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 1 Oct 2020 18:40:53 -0500 Subject: [PATCH 350/660] Forward phase (inference) of GNN Added initial forward phase/inference call to the gnn class. Added various accessors and setters to layers to facilitate checking the results of a forward pass (e.g. getting reference to output). Test added to make sure the forward pass works as expected. 
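A minimal usage sketch of the new inference call (assuming a GraphNeuralNetwork object gnn built as in the tests; this fragment is illustrative only and not part of the patch):

    // Forward phase only: push the graph's node features through every layer
    // and read back per-layer outputs for verification.
    gnn->SetAllLayerWeightsTo1();  // debug helper so outputs are deterministic
    const std::vector<galois::GNNFloat>* predictions = gnn->DoInference();
    // predictions points at the output layer's forward output; intermediate
    // outputs are reachable via GetIntermediateLayer(i)->GetForwardOutput().

The gnnfb-test added below checks exactly these per-layer outputs against hand-computed values.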
--- libgnn/include/galois/GraphNeuralNetwork.h | 21 +++ libgnn/include/galois/layers/GNNLayer.h | 43 +++--- libgnn/src/GraphNeuralNetwork.cpp | 9 ++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/gnnconstruct-test.cpp | 5 +- libgnn/test/gnnfb-test.cpp | 156 +++++++++++++++++++++ 6 files changed, 220 insertions(+), 18 deletions(-) create mode 100644 libgnn/test/gnnfb-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index a4eb19f375..8762612a9b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -113,9 +113,30 @@ class GraphNeuralNetwork { GALOIS_LOG_FATAL("Accessing out of bounds intermediate layer {}", i); } } + + //! Set the phases of all layers at once + void SetLayerPhases(galois::GNNPhase phase) { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->SetLayerPhase(phase); + } + } + + //! Set weights on all layers to 1; should be used for debugging only + void SetAllLayerWeightsTo1() { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->InitAllWeightsTo1(); + } + } + //! Returns the output layer const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + //! Propogates the graph's feature vectors through the network to get a new + //! vector representation. + //! Also known as the forward phase in most literature + //! @returns Output layer's output + const std::vector* DoInference(); + private: //! Underlying graph to train std::unique_ptr graph_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4144dbfead..f22507b6be 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -69,7 +69,33 @@ class GNNLayer { //! Initializes all layer weights to 1. This is used as a debug function for //! testing. - void InitAllWeightsTo1() { layer_weights_.assign(layer_weights_.size(), 1); } + void InitAllWeightsTo1() { + if (layer_weights_.size()) { + layer_weights_.assign(layer_weights_.size(), 1); + } + } + + const std::vector& GetForwardOutput() const { + return forward_output_matrix_; + } + const std::vector& GetBackwardOutput() const { + return backward_output_matrix_; + } + + //! Returns the weight gradients + const std::vector& GetLayerWeightGradients() const { + return layer_weight_gradients_; + } + + //! Returns dimensions of this layer + const GNNLayerDimensions& GetLayerDimensions() const { + return layer_dimensions_; + } + + galois::GNNLayerType layer_type() const { return layer_type_; } + galois::GNNOutputLayerType output_layer_type() const { + return output_layer_type_; + } //! Conducts the forward phase given the input to this layer which //! ultimately leads to an output (classfication of node labels) at the end @@ -90,21 +116,6 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; - //! Returns the weight gradients - const std::vector& GetLayerWeightGradients() const { - return layer_weight_gradients_; - } - - //! Returns dimensions of this layer - const GNNLayerDimensions& GetLayerDimensions() const { - return layer_dimensions_; - } - - galois::GNNLayerType layer_type() const { return layer_type_; } - galois::GNNOutputLayerType output_layer_type() const { - return output_layer_type_; - } - protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 
0 does not need to do some things that other layers need to do diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9b6a4ad708..a593a218bf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -55,3 +55,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( GALOIS_LOG_FATAL("Invalid layer type during network construction"); } } + +const std::vector* galois::GraphNeuralNetwork::DoInference() { + // start with graph features and pass it through all layers of the network + const std::vector* layer_input = &(graph_->GetLocalFeatures()); + for (std::unique_ptr& ptr : gnn_layers_) { + layer_input = &(ptr->ForwardPhase(*layer_input)); + } + return layer_input; +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index e7a04b5b5f..66c70c6f26 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -14,4 +14,8 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) +add_executable(gnnfb-test gnnfb-test.cpp) +target_link_libraries(gnnfb-test galois_gnn) +add_test(NAME gnnfb-test COMMAND gnnfb-test) + # TODO multi host tests? diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 9265eb6b8b..537a16d5b0 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -1,8 +1,9 @@ -#include "galois/Logging.h" -#include "galois/GraphNeuralNetwork.h" //! @file gnnconstruct-test.cpp //! Test to make sure construction works as expected +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + int main() { galois::DistMemSys G; diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp new file mode 100644 index 0000000000..7a9ee5d697 --- /dev/null +++ b/libgnn/test/gnnfb-test.cpp @@ -0,0 +1,156 @@ +//! @file gnnfb-test.cpp +//! Runs a forward and backward phase on a GCN and an example graph. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // 2 layer test with softmax + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + // note this includes the output; last 2 must be same because softmax + std::vector layer_output_sizes = {4, 7, 7}; + // note GNNConfig is passed in; use a config that does not do anything extra + // like dropout or activation and the like so that input is easier to verify + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig()); + auto gnn = std::make_unique( + std::move(test_graph), std::move(gnn_config)); + // for constancy set everything to 1 + gnn->SetAllLayerWeightsTo1(); + + ////////////////////////////////////////////////////////////////////////////// + // forward phase + ////////////////////////////////////////////////////////////////////////////// + gnn->DoInference(); + + // check output for layers to make sure it's as expected + const std::vector& lf0_out = + gnn->GetIntermediateLayer(0)->GetForwardOutput(); + GALOIS_LOG_ASSERT(lf0_out.size() == 28); + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[0 + i] == 3); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[4 + i] == 6); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[8 + i] == 12); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[12 + i] == 18); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[16 + i] == 24); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[20 + i] == 30); + } + for (size_t i = 0; i < 4; i++) { + GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); + } + + const std::vector& lf1_out = + gnn->GetIntermediateLayer(1)->GetForwardOutput(); + GALOIS_LOG_ASSERT(lf1_out.size() == 49); + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[0 + i] == 24); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); + } + for (size_t i = 0; i < 7; i++) { + GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); + } + + const std::vector& fo_out = + gnn->GetOutputLayer()->GetForwardOutput(); + GALOIS_LOG_ASSERT(fo_out.size() == 49); + // since row all same, prob distribution across row should be same + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_VERBOSE("{}", fo_out[c + i]); + GALOIS_LOG_ASSERT(fo_out[c + i] == fo_out[c + i + 1]); + } + } + + // train mode = last 2 should be masked off + for (size_t c = 35; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out[c + i] == 0); + } + } + + ////////////////////////////////////////////////////////////////////////////// + // TODO backward phase + 
////////////////////////////////////////////////////////////////////////////// + + ////////////////////////////////////////////////////////////////////////////// + // verify forward val and test masks + ////////////////////////////////////////////////////////////////////////////// + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + gnn->DoInference(); + const std::vector& fo_out_val = + gnn->GetOutputLayer()->GetForwardOutput(); + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == fo_out_val[c + i + 1]); + } + } + // first 5 and last should be 0s + for (size_t c = 0; c < 35; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); + } + } + for (size_t c = 42; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); + } + } + + // all but last should be 0s + gnn->SetLayerPhases(galois::GNNPhase::kTest); + gnn->DoInference(); + const std::vector& fo_out_test = + gnn->GetOutputLayer()->GetForwardOutput(); + for (size_t c = 0; c < 49; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_test[c + i] == fo_out_test[c + i + 1]); + } + } + // first 5 and last should be 0s + for (size_t c = 0; c < 42; c += 7) { + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); + } + } + + // TODO different config of gnn +} From d1aff528137624c6e81d51e5eb2b5424262ae989 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 1 Oct 2020 19:38:49 -0500 Subject: [PATCH 351/660] GNN backward phase; no weight update Added implementation for GNN backward phase without the weight update. Each layer passes its gradients to the previous layer in the GNN where weight gradients as well as gradients for the next layer in propagation are determined. The weight gradients are not yet used to update the weights via SGD. This requires the addition of optimizer classes and will be the next step in this refactoring. Adds a call to the backward phase in the GNN FB test to make sure no crashes occur. --- libgnn/include/galois/GraphNeuralNetwork.h | 6 +++++ libgnn/src/GraphNeuralNetwork.cpp | 30 ++++++++++++++++++++++ libgnn/test/gnnfb-test.cpp | 20 +++++++++++++-- 3 files changed, 54 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 8762612a9b..962350c8c4 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -32,6 +32,7 @@ class GraphNeuralNetworkConfig { output_layer_type, GNNConfig{.do_dropout = true, .dropout_rate = 0.3, + .do_activation = true, .do_normalization = true}) {} //! Construction with a specified config for layers @@ -137,6 +138,11 @@ class GraphNeuralNetwork { //! @returns Output layer's output const std::vector* DoInference(); + //! Backpropagate gradients from the output layer backwards through the + //! network to update the layer weights. Also known as a backward phase in + //! most literature + void GradientPropagation(); + private: //! 
Underlying graph to train std::unique_ptr graph_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index a593a218bf..966cd3238a 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -64,3 +64,33 @@ const std::vector* galois::GraphNeuralNetwork::DoInference() { } return layer_input; } + +void galois::GraphNeuralNetwork::GradientPropagation() { + // from output layer get initial gradients + std::vector dummy; + std::unique_ptr& output_layer = gnn_layers_.back(); + std::vector* current_gradients = + output_layer->BackwardPhase(dummy, nullptr); + + // loops through intermediate layers in a backward fashion + // -1 to ignore output layer which was handled above + for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { + // note this assumes you have at least 2 layers + size_t layer_index = gnn_layers_.size() - 2 - i; + + // get the input to the layer before this one + const std::vector* prev_layer_input; + if (layer_index != 0) { + prev_layer_input = &(gnn_layers_[layer_index - 1]->GetForwardOutput()); + } else { + prev_layer_input = &(graph_->GetLocalFeatures()); + } + + // backward prop and get a new set of gradients + current_gradients = gnn_layers_[layer_index]->BackwardPhase( + *prev_layer_input, current_gradients); + // at this point in the layer the gradients exist; use the gradients to + // update the weights of the layer + // XXX need optimizers + } +} diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 7a9ee5d697..9fd43b4675 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -108,9 +108,12 @@ int main() { } ////////////////////////////////////////////////////////////////////////////// - // TODO backward phase + // backward phase; run it; verifying is difficult due to floating point + // nature of softmax gradients ////////////////////////////////////////////////////////////////////////////// + gnn->GradientPropagation(); + ////////////////////////////////////////////////////////////////////////////// // verify forward val and test masks ////////////////////////////////////////////////////////////////////////////// @@ -151,6 +154,19 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } + ////////////////////////////////////////////////////////////////////////////// + // run different config of gnn with dropout/activation + ////////////////////////////////////////////////////////////////////////////// - // TODO different config of gnn + GALOIS_LOG_VERBOSE("Running with different congifuration"); + + test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::GraphNeuralNetworkConfig gnn_config2( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + auto gnn2 = std::make_unique( + std::move(test_graph), std::move(gnn_config2)); + // run to make sure no crashes occur + gnn2->DoInference(); + gnn2->GradientPropagation(); } From 27d807d336f803d0bc29b6e90dda46776ab0015b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 14:22:33 -0500 Subject: [PATCH 352/660] Adam optimizer and test Added the Adam optimizer and a test for it. Notably different from the previous implementation is that there is now separate training variables for each layer instead of having them shared among all layers. 
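The per-weight rule the new optimizer applies is the standard Adam update with bias correction; a minimal single-weight sketch (not part of this patch) follows. beta1_t and beta2_t hold the running powers beta1^t and beta2^t, starting at beta1 and beta2 as in the beta power vectors of the new class.

    #include <cmath>

    // One Adam step for a single weight w with gradient g. m and v are the first
    // and second moment estimates carried across calls (initialized to 0).
    float AdamStep(float w, float g, float& m, float& v,
                   float& beta1_t, float& beta2_t,
                   float alpha, float beta1, float beta2, float epsilon) {
      m = beta1 * m + (1.0f - beta1) * g;        // first moment estimate
      v = beta2 * v + (1.0f - beta2) * g * g;    // second (uncentered) moment
      float m_hat = m / (1.0f - beta1_t);        // bias-corrected moments
      float v_hat = v / (1.0f - beta2_t);
      beta1_t *= beta1;                          // advance powers for the next step
      beta2_t *= beta2;
      return w - alpha * m_hat / (std::sqrt(v_hat) + epsilon);
    }

With the configuration used by the adam-test below (alpha = 1, beta1 = beta2 = 0.5, epsilon = 0) and w = g = 1, this gives m = v = 0.5, bias-corrected moments of 1, and a step of exactly 1, which is why the test expects the weights to drop from 1 to 0 on the first call.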
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNOptimizers.h | 70 +++++++++++++++++++++++++++ libgnn/src/GNNOptimizers.cpp | 44 +++++++++++++++++ libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/adam-test.cpp | 44 +++++++++++++++++ 5 files changed, 163 insertions(+) create mode 100644 libgnn/include/galois/GNNOptimizers.h create mode 100644 libgnn/src/GNNOptimizers.cpp create mode 100644 libgnn/test/adam-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ce6e6f990f..e6c8786cd2 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -23,6 +23,7 @@ set(sources src/GraphConvolutionalLayer.cpp src/SoftmaxLayer.cpp src/GraphNeuralNetwork.cpp + src/GNNOptimizers.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h new file mode 100644 index 0000000000..8037cbdef0 --- /dev/null +++ b/libgnn/include/galois/GNNOptimizers.h @@ -0,0 +1,70 @@ +#pragma once +// Code inspired from this; actual code style is not the same + changed some +// things such as adding params for every layer which TinyDNN does not seem to +// do +// https://github.com/tiny-dnn/tiny-dnn/blob/master/tiny_dnn/optimizers/optimizer.h +// Copyright (c) 2013, Taiga Nomi and the respective contributors +// All rights reserved. +// Changed by Galois under 3-BSD +#include "galois/GNNTypes.h" +#include + +namespace galois { + +//! Virtual class; optimizers all need the descent function +class BaseOptimizer { + virtual void GradientDescent(const std::vector& derivatives, + std::vector* matrix, + size_t layer_number) = 0; +}; + +//! Maintains a first and second moment for each weight in the weight matrix and +//! does gradient descent invidiually on each weight +class AdamOptimizer : public BaseOptimizer { +public: + //! Struct for specifying adam config. Defaults based on the Adam paper. + struct AdamConfiguration { + GNNFloat alpha{0.001}; + GNNFloat beta1{0.9}; + GNNFloat beta2{0.999}; + GNNFloat epsilon{1e-8}; + }; + + //! Constructor allocates memory, initializes training vars for each layer + AdamOptimizer(const AdamConfiguration& config, + const std::vector& trainable_layer_sizes, + size_t num_trainable_layers) + : config_(config), num_trainable_layers_(num_trainable_layers), + beta1_power_t_(num_trainable_layers_, config.beta1), + beta2_power_t_(num_trainable_layers_, config.beta2) { + assert(trainable_layer_sizes.size() == num_trainable_layers_); + // allocate vectors based on # of trainable layers + for (size_t layer_size : trainable_layer_sizes) { + first_moments_.emplace_back(layer_size, 0.0); + second_moments_.emplace_back(layer_size, 0.0); + } + assert(first_moments_.size() == num_trainable_layers_); + assert(second_moments_.size() == num_trainable_layers_); + } + //! Adam based gradient descent + void GradientDescent(const std::vector& derivatives, + std::vector* matrix, + size_t layer_number) final; + +private: + //! Configuration options for this layer + AdamConfiguration config_; + //! First moment vectors; one for each trainable layer + std::vector> first_moments_; + //! Second moment vectors; one for each trainable layer + std::vector> second_moments_; + //! 
Number of layers that can be trained (need moment vectors for each) + size_t num_trainable_layers_; + // power terms used in adam: updated by raising power every time update is + // called + // vector because one is necessary for each layer + std::vector beta1_power_t_; + std::vector beta2_power_t_; +}; + +} // namespace galois diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp new file mode 100644 index 0000000000..8698aa37c3 --- /dev/null +++ b/libgnn/src/GNNOptimizers.cpp @@ -0,0 +1,44 @@ +#include "galois/Galois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" +#include + +void galois::AdamOptimizer::GradientDescent( + const std::vector& derivatives, std::vector* matrix, + size_t layer_number) { + assert(derivatives.size() == matrix->size()); + + // grab based on layer being used + std::vector& first_moment = first_moments_[layer_number]; + std::vector& second_moment = second_moments_[layer_number]; + assert(derivatives.size() == first_moment.size()); + assert(derivatives.size() == second_moment.size()); + + // individual weight updates via gradients + galois::do_all( + galois::iterate(static_cast(0), matrix->size()), + [&](size_t i) { + // moment estimate updates + first_moment[i] = config_.beta1 * first_moment[i] + + (1.0 - config_.beta1) * derivatives[i]; + second_moment[i] = + config_.beta2 * second_moment[i] + + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); + GALOIS_LOG_VERBOSE("{} {}", first_moment[i], second_moment[i]); + // bias corrected moments using beta power + GNNFloat bias_correct_first = + first_moment[i] / (1.0 - beta1_power_t_[layer_number]); + GNNFloat bias_correct_second = + second_moment[i] / (1.0 - beta2_power_t_[layer_number]); + GALOIS_LOG_VERBOSE("{} {}", bias_correct_first, bias_correct_second); + // weight update using bias corrected moments + (matrix->data())[i] -= + config_.alpha * bias_correct_first / + (std::sqrt(bias_correct_second) + config_.epsilon); + }, + galois::loopname("AdamOptimizerGradientDescent")); + + // update the power terms for next update call + beta1_power_t_[layer_number] *= config_.beta1; + beta2_power_t_[layer_number] *= config_.beta2; +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 66c70c6f26..029f785ad1 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -18,4 +18,8 @@ add_executable(gnnfb-test gnnfb-test.cpp) target_link_libraries(gnnfb-test galois_gnn) add_test(NAME gnnfb-test COMMAND gnnfb-test) +add_executable(adam-test adam-test.cpp) +target_link_libraries(adam-test galois_gnn) +add_test(NAME adam-test COMMAND adam-test) + # TODO multi host tests? diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp new file mode 100644 index 0000000000..e01368ce87 --- /dev/null +++ b/libgnn/test/adam-test.cpp @@ -0,0 +1,44 @@ +//! @file adam-test.cpp +//! 
Tests the adam optimizer +#include "galois/DistGalois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + // create sample config that is easy to trace + galois::AdamOptimizer::AdamConfiguration config = { + .alpha = 1, .beta1 = 0.5, .beta2 = 0.5, .epsilon = 0}; + std::vector layer_sizes = {2, 1}; + galois::AdamOptimizer adam(config, layer_sizes, 2); + + std::vector weights1 = {1, 1}; + std::vector weights2 = {10}; + std::vector grad1 = {1, 1}; + std::vector grad2 = {10}; + + adam.GradientDescent(grad1, &weights1, 0); + // check weights + GALOIS_LOG_ASSERT(weights1[0] == 0.0); + GALOIS_LOG_ASSERT(weights1[1] == 0.0); + + adam.GradientDescent(grad2, &weights2, 1); + GALOIS_LOG_ASSERT(weights2[0] == 9.0); + + // run again to check if adam keeps moments from before + adam.GradientDescent(grad1, &weights1, 0); + // check weights again (turns out derivative one ends up doing same thing) + GALOIS_LOG_ASSERT(weights1[0] == -1.0); + GALOIS_LOG_ASSERT(weights1[1] == -1.0); + + // grad 2 again + adam.GradientDescent(grad2, &weights2, 1); + GALOIS_LOG_ASSERT(weights2[0] == 8.0); +} From 34c9848df23614e7bc24e6ab31a99b8a05007d63 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 18:24:38 -0500 Subject: [PATCH 353/660] GNN now takes an optimizer object Prep for gradient descent in GNN backward phase by adding an argument to GNN constructor requiring that an optimizer be specified. Changed tests to have optimizers as well. --- libgnn/include/galois/GNNOptimizers.h | 15 +++++++++++---- libgnn/include/galois/GraphNeuralNetwork.h | 4 ++++ libgnn/src/GraphNeuralNetwork.cpp | 4 +++- libgnn/test/gnnconstruct-test.cpp | 5 ++++- libgnn/test/gnnfb-test.cpp | 10 ++++++---- 5 files changed, 28 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 8037cbdef0..84531e5a20 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -8,6 +8,7 @@ // Changed by Galois under 3-BSD #include "galois/GNNTypes.h" #include +#include namespace galois { @@ -30,6 +31,11 @@ class AdamOptimizer : public BaseOptimizer { GNNFloat epsilon{1e-8}; }; + AdamOptimizer(const std::vector& trainable_layer_sizes, + size_t num_trainable_layers) + : AdamOptimizer(AdamConfiguration(), trainable_layer_sizes, + num_trainable_layers) {} + //! 
Constructor allocates memory, initializes training vars for each layer AdamOptimizer(const AdamConfiguration& config, const std::vector& trainable_layer_sizes, @@ -37,11 +43,12 @@ class AdamOptimizer : public BaseOptimizer { : config_(config), num_trainable_layers_(num_trainable_layers), beta1_power_t_(num_trainable_layers_, config.beta1), beta2_power_t_(num_trainable_layers_, config.beta2) { - assert(trainable_layer_sizes.size() == num_trainable_layers_); + // >= because only prefix will be considered otherwise + assert(trainable_layer_sizes.size() >= num_trainable_layers_); // allocate vectors based on # of trainable layers - for (size_t layer_size : trainable_layer_sizes) { - first_moments_.emplace_back(layer_size, 0.0); - second_moments_.emplace_back(layer_size, 0.0); + for (size_t i = 0; i < num_trainable_layers_; i++) { + first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); } assert(first_moments_.size() == num_trainable_layers_); assert(second_moments_.size() == num_trainable_layers_); diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 962350c8c4..d9cd6febc9 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -5,6 +5,7 @@ //! well as helper enums/classes involved with the GNN. #include "galois/Logging.h" +#include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" @@ -101,6 +102,7 @@ class GraphNeuralNetwork { //! Construct the graph neural network given the graph to train on as well as //! a configuration object GraphNeuralNetwork(std::unique_ptr graph, + std::unique_ptr optimizer, GraphNeuralNetworkConfig&& config); //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) @@ -146,6 +148,8 @@ class GraphNeuralNetwork { private: //! Underlying graph to train std::unique_ptr graph_; + //! Optimizer object for weight updates + std::unique_ptr optimizer_; //! Configuration object used to construct this GNN GraphNeuralNetworkConfig config_; //! 
GNN layers including the output diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 966cd3238a..daaa49297f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -4,8 +4,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, + std::unique_ptr optimizer, galois::GraphNeuralNetworkConfig&& config) - : graph_(std::move(graph)), config_(std::move(config)) { + : graph_(std::move(graph)), optimizer_(std::move(optimizer)), + config_(std::move(config)) { // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer size_t max_rows = graph_->size(); diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 537a16d5b0..25abf0e4a1 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -25,7 +25,10 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(gnn_config)); + auto adam = std::make_unique(layer_output_sizes, 2); + + galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(adam), + std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 9fd43b4675..75b91f40b6 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -28,8 +28,9 @@ int main() { galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig()); - auto gnn = std::make_unique( - std::move(test_graph), std::move(gnn_config)); + auto adam = std::make_unique(layer_output_sizes, 2); + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); @@ -164,8 +165,9 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto gnn2 = std::make_unique( - std::move(test_graph), std::move(gnn_config2)); + auto adam2 = std::make_unique(layer_output_sizes, 2); + auto gnn2 = std::make_unique( + std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur gnn2->DoInference(); gnn2->GradientPropagation(); From b0ecc157238f4dbad80de62902f19091c79a3660 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Oct 2020 18:57:15 -0500 Subject: [PATCH 354/660] Added gradient descent to backward pass GNN's backward pass now also calls the optimizer to update weights based on the gradient, thus completing the pipeline for a full training epoch. The tests have been updated accordingly to fix the adam size (supposed to be size of layer, not output). Next step is to add accuracy measures and then a full program can be tested end to end. 
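With gradient descent attached to the backward pass, one training epoch reduces to an inference call followed by gradient propagation. A minimal sketch (assuming a constructed gnn as in the tests and a caller-chosen num_epochs; not part of this patch):

    // One training epoch = forward pass + backward pass with Adam weight updates.
    // Accuracy reporting is added in a later commit.
    gnn->SetLayerPhases(galois::GNNPhase::kTrain);
    for (size_t epoch = 0; epoch < num_epochs; epoch++) {
      gnn->DoInference();          // forward phase over all layers
      gnn->GradientPropagation();  // backward phase + OptimizeLayer on each layer
    }
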
--- libgnn/include/galois/GNNOptimizers.h | 1 + libgnn/include/galois/layers/GNNLayer.h | 6 +++++- libgnn/src/GNNLayer.cpp | 6 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 3 ++- libgnn/test/gnnconstruct-test.cpp | 3 ++- libgnn/test/gnnfb-test.cpp | 12 ++++++++++-- 6 files changed, 26 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 84531e5a20..a970c54c56 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -14,6 +14,7 @@ namespace galois { //! Virtual class; optimizers all need the descent function class BaseOptimizer { +public: virtual void GradientDescent(const std::vector& derivatives, std::vector* matrix, size_t layer_number) = 0; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f22507b6be..3647434be6 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -1,6 +1,7 @@ #pragma once #include "galois/PerThreadRNG.h" +#include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" namespace galois { @@ -116,6 +117,10 @@ class GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) = 0; + //! Given an optimizer, update the weights in this layer based on gradients + //! stored in the layer + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 0 does not need to do some things that other layers need to do @@ -170,7 +175,6 @@ class GNNLayer { //! matrix void Activation(); //! Calculate derivative of activation function based on config on the matrix - // XXX void ActivationDerivative(std::vector* matrix); }; diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index d14a5d1b05..33114a2f06 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -100,3 +100,9 @@ void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { }, galois::loopname("ReLU-Derivative")); } + +void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, + trainable_layer_number); +} diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index daaa49297f..18675d2ce4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -91,8 +91,9 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // backward prop and get a new set of gradients current_gradients = gnn_layers_[layer_index]->BackwardPhase( *prev_layer_input, current_gradients); + // if not output do optimization/gradient descent // at this point in the layer the gradients exist; use the gradients to // update the weights of the layer - // XXX need optimizers + gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } } diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 25abf0e4a1..69c64105f6 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -25,7 +25,8 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto adam = std::make_unique(layer_output_sizes, 2); + std::vector adam_sizes = {12, 28}; + auto adam = std::make_unique(adam_sizes, 2); galois::GraphNeuralNetwork 
gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 75b91f40b6..8142b2435b 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -28,7 +28,10 @@ int main() { galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig()); - auto adam = std::make_unique(layer_output_sizes, 2); + // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 + // middle 2 are trainable so 12 and 28 + std::vector adam_sizes = {12, 28}; + auto adam = std::make_unique(adam_sizes, 2); auto gnn = std::make_unique( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 @@ -119,6 +122,7 @@ int main() { // verify forward val and test masks ////////////////////////////////////////////////////////////////////////////// gnn->SetLayerPhases(galois::GNNPhase::kValidate); + gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); const std::vector& fo_out_val = gnn->GetOutputLayer()->GetForwardOutput(); @@ -138,9 +142,11 @@ int main() { GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); } } + gnn->GradientPropagation(); // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); + gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); const std::vector& fo_out_test = gnn->GetOutputLayer()->GetForwardOutput(); @@ -155,6 +161,8 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } + gnn->GradientPropagation(); + ////////////////////////////////////////////////////////////////////////////// // run different config of gnn with dropout/activation ////////////////////////////////////////////////////////////////////////////// @@ -165,7 +173,7 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); - auto adam2 = std::make_unique(layer_output_sizes, 2); + auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur From b07f243ff0e1d322d8cabbe9a04132275220b3cd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 6 Oct 2020 13:24:49 -0500 Subject: [PATCH 355/660] Global accuracy in GNN; test for it as well Added a function to the GNN class to get accuracy of a prediction distribution from a softmax layer based on training, validiation, and test modes in the GNN. Added a field to track the mode in the GNN in addition to it being tracked in the layers. Added a test to make sure accuracy readings were returning correctly as expected as well. --- libgnn/include/galois/GraphNeuralNetwork.h | 11 ++- libgnn/include/galois/graphs/GNNGraph.h | 9 +++ libgnn/include/galois/layers/SoftmaxLayer.h | 1 - libgnn/src/GraphNeuralNetwork.cpp | 41 ++++++++++ libgnn/test/CMakeLists.txt | 4 + libgnn/test/accuracy-test.cpp | 89 +++++++++++++++++++++ 6 files changed, 153 insertions(+), 2 deletions(-) create mode 100644 libgnn/test/accuracy-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index d9cd6febc9..80f7b07916 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -117,8 +117,9 @@ class GraphNeuralNetwork { } } - //! Set the phases of all layers at once + //! 
Set the phases of all layers at once as well as this network void SetLayerPhases(galois::GNNPhase phase) { + phase_ = phase; for (std::unique_ptr& ptr : gnn_layers_) { ptr->SetLayerPhase(phase); } @@ -140,6 +141,8 @@ class GraphNeuralNetwork { //! @returns Output layer's output const std::vector* DoInference(); + float GetGlobalAccuracy(const std::vector& predictions); + //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -154,6 +157,12 @@ class GraphNeuralNetwork { GraphNeuralNetworkConfig config_; //! GNN layers including the output std::vector> gnn_layers_; + //! Current phase of the GNN: train, validation, test + GNNPhase phase_{GNNPhase::kTrain}; + //! Used to track accurate predictions during accuracy calculation + DGAccumulator num_correct_; + //! Used to count total number of things checked during accuracy calculation + DGAccumulator total_checked_; }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fa06453df9..79d96d0da5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -49,6 +49,15 @@ class GNNGraph { NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } //! Return GID of some local node size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } + + NodeIterator begin_owned() const { + return partitioned_graph_->masterNodesRange().begin(); + } + + NodeIterator end_owned() const { + return partitioned_graph_->masterNodesRange().end(); + } + //! Given an LID and the current phase of GNN computation, determine if the //! lid in question is valid for the current phase (i.e., it is part of //! a training, validation, or test phase mask) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3052429b8b..3b5ace94c8 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -33,7 +33,6 @@ class SoftmaxLayer : public GNNLayer { BackwardPhase(const std::vector& prev_layer_input, std::vector* input_gradient) final; - // TODO prediction loss function? private: //! 
Loss for each row of the input std::vector input_loss_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 18675d2ce4..7c209a3cbf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,3 +1,4 @@ +#include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/SoftmaxLayer.h" @@ -67,6 +68,46 @@ const std::vector* galois::GraphNeuralNetwork::DoInference() { return layer_input; } +float galois::GraphNeuralNetwork::GetGlobalAccuracy( + const std::vector& predictions) { + // check owned nodes' accuracy + size_t num_labels = graph_->GetNumLabelClasses(); + assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + galois::iterate(graph_->begin_owned(), graph_->end_owned()), + [&](const unsigned lid) { + if (graph_->IsValidForPhase(lid, phase_)) { + total_checked_ += 1; + // get prediction by getting max + size_t predicted_label = + galois::MaxIndex(num_labels, &(predictions[lid * num_labels])); + // GALOIS_LOG_VERBOSE("Checking LID {} with label {} against + // prediction {}", + // lid, graph_->GetSingleClassLabel(lid), + // predicted_label); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(graph_->GetSingleClassLabel(lid))) { + num_correct_ += 1; + } + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + // TODO revise for later when multi-class labels come in + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return static_cast(global_correct) / + static_cast(global_checked); +} + void galois::GraphNeuralNetwork::GradientPropagation() { // from output layer get initial gradients std::vector dummy; diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 029f785ad1..5934ad6331 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -22,4 +22,8 @@ add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) add_test(NAME adam-test COMMAND adam-test) +add_executable(accuracy-test accuracy-test.cpp) +target_link_libraries(accuracy-test galois_gnn) +add_test(NAME accuracy-test COMMAND accuracy-test) + # TODO multi host tests? diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp new file mode 100644 index 0000000000..61d449255f --- /dev/null +++ b/libgnn/test/accuracy-test.cpp @@ -0,0 +1,89 @@ +//! @file accuracy-test.cpp +//! Similar to softmax test except that accuracy is checked + it constructs +//! a full network object. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = {7, 7}; + galois::GraphNeuralNetworkConfig gnn_config( + 1, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig()); + + std::vector adam_sizes = {21}; + auto adam = std::make_unique(adam_sizes, 1); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + // for constancy set everything to 1 + gnn->SetAllLayerWeightsTo1(); + + ////////////////////////////////////////////////////////////////////////////// + + const std::vector* distributions = gnn->DoInference(); + // accuracy will be 0.2: everything chooses the first 1 as the entire row + // is the same + float pred_accuracy = gnn->GetGlobalAccuracy(*distributions); + GALOIS_LOG_VERBOSE("{}", pred_accuracy); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.2)); + + // validation mode + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + const std::vector* dist2 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(*dist2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + // test mode + gnn->SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* dist3 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(*dist3); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + // manufactured predictions to make sure it predicts things correctly based + // on mode + // prediction is correct if diagonal of the 7x7 matrix has largest value + std::vector mpred = { + 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, + 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1}; + + gnn->SetLayerPhases(galois::GNNPhase::kTrain); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_VERBOSE("{}", pred_accuracy); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.8)); + + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kTest); + pred_accuracy = gnn->GetGlobalAccuracy(mpred); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + std::vector mpred2 = { + 0.5, 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 0, 0, 0.1, 0, 1, + 0, 0, 0, 0, 0, 0, 0, 0.3, 0, 0, 0, 1, 0, 0, 0, 2, 0, + 0, 0, 0, 0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 0.1}; + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kValidate); + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); + + gnn->SetLayerPhases(galois::GNNPhase::kTest); + pred_accuracy = gnn->GetGlobalAccuracy(mpred2); + GALOIS_LOG_ASSERT(pred_accuracy == static_cast(1.0)); +} From 3a735e4c03c9ab651f5b109a0179935932618073 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Oct 2020 16:09:51 -0500 Subject: [PATCH 356/660] Various libgnn cleanup/bugfixes - Initialize different seeds for each thread in per thread RNG. 
- Fix dropout sizes being used (there's still a problem with it however). - Added more assertions for safety purposes throughout code. - Changed default value in Adam optimizer. - Removed some VERBOSE prints to clean things up. And various other things I may have forgotten. Accidentally included CMakeList modification in this commit for epoch-test; will be added in next commit. --- libgnn/include/galois/GNNOptimizers.h | 2 +- libgnn/include/galois/PerThreadRNG.h | 12 ++++++++++-- libgnn/src/GNNLayer.cpp | 21 +++++++++++++-------- libgnn/src/GNNMath.cpp | 4 +--- libgnn/src/GNNOptimizers.cpp | 2 -- libgnn/src/GraphConvolutionalLayer.cpp | 14 +++++++++----- libgnn/src/GraphNeuralNetwork.cpp | 4 +++- libgnn/src/SoftmaxLayer.cpp | 20 +++++++++++--------- libgnn/test/CMakeLists.txt | 4 ++++ 9 files changed, 52 insertions(+), 31 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index a970c54c56..c0e8dd2582 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -26,7 +26,7 @@ class AdamOptimizer : public BaseOptimizer { public: //! Struct for specifying adam config. Defaults based on the Adam paper. struct AdamConfiguration { - GNNFloat alpha{0.001}; + GNNFloat alpha{0.01}; GNNFloat beta1{0.9}; GNNFloat beta2{0.999}; GNNFloat epsilon{1e-8}; diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h index 80f8d11f0a..fde88386ab 100644 --- a/libgnn/include/galois/PerThreadRNG.h +++ b/libgnn/include/galois/PerThreadRNG.h @@ -1,7 +1,9 @@ #pragma once #include #include "galois/substrate/PerThreadStorage.h" +#include "galois/Galois.h" #include "galois/GNNTypes.h" +#include "galois/Logging.h" namespace galois { @@ -9,9 +11,15 @@ namespace galois { class PerThreadRNG { public: //! Default seed 0, default distribution 0 to 1 - PerThreadRNG() : distribution_{0.0, 1.0} {}; + PerThreadRNG() : PerThreadRNG(0.0, 1.0){}; //! User specified range - PerThreadRNG(float begin, float end) : distribution_{begin, end} {}; + PerThreadRNG(float begin, float end) : distribution_{begin, end} { + // each thread needs to have a different seed so that the same # isn't + // chosen across all threads + galois::on_each([&](unsigned tid, unsigned n_threads) { + engine_.getLocal()->seed(tid * n_threads); + }); + }; //! 
Returns a random number between numbers specified during init GNNFloat GetRandomNumber() { return (*distribution_.getLocal())(*engine_.getLocal()); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 33114a2f06..396f7ddf7c 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -1,3 +1,4 @@ +#include "galois/Logging.h" #include "galois/layers/GNNLayer.h" galois::GNNLayer::GNNLayer(size_t layer_num, @@ -9,8 +10,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, if (config_.allocate_weights) { // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input - dropout_mask_.resize(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + dropout_mask_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, false); // allocate memory based on layer dimensions size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; @@ -37,22 +38,25 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::loopname("RandomInitVector")); } +// XXX Something is wrong with dropout; accuracy suffers, figure out what +// it is void galois::GNNLayer::DoDropout(std::vector* output_matrix) { - // XXX fix droptout, should use inputs not weights - size_t num_weights = layer_weights_.size(); + size_t num_elements = output_matrix->size(); + assert(num_elements == dropout_mask_.size()); + // determine which weights to drop galois::do_all( - galois::iterate(static_cast(0), num_weights), + galois::iterate(static_cast(0), num_elements), [&](size_t i) { dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); }, galois::loopname("LayerDropoutRNG")); // create new matrix with non-dropped weights + some scaling - // TODO scaling? + // TODO save scaling elsewhere? GNNFloat scale = 1. / (1. - config_.dropout_rate); galois::do_all( - galois::iterate(static_cast(0), num_weights), + galois::iterate(static_cast(0), num_elements), [&](size_t i) { (*output_matrix)[i] = layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; @@ -61,7 +65,9 @@ void galois::GNNLayer::DoDropout(std::vector* output_matrix) { } void galois::GNNLayer::DoDropoutDerivative() { + assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); + // use dropout mask to figure out derivative galois::do_all( galois::iterate(static_cast(0), backward_output_matrix_.size()), @@ -88,7 +94,6 @@ void galois::GNNLayer::Activation() { void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { // TODO only does relu at the moment; should check user specified activation // and act accordingly - // XXX // keep gradient if the original output is greater than 0 galois::do_all( galois::iterate(static_cast(0), gradient->size()), diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 303e872e2a..0087c7340b 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -84,8 +84,6 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, continue; } - GALOIS_LOG_VERBOSE("Truth {} input {}", ground_truth[i], input[i]); - if (input[i] == 0.0) { loss -= ground_truth[i] * std::log(static_cast(1e-10)); } else { @@ -101,7 +99,7 @@ void galois::GNNCrossEntropyDerivative(const size_t vector_length, const GNNFloat* input, GNNFloat* gradients) { for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + 1e-10); + gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); } } diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 8698aa37c3..53088825fd 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -24,13 +24,11 @@ void galois::AdamOptimizer::GradientDescent( second_moment[i] = config_.beta2 * second_moment[i] + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); - GALOIS_LOG_VERBOSE("{} {}", first_moment[i], second_moment[i]); // bias corrected moments using beta power GNNFloat bias_correct_first = first_moment[i] / (1.0 - beta1_power_t_[layer_number]); GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); - GALOIS_LOG_VERBOSE("{} {}", bias_correct_first, bias_correct_second); // weight update using bias corrected moments (matrix->data())[i] -= config_.alpha * bias_correct_first / diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index a5abe1d0ef..7b513374db 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -34,17 +34,14 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout // TODO only dropout if in training apparently - if (config_.do_dropout) { - GALOIS_LOG_VERBOSE("Doing dropout"); + if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(&in_temp_1_); input_data = in_temp_1_.data(); } - GALOIS_LOG_VERBOSE("Doing aggregate"); // aggregation and update (or vice versa) AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), &input_column_intermediates_); - GALOIS_LOG_VERBOSE("Doing embedding update"); // TODO synchronization of aggregation functions UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); @@ -64,6 +61,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( std::vector* galois::GraphConvolutionalLayer::BackwardPhase( const std::vector& prev_layer_input, std::vector* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation if (config_.do_activation) { ActivationDerivative(input_gradient); @@ -73,6 +71,10 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // TODO do optimized cased like the forward if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output + 
assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), @@ -87,7 +89,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), layer_weight_gradients_.data()); // TODO sync weights - if (config_.do_dropout) { + if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); } @@ -155,6 +157,8 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + assert(layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7c209a3cbf..3424c2b3e3 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -104,6 +104,8 @@ float galois::GraphNeuralNetwork::GetGlobalAccuracy( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); + GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + return static_cast(global_correct) / static_cast(global_checked); } @@ -118,7 +120,7 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // loops through intermediate layers in a backward fashion // -1 to ignore output layer which was handled above for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { - // note this assumes you have at least 2 layers + // note this assumes you have at least 2 layers (including output) size_t layer_index = gnn_layers_.size() - 2 - i; // get the input to the layer before this one diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp index 1c7073e560..30dc476965 100644 --- a/libgnn/src/SoftmaxLayer.cpp +++ b/libgnn/src/SoftmaxLayer.cpp @@ -6,8 +6,8 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( const std::vector& input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { @@ -21,8 +21,6 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( ground_truth_vectors_.getLocal(); assert(ground_truth_vec->size() == feature_length); ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - GALOIS_LOG_VERBOSE("Label for LID {} is {}", i, - graph_.GetSingleClassLabel(i)); // single class label is an index; set the correct one (*ground_truth_vec)[static_cast( graph_.GetSingleClassLabel(i))] = 1.0; @@ -31,7 +29,6 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); - GALOIS_LOG_VERBOSE("Loss for LID {} is {}", i, input_loss_[i]); } }, // TODO chunk size? 
@@ -45,11 +42,14 @@ std::vector* galois::SoftmaxLayer::BackwardPhase(const std::vector&, std::vector*) { const size_t feature_length = layer_dimensions_.input_columns; + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // create ground truth vector for this LID + // TODO maybe make this part of the graph class instead of recreating + // every time std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); assert(ground_truth_vec->size() == feature_length); @@ -61,16 +61,18 @@ galois::SoftmaxLayer::BackwardPhase(const std::vector&, // derivative cross entropy into norm grad std::vector* norm_gradient = norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative(feature_length, ground_truth_vec->data(), - forward_output_matrix_.data(), - norm_gradient->data()); + GNNCrossEntropyDerivative( + feature_length, ground_truth_vec->data(), + &(forward_output_matrix_[i * feature_length]), + norm_gradient->data()); // use norm grad with softmax deritave, save and return std::vector* softmax_temp = softmax_temp_vectors_.getLocal(); - GNNSoftmaxDerivative(feature_length, forward_output_matrix_.data(), + GNNSoftmaxDerivative(feature_length, + &(forward_output_matrix_[i * feature_length]), norm_gradient->data(), softmax_temp->data(), - backward_output_matrix_.data()); + &(backward_output_matrix_[i * feature_length])); } }, // TODO chunk size? diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 5934ad6331..7ad7bf1888 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -26,4 +26,8 @@ add_executable(accuracy-test accuracy-test.cpp) target_link_libraries(accuracy-test galois_gnn) add_test(NAME accuracy-test COMMAND accuracy-test) +add_executable(epoch-test epoch-test.cpp) +target_link_libraries(epoch-test galois_gnn) +add_test(NAME epoch-test COMMAND epoch-test) + # TODO multi host tests? From 803f62378144b9b978ec3da429a0395416ac3013 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Oct 2020 16:10:30 -0500 Subject: [PATCH 357/660] epoch-test Test that runs on the cora dataset for 100 epochs to see if accuracy grows as time passes. The test has exposed some issues with the dropout option that will need to be fixed in a future commit, but otherwise training accuracy seems to grow as time passes and the program seems to scale. --- libgnn/test/epoch-test.cpp | 52 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 libgnn/test/epoch-test.cpp diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp new file mode 100644 index 0000000000..aada47eea2 --- /dev/null +++ b/libgnn/test/epoch-test.cpp @@ -0,0 +1,52 @@ +//! @file epoch-test.cpp +//! Run 50 epochs of training to see if results improve. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + // XXX fix dropout accuracy + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig{.do_dropout = false, .do_normalization = true}); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + for (size_t epoch = 0; epoch < 100; epoch++) { + const std::vector* predictions = gnn->DoInference(); + gnn->GradientPropagation(); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(*predictions), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), + "\n"); +} From 0a01df9789dbc87dc3aeae12d4b07e008798212e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 13:06:49 -0500 Subject: [PATCH 358/660] Removing some gDebugs Removing some gDebugs that appear while debugging GNNs in NewGeneric and GraphHelepers --- libcusp/include/galois/graphs/NewGeneric.h | 13 ------------ .../include/galois/graphs/GraphHelpers.h | 20 ------------------- 2 files changed, 33 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 3af95db9dd..4632d3b4d8 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -560,16 +560,10 @@ class NewDistGraphGeneric : public DistGraph { lid++; } } - galois::gDebug("[", base_DistGraph::id, " -> ", h, "] bitset size ", - (end - start) / 64, " vs. vector size ", - syncNodes[h].size() / 2); } lid -= numLocal; assert(lid == numToReserve); - galois::gDebug("[", base_DistGraph::id, "] total bitset size ", - (ghosts.size() - numLocal) / 64, " vs. 
total vector size ", - numToReserve / 2); // TODO: should not be used after this - refactor to make this clean ghosts.resize(0); @@ -1450,13 +1444,6 @@ class NewDistGraphGeneric : public DistGraph { asyncSyncLoad(nodeLoads, nodeAccum, edgeLoads, edgeAccum, loadsClear); } loadSyncTimer.stop(); - -#ifndef NDEBUG - if (async) { - galois::gDebug("[", base_DistGraph::id, "] host count ", - hostFinished.count()); - } -#endif } // if asynchronous, don't move on until everything is done diff --git a/libgalois/include/galois/graphs/GraphHelpers.h b/libgalois/include/galois/graphs/GraphHelpers.h index e7da20ebc1..ab0b48c5a5 100644 --- a/libgalois/include/galois/graphs/GraphHelpers.h +++ b/libgalois/include/galois/graphs/GraphHelpers.h @@ -167,8 +167,6 @@ auto divideNodesBinarySearch( // weight of a block (one block for each division by default; if scale // factor specifies something different, then use that instead) uint64_t blockWeight = (weight + numBlocks - 1) / numBlocks; - // galois::gDebug("weight ", weight, " numblock ", numBlocks, " blockwegith ", - // blockWeight); // lower and upper blocks that this division should use determined // using scaleFactor @@ -182,9 +180,6 @@ auto divideNodesBinarySearch( uint32_t blockUpper = scaleFactor[id]; assert(blockLower <= blockUpper); - // galois::gDebug("Unit ", id, " block ", blockLower, " to ", - // blockUpper, "; ", blockLower * blockWeight, " ", - // blockUpper * blockWeight); uint64_t nodesLower; // use prefix sum to find node bounds @@ -215,10 +210,6 @@ auto divideNodesBinarySearch( edgesUpper = edgePrefixSum[nodesUpper - 1 + nodeOffset] - edgeOffset; } - // galois::gDebug("Unit ", id, " nodes ", nodesLower, " to ", - // nodesUpper, " edges ", edgesLower, " ", - // edgesUpper); - return GraphRange( NodeRange(iterator(nodesLower), iterator(nodesUpper)), EdgeRange(edge_iterator(edgesLower), edge_iterator(edgesUpper))); @@ -294,11 +285,6 @@ void determineUnitRangesLoopGraph(GraphTy& graph, uint32_t unitsToSplit, // unit assinged no nodes, copy last one returnRanges[i + 1] = returnRanges[i]; } - - galois::gDebug("LoopGraph Unit ", i, " gets nodes ", returnRanges[i], - " to ", returnRanges[i + 1], ", num edges is ", - graph.edge_end(returnRanges[i + 1] - 1) - - graph.edge_begin(returnRanges[i])); } } @@ -362,9 +348,6 @@ void determineUnitRangesLoopPrefixSum(VectorTy& prefixSum, // unit assinged no nodes returnRanges[i + 1] = returnRanges[i]; } - - galois::gDebug("Unit ", i, " gets nodes ", returnRanges[i], " to ", - returnRanges[i + 1]); } } @@ -522,9 +505,6 @@ std::vector determineUnitRangesFromPrefixSum(uint32_t unitsToSplit, // unit assinged no nodes nodeRanges[i + 1] = nodeRanges[i]; } - - galois::gDebug("Unit ", i, " gets nodes ", nodeRanges[i], " to ", - nodeRanges[i + 1]); } return nodeRanges; From 03007ddce3f25da74420a661b5f7f24927d32fda Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 18:57:48 -0500 Subject: [PATCH 359/660] Glorot Bengio layer weight initialization Added a new function to initialize GNN layer weights based on a paper by Glorot and Bengio at AISTATS 2010. This was what was used in the non-refactored code to great effect in terms of accuracy gain. 
--- libgnn/include/galois/layers/GNNLayer.h | 8 ++++++++ libgnn/src/GNNLayer.cpp | 15 +++++++++++++-- 2 files changed, 21 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3647434be6..37d32a3c4d 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -161,6 +161,14 @@ class GNNLayer { ////////////////////////////////////////////////////////////////////////////// + //! Init based from following paper + //! http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf + //! Since it is unclear what j and j+1 refer to in that paper, the things + //! used are the dimensions of this particular weight matrix + //! TODO revisit paper and see what they really mean + //! Code inspired DGL and TinyDNN + void GlorotBengioInit(std::vector* vector_to_init); + //! Randomly init a float vector using the class's random init RNG void RandomInitVector(std::vector* vector_to_init); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 396f7ddf7c..5a8f106f20 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -17,8 +17,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_columns * layer_dimensions_.output_columns; layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); - // init weights randomly with a parallel loop - RandomInitVector(&layer_weights_); + GlorotBengioInit(&layer_weights_); } size_t num_output_elements = @@ -28,6 +27,18 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); } +void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { + float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + + layer_dimensions_.input_columns); + // TODO this seed should be configurable + std::default_random_engine rng(1); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector_to_init->size(); i++) { + (*vector_to_init)[i] = dist(rng); + } +} + void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::do_all( galois::iterate(static_cast(0), vector_to_init->size()), From 39c50716fcb0872988f6b81f6ba4d559a1f83c7e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 19:42:38 -0500 Subject: [PATCH 360/660] GCN layer aggregate/update flip based on dims Added an "optimization" to GCN passes where if input columns are greater than output columns then update occurs before aggregation to make it so aggregation has less work to do. This comes from DGL which also does something similar to save on compute time. 
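Note: the reasoning behind the flip, stated as a rough cost model, is that aggregation touches every edge once per feature column while the dense update is a |V| x input_columns x output_columns multiply in either order, so it is cheaper to aggregate over whichever side is narrower. A tiny sketch of the decision rule follows; it is illustrative only, and the patch implements the check inline in the layer.

#include <cstddef>

// Aggregate-then-update work is roughly |E| * in_cols + |V| * in_cols * out_cols;
// update-then-aggregate work is roughly |V| * in_cols * out_cols + |E| * out_cols.
// The dense multiply term is shared, so aggregate first only when the input
// side is the narrower one.
bool AggregateBeforeUpdate(size_t in_cols, size_t out_cols) {
  return in_cols <= out_cols;
}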
--- libgnn/src/GraphConvolutionalLayer.cpp | 71 ++++++++++++++++++-------- 1 file changed, 50 insertions(+), 21 deletions(-) diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 7b513374db..c2a838e0fd 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -33,17 +33,25 @@ galois::GraphConvolutionalLayer::ForwardPhase( // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout - // TODO only dropout if in training apparently if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(&in_temp_1_); input_data = in_temp_1_.data(); } - // aggregation and update (or vice versa) - AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), - &input_column_intermediates_); + // flip aggregate/update if dimensions favor it (do less work) + if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), + &input_column_intermediates_); + UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + } else { + // update to aggregate + UpdateEmbeddings(input_data, out_temp_.data()); + AggregateAll(layer_dimensions_.output_columns, out_temp_.data(), + forward_output_matrix_.data(), &output_column_intermediates_); + } + // TODO synchronization of aggregation functions - UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); // TODO if input columns > output columns do update first then aggregate for // efficiency @@ -68,25 +76,46 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( } // derivative of aggregation/update - // TODO do optimized cased like the forward - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), - backward_output_matrix_.data(), &input_column_intermediates_); + // TODO clean up logic here to reduce nesting + if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); + UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), + backward_output_matrix_.data(), + &input_column_intermediates_); + } + // weight gradient calculation + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), input_gradient->data(), + layer_weight_gradients_.data()); + } else { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + out_temp_.data(), 
&output_column_intermediates_); + if (layer_number_ != 0) { + // derivative for update + UpdateEmbeddingsDerivative(out_temp_.data(), + backward_output_matrix_.data()); + } + // weight gradient; note the use of the aggregated gradient in out_temp + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), out_temp_.data(), + layer_weight_gradients_.data()); } + // TODO sync agg/update - // weight gradient calculation - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, prev_layer_input.data(), - input_gradient->data(), layer_weight_gradients_.data()); // TODO sync weights if (config_.do_dropout && layer_number_ != 0) { From ce4437e75fbe622894f6c6d95c41a41d5ceac08f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 8 Oct 2020 19:45:17 -0500 Subject: [PATCH 361/660] Timer to epoch-test Adds a timer to epoch test to compare with older code. Note that after the previous few commits the accuracy now matches quite closely to the older code (there are slight differences due to corrections to the optimizer), and it is also faster than the older code as well. --- libgnn/test/epoch-test.cpp | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index aada47eea2..0bf8c61f81 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -22,9 +22,12 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; // XXX fix dropout accuracy + // XXX fix activation too galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = false, .do_normalization = true}); + galois::GNNConfig{.do_dropout = false, + .do_activation = false, + .do_normalization = true}); std::vector adam_sizes = {16 * test_graph->node_feature_length(), 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); @@ -37,6 +40,8 @@ int main() { // no verification; test should be eyeballed to make sure accuracy is // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); for (size_t epoch = 0; epoch < 100; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", @@ -49,4 +54,5 @@ int main() { const std::vector* predictions = gnn->DoInference(); galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), "\n"); + main_timer.stop(); } From db5204ce20388ec92a7b0695d938016f749f851d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 9 Oct 2020 18:37:49 -0500 Subject: [PATCH 362/660] Fixed forward dropout: input to drop not weights Fixed the dropout occurring in the forward phase of the GCN layer: the original implementation was dropping the layer weights instead of the input it is supposed to drop, which completely wrecked accuracy and caused segfaults. Turning on dropout no longer makes accuracy horrible.
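Note: the corrected behavior is standard inverted dropout applied to the layer input: zero a random subset of input entries and scale the survivors by 1 / (1 - rate) so the expected value is unchanged at inference time. A minimal sketch follows, assuming a 0/1 keep mask has already been sampled; the function and parameter names are illustrative, not the patch's exact code.

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

// Inverted dropout on the input matrix rather than on the weight matrix; the
// weight matrix generally has a different size than the input, which is
// consistent with the segfaults mentioned above.
void ApplyDropout(const std::vector<float>& input,
                  const std::vector<uint8_t>& keep_mask, float dropout_rate,
                  std::vector<float>* output) {
  assert(input.size() == keep_mask.size());
  assert(input.size() == output->size());
  const float scale = 1.0f / (1.0f - dropout_rate);
  for (size_t i = 0; i < input.size(); i++) {
    (*output)[i] = input[i] * static_cast<float>(keep_mask[i]) * scale;
  }
}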
--- libgnn/include/galois/layers/GNNLayer.h | 3 ++- libgnn/src/GNNLayer.cpp | 12 +++++++----- libgnn/src/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/epoch-test.cpp | 6 +++--- 4 files changed, 13 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 37d32a3c4d..0d30c337f2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -175,7 +175,8 @@ class GNNLayer { //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(std::vector* output_matrix); + void DoDropout(const std::vector& input_to_drop, + std::vector* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index 5a8f106f20..bcefd42efe 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -51,11 +51,13 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { // XXX Something is wrong with dropout; accuracy suffers, figure out what // it is -void galois::GNNLayer::DoDropout(std::vector* output_matrix) { +void galois::GNNLayer::DoDropout(const std::vector& input_to_dropout, + std::vector* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); + assert(num_elements == input_to_dropout.size()); - // determine which weights to drop + // determine which parts to drop galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { @@ -63,14 +65,14 @@ void galois::GNNLayer::DoDropout(std::vector* output_matrix) { }, galois::loopname("LayerDropoutRNG")); - // create new matrix with non-dropped weights + some scaling + // create new matrix with non-dropped input + some scaling // TODO save scaling elsewhere? GNNFloat scale = 1. / (1. 
- config_.dropout_rate); galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { - (*output_matrix)[i] = - layer_weights_[i] * static_cast(dropout_mask_[i]) * scale; + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; }, galois::loopname("LayerDropout")); } diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index c2a838e0fd..92f554da45 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -34,7 +34,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(&in_temp_1_); + DoDropout(input_embeddings, &in_temp_1_); input_data = in_temp_1_.data(); } diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 0bf8c61f81..c6c98ab7d1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + "reddit", galois::graphs::GNNPartitionScheme::kOEC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -25,7 +25,7 @@ int main() { // XXX fix activation too galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = false, + galois::GNNConfig{.do_dropout = true, .do_activation = false, .do_normalization = true}); @@ -42,7 +42,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 5; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", From 675018698fd3d66a1ea99c10c97060b23edba3be Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 9 Oct 2020 18:59:53 -0500 Subject: [PATCH 363/660] Disable convlayer-test and gnnfb-test for now With the addition of the optimization that flips aggregation/update order based on input/output columns, some old tests are now broken because the answer differs due to the flip in order. Disabled tests for now until I figure out the new correct output or undo the optimization somehow for the tests. 
--- libgnn/test/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 7ad7bf1888..791b79757e 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,9 +2,9 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -add_executable(convlayer-test convlayer-test.cpp) -target_link_libraries(convlayer-test galois_gnn) -add_test(NAME convlayer-test COMMAND convlayer-test) +#add_executable(convlayer-test convlayer-test.cpp) +#target_link_libraries(convlayer-test galois_gnn) +#add_test(NAME convlayer-test COMMAND convlayer-test) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) @@ -14,9 +14,9 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) -add_executable(gnnfb-test gnnfb-test.cpp) -target_link_libraries(gnnfb-test galois_gnn) -add_test(NAME gnnfb-test COMMAND gnnfb-test) +#add_executable(gnnfb-test gnnfb-test.cpp) +#target_link_libraries(gnnfb-test galois_gnn) +#add_test(NAME gnnfb-test COMMAND gnnfb-test) add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) From a7db2e2c885e125ad3bc68bbc22202e89ba9aef7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 12:30:00 -0500 Subject: [PATCH 364/660] Option to disable agg/update flip, reenable tests Adds an option to layers so the program doesn't automatically flip aggregate/update to avoid more work in the aggregate step. This allows the tests that broke before to be re-enabled. Also fixed the gnnfb test by removing the backward-propagation calls in the validation and test phases (backward propagation should only run during training). --- libgnn/include/galois/layers/GNNLayer.h | 3 +++ libgnn/src/GraphConvolutionalLayer.cpp | 6 ++++-- libgnn/test/CMakeLists.txt | 12 ++++++------ libgnn/test/convlayer-test.cpp | 16 ++++++++++------ libgnn/test/gnnfb-test.cpp | 7 +++---- 5 files changed, 26 insertions(+), 18 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 0d30c337f2..ac6cc9dd0e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -44,6 +44,9 @@ struct GNNConfig { bool do_activation{false}; //! True if normalization is to occur during multiplies bool do_normalization{false}; + //! If this is true, aggregate may occur after multiply if # of input columns + //!
is higher than output columns to do less work in aggregation + bool allow_aggregate_after_update{true}; // TODO activation type; for now default is softmax }; diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 92f554da45..5a222ced62 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -39,7 +39,8 @@ galois::GraphConvolutionalLayer::ForwardPhase( } // flip aggregate/update if dimensions favor it (do less work) - if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.allow_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), &input_column_intermediates_); @@ -77,7 +78,8 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregation/update // TODO clean up logic here to reduce nesting - if (layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.allow_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 791b79757e..7ad7bf1888 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,9 +2,9 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -#add_executable(convlayer-test convlayer-test.cpp) -#target_link_libraries(convlayer-test galois_gnn) -#add_test(NAME convlayer-test COMMAND convlayer-test) +add_executable(convlayer-test convlayer-test.cpp) +target_link_libraries(convlayer-test galois_gnn) +add_test(NAME convlayer-test COMMAND convlayer-test) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) @@ -14,9 +14,9 @@ add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) -#add_executable(gnnfb-test gnnfb-test.cpp) -#target_link_libraries(gnnfb-test galois_gnn) -#add_test(NAME gnnfb-test COMMAND gnnfb-test) +add_executable(gnnfb-test gnnfb-test.cpp) +target_link_libraries(gnnfb-test galois_gnn) +add_test(NAME gnnfb-test COMMAND gnnfb-test) add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 3c127b0ad0..3f46fb84d2 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -50,8 +50,9 @@ int main() { // create the layer, no norm factor // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = - std::make_unique(0, test_graph, - dimension_0); + std::make_unique( + 0, test_graph, dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -133,8 +134,9 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, - dimension_0); + std::make_unique( + 1, test_graph, dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); layer_1->InitAllWeightsTo1(); 
const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -199,8 +201,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config = { - .do_dropout = true, .do_activation = true, .do_normalization = true}; + galois::GNNConfig config = {.do_dropout = true, + .do_activation = true, + .do_normalization = true, + .allow_aggregate_after_update = false}; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 8142b2435b..50c40ff2c1 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -27,7 +27,7 @@ int main() { // like dropout or activation and the like so that input is easier to verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig()); + galois::GNNConfig{.allow_aggregate_after_update = false}); // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; @@ -142,7 +142,6 @@ int main() { GALOIS_LOG_ASSERT(fo_out_val[c + i] == 0); } } - gnn->GradientPropagation(); // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); @@ -161,7 +160,6 @@ int main() { GALOIS_LOG_ASSERT(fo_out_test[c + i] == 0); } } - gnn->GradientPropagation(); ////////////////////////////////////////////////////////////////////////////// // run different config of gnn with dropout/activation @@ -172,7 +170,8 @@ int main() { test_graph = std::make_unique( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( - 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax); + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + galois::GNNConfig{.allow_aggregate_after_update = false}); auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); From d83a224008a3996327062cbd072c93ef01034b0d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 12:32:04 -0500 Subject: [PATCH 365/660] old gnn: add tester as a valid input, use adam opt Two small changes to make it easier to debug the new refactored code. 
--- libdeepgalois/include/deepgalois/configs.h | 5 +++-- lonestar/gnn/include/engine.h | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/libdeepgalois/include/deepgalois/configs.h b/libdeepgalois/include/deepgalois/configs.h index f21dff7fed..5cbb1909fd 100644 --- a/libdeepgalois/include/deepgalois/configs.h +++ b/libdeepgalois/include/deepgalois/configs.h @@ -5,8 +5,9 @@ namespace deepgalois { const std::string path = "/net/ohm/export/iss/inputs/Learning/"; // path to the input dataset -#define NUM_DATASETS 8 +#define NUM_DATASETS 9 const std::string dataset_names[NUM_DATASETS] = { - "cora", "citeseer", "ppi", "pubmed", "flickr", "yelp", "reddit", "amazon"}; + "cora", "citeseer", "ppi", "pubmed", "flickr", + "yelp", "reddit", "amazon", "tester"}; } // namespace deepgalois diff --git a/lonestar/gnn/include/engine.h b/lonestar/gnn/include/engine.h index f9afb28a4c..016ac80831 100644 --- a/lonestar/gnn/include/engine.h +++ b/lonestar/gnn/include/engine.h @@ -92,7 +92,7 @@ int main(int argc, char** argv) { // see optimizer.h for more details // optimizer *opt = new gradient_descent(); // optimizer *opt = new adagrad(); - deepgalois::optimizer* opt = new deepgalois::adagrad(); + deepgalois::optimizer* opt = new deepgalois::adam(); galois::StatTimer Ttrain("TrainAndVal"); Ttrain.start(); network.train(opt, do_validate); // do training using training samples From 06142dc7f83134fe67aaaaec95e28b608c857bc4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 13:47:13 -0500 Subject: [PATCH 366/660] Switch epoch test to cora At this point in time turning on activation makes things much worse than the older code; will have to figure out why this is the case, but this is lower priority at the moment because activation also slows convergence. --- libgnn/test/epoch-test.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index c6c98ab7d1..c37a7d2e34 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,15 +14,14 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kOEC, true); + "cora", galois::graphs::GNNPartitionScheme::kOEC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; - // XXX fix dropout accuracy - // XXX fix activation too + // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, galois::GNNConfig{.do_dropout = true, @@ -42,7 +41,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 5; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", From 600e0ef5831a4f6a3182f4c6d7d2f182079cab76 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 15:31:15 -0500 Subject: [PATCH 367/660] libgnn CMakeLists cleanup Removal of unused MKL links as well as openmp/pthreads which don't seem to be required by openblas. 
--- libgnn/CMakeLists.txt | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e6c8786cd2..88398c3d60 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,21 +1,11 @@ -set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenmp -pthread") -SET(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) -SET(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) -set(BLAS_LIB "-lopenblas -lpthread") -if(USE_MKL_BLAS) - link_directories(${INTEL_LIBS_DIR}) - message(STATUS "ICC Libraries for MKL: ${INTEL_LIBS_DIR}") - SET(BLAS_INC_DIR ${MKL_ROOT}/include) - SET(BLAS_LIB_DIR ${MKL_ROOT}/lib/intel64) - set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") - #set(BLAS_LIB "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -lpthread -liomp5") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DUSE_MKL") -endif() - +set(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) +set(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) # blas library include_directories(${BLAS_INC_DIR}) link_directories(${BLAS_LIB_DIR}) +set(BLAS_LIB "-lopenblas") + set(sources src/GNNGraph.cpp src/GNNLayer.cpp From bd84e8f3981107ad89161f260530f678d919f8ff Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Oct 2020 16:39:47 -0500 Subject: [PATCH 368/660] Removed some galois/cusp/gluon debugs Removed a bunch of gDebugs that were making output in Debug build extremely hard to parse. --- libcusp/include/galois/graphs/BasePolicies.h | 8 --- .../include/galois/graphs/DistributedGraph.h | 3 - libcusp/include/galois/graphs/NewGeneric.h | 62 +------------------ libdist/include/galois/DTerminationDetector.h | 9 --- libgalois/include/galois/LargeArray.h | 4 -- .../include/galois/graphs/BufferedGraph.h | 3 - 6 files changed, 3 insertions(+), 86 deletions(-) diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index a529e391c7..446e9c7dae 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -203,16 +203,12 @@ class CustomMasterAssignment : public PartitioningScaffold { // found in map if (gidMasterIter != _gid2masters.end()) { uint32_t mappedMaster = gidMasterIter->second; - // galois::gDebug("[", _hostID, "] ", gid, " found with master ", - // mappedMaster, "!"); // make sure host is in bounds assert(mappedMaster < _numHosts); return mappedMaster; } else { // NOT FOUND (not necessarily a bad thing, and required for // some cases) - galois::gDebug("[", _hostID, "] ", gid, - " not found for retrieveMaster!"); if (_status == 2) { // die if we expect all gids to be mapped already (stage 2) GALOIS_DIE("should not fail to find a GID after stage 2 " @@ -253,7 +249,6 @@ class CustomMasterAssignment : public PartitioningScaffold { for (auto i = gid2offsets.begin(); i != gid2offsets.end(); i++) { assert(i->second < localNodeToMaster.size()); - galois::gDebug("Map ", i->first, " to ", localNodeToMaster[i->second]); _gid2masters[i->first] = localNodeToMaster[i->second]; } assert(_gid2masters.size() == (originalSize + gid2offsets.size())); @@ -314,13 +309,10 @@ class CustomMasterAssignment : public PartitioningScaffold { auto offsetIntoMapIter = _gid2masters.find(gid); if (offsetIntoMapIter == _gid2masters.end()) { // NOT FOUND - galois::gDebug("[", _hostID, "] ", gid, " not found; mapping!"); _gid2masters[gid] = mappedMaster; return true; } else { // already mapped - galois::gDebug("[", _hostID, "] ", gid, " already mapped with master ", - offsetIntoMapIter->second, "!"); 
assert(offsetIntoMapIter->second == mappedMaster); return false; } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 41c0e810f1..bf70bbf3e2 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -776,7 +776,6 @@ class DistGraph { withEdgeRanges.size() != 0) { masterRanges = withEdgeRanges; } else { - galois::gDebug("Manually det. master thread ranges"); masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, beginMaster + numOwned, 0); @@ -802,7 +801,6 @@ class DistGraph { masterRanges.size() != 0) { withEdgeRanges = masterRanges; } else { - galois::gDebug("Manually det. with edges thread ranges"); withEdgeRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, 0, numNodesWithEdges, 0); } @@ -869,7 +867,6 @@ class DistGraph { * Deallocates underlying LC CSR Graph */ void deallocate() { - galois::gDebug("Deallocating CSR in DistGraph"); graph.deallocate(); } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 4632d3b4d8..33a618c62f 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -1012,8 +1012,6 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedOffsets, std::vector& receivedMasters) { uint64_t hostOffset = base_DistGraph::gid2host[sendingHost].first; - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, " offset ", - hostOffset); // if execution gets here, messageType was 1 or 2 assert(receivedMasters.size() == receivedOffsets.size()); @@ -1021,10 +1019,8 @@ class NewDistGraphGeneric : public DistGraph { galois::do_all( galois::iterate((size_t)0, receivedMasters.size()), [&](size_t i) { - uint64_t curGID = hostOffset + receivedOffsets[i]; - uint32_t indexIntoMap = gid2offsets[curGID]; - galois::gDebug("[", base_DistGraph::id, "] gid ", curGID, " offset ", - indexIntoMap); + uint64_t curGID = hostOffset + receivedOffsets[i]; + uint32_t indexIntoMap = gid2offsets[curGID]; localNodeToMaster[indexIntoMap] = receivedMasters[i]; }, galois::no_stats()); @@ -1069,9 +1065,6 @@ class NewDistGraphGeneric : public DistGraph { messageType); } - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); - return std::make_pair(sendingHost, messageType); } @@ -1126,9 +1119,6 @@ class NewDistGraphGeneric : public DistGraph { GALOIS_DIE("invalid message type for sync of master assignments: ", messageType); } - - galois::gDebug("[", base_DistGraph::id, "] host ", sendingHost, - " send message type ", messageType); } } while (p); } @@ -1316,8 +1306,6 @@ class NewDistGraphGeneric : public DistGraph { // gid to vector offset setup std::unordered_map gid2offsets; uint64_t neighborCount = phase0MapSetup(ghosts, gid2offsets, syncNodes); - galois::gDebug("[", base_DistGraph::id, "] num neighbors found is ", - neighborCount); // send off neighbor metadata phase0SendRecv(syncNodes); @@ -1386,13 +1374,6 @@ class NewDistGraphGeneric : public DistGraph { auto work = getSpecificThreadRange(bufGraph, rangeVec, beginNode, endNode); - // debug print - // galois::on_each([&] (unsigned i, unsigned j) { - // galois::gDebug("[", base_DistGraph::id, " ", i, "] sync round ", - // syncRound, " local range ", - // *work.local_begin(), " ", *work.local_end()); - //}); - galois::do_all( // iterate over my read nodes 
galois::iterate(work), @@ -1410,10 +1391,6 @@ class NewDistGraphGeneric : public DistGraph { // on map with subtraction localNodeToMaster[node - globalOffset] = assignedHost; - // galois::gDebug("[", base_DistGraph::id, "] state round ", - // syncRound, - // " set ", node, " ", node - globalOffset); - // ptt.stop(); }, galois::loopname("Phase0DetermineMasters"), galois::steal(), @@ -1460,14 +1437,6 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { - //#ifndef NDEBUG - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to - // finish, ", - // hostFinished.count()); - // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts - // loads " - // "syncs to finish, ", loadsClear.count()); - //#endif // make sure all assignments are done and all loads are done syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); @@ -1476,15 +1445,9 @@ class NewDistGraphGeneric : public DistGraph { waitTime.stop(); } -#ifndef NDEBUG - printLoad(nodeLoads, nodeAccum); - printLoad(edgeLoads, edgeAccum); -#endif - // sanity check for correctness (all should be assigned) for (uint32_t i = 0; i < localNodeToMaster.size(); i++) { if (localNodeToMaster[i] == (uint32_t)-1) { - // galois::gDebug("[", base_DistGraph::id, "] bad index ", i); assert(localNodeToMaster[i] != (uint32_t)-1); } } @@ -2041,9 +2004,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; // size_t actuallySet = 0; for (uint32_t offset : offsetsToConsider.getOffsets()) { - // galois::gDebug("[", base_DistGraph::id, "] ", " setting ", - // offset + hostOffset, " from host ", senderHost, - // " to ", recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(offset + hostOffset, recvMasterLocations[curCount]); // bool set = graphPartitioner->addMasterMapping(offset + hostOffset, @@ -2051,9 +2011,6 @@ class NewDistGraphGeneric : public DistGraph { // if (set) { actuallySet++; } curCount++; } - - // galois::gDebug("[", base_DistGraph::id, "] host ", senderHost, ": set ", - // actuallySet, " out of ", recvMasterLocations.size()); } /** @@ -2070,9 +2027,6 @@ class NewDistGraphGeneric : public DistGraph { size_t curCount = 0; for (uint64_t gid : gids) { assert(gid < base_DistGraph::numGlobalNodes); - // galois::gDebug("[", base_DistGraph::id, "] ", " in-setting ", gid, " to - // ", - // recvMasterLocations[curCount]); graphPartitioner->addMasterMapping(gid, recvMasterLocations[curCount]); curCount++; } @@ -2133,7 +2087,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, offsets); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2142,7 +2095,6 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, 1); galois::runtime::gSerialize(b, curBitset); if (graphPartitioner->masterAssignPhase()) { - // galois::gDebug("incoming master map serialization"); // serializeIncomingMasterMap(b, curBitset, h); serializeIncomingMasterMap(b, curBitset); } @@ -2268,9 +2220,6 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gDebug("[", base_DistGraph::id, - "] To receive this many nodes: ", nodesToReceive); - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping 
complete.\n"); return prefixSumOfEdges; } @@ -2307,9 +2256,6 @@ class NewDistGraphGeneric : public DistGraph { galois::block_range((size_t)0, hostSize, tid, nthreads); uint64_t count = 0; for (size_t i = beginNode; i < endNode; i++) { - // galois::gDebug("[", base_DistGraph::id, "] ", i + startNode, - // " mapped to ", - // graphPartitioner->retrieveMaster(i+startNode)); if (graphPartitioner->retrieveMaster(i + startNode) == myHID) { count++; } @@ -2326,9 +2272,7 @@ class NewDistGraphGeneric : public DistGraph { assert(base_DistGraph::localToGlobalVector.size() == base_DistGraph::numNodes); - uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; - galois::gDebug("[", base_DistGraph::id, "] This many masters from host ", - h, ": ", newMasterNodes); + uint32_t newMasterNodes = threadPrefixSums[activeThreads - 1]; uint32_t startingNodeIndex = base_DistGraph::numNodes; // increase size of prefix sum + mapping vector prefixSumOfEdges.resize(base_DistGraph::numNodes + newMasterNodes); diff --git a/libdist/include/galois/DTerminationDetector.h b/libdist/include/galois/DTerminationDetector.h index 0f6d696838..28c58b3666 100644 --- a/libdist/include/galois/DTerminationDetector.h +++ b/libdist/include/galois/DTerminationDetector.h @@ -150,10 +150,8 @@ class DGTerminator { bool terminate() { bool active = (local_mdata != 0); - // if (active) galois::gDebug("[", net.ID, "] local work done \n"); if (!active) { active = net.anyPendingSends(); - // if (active) galois::gDebug("[", net.ID, "] pending send \n"); } int snapshot_ended = 0; if (!active) { @@ -166,8 +164,6 @@ class DGTerminator { } if (!active) { // check pending receives after checking snapshot active = net.anyPendingReceives(); - if (active) - galois::gDebug("[", net.ID, "] pending receive"); } if (active) { work_done = true; @@ -178,16 +174,11 @@ class DGTerminator { work_done = false; prev_snapshot = snapshot; ++snapshot; - galois::gDebug("[", net.ID, "] work done, taking snapshot ", - snapshot); initiate_snapshot(); } else if (prev_snapshot != snapshot) { prev_snapshot = snapshot; - galois::gDebug("[", net.ID, "] no work done, taking snapshot ", - snapshot); initiate_snapshot(); } else { - galois::gDebug("[", net.ID, "] terminating ", snapshot); // an explicit barrier may be required here // so that the next async phase begins on all hosts at the same time // however, this may add overheads when it is not required diff --git a/libgalois/include/galois/LargeArray.h b/libgalois/include/galois/LargeArray.h index da2b89b916..71df3036ff 100644 --- a/libgalois/include/galois/LargeArray.h +++ b/libgalois/include/galois/LargeArray.h @@ -80,21 +80,17 @@ class LargeArray { m_size = n; switch (t) { case Blocked: - galois::gDebug("Block-alloc'd"); m_realdata = substrate::largeMallocBlocked(n * sizeof(T), runtime::activeThreads); break; case Interleaved: - galois::gDebug("Interleave-alloc'd"); m_realdata = substrate::largeMallocInterleaved(n * sizeof(T), runtime::activeThreads); break; case Local: - galois::gDebug("Local-allocd"); m_realdata = substrate::largeMallocLocal(n * sizeof(T)); break; case Floating: - galois::gDebug("Floating-alloc'd"); m_realdata = substrate::largeMallocFloating(n * sizeof(T)); break; }; diff --git a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index 7140506311..e5e3fa4221 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -181,8 +181,6 @@ class BufferedGraph { void loadEdgeData(std::ifstream& 
graphFile, uint64_t edgeStart, uint64_t numEdgesToLoad, uint64_t numGlobalNodes, uint64_t numGlobalEdges) { - galois::gDebug("Loading edge data"); - if (numEdgesToLoad == 0) { return; } @@ -233,7 +231,6 @@ class BufferedGraph { typename EdgeType, typename std::enable_if::value>::type* = nullptr> void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t) { - galois::gDebug("Not loading edge data"); // do nothing (edge data is void, i.e. no edge data) } From 9a33c6f806c33289f733ac153509dcd894152e2f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 13 Oct 2020 13:53:12 -0500 Subject: [PATCH 369/660] Aggregation sync + feature reading fix + unit test 1) Fixes a bug in distributed feature reading that caused features to be read incorrectly (GID 0 -> LID 0 instead of GID 0 -> LID of global node 0). Fixed in both refactored and non-refactored code. Preliminary experiments show that it pretty much fixes accuracy in a distributed setting. (have yet to check if it reaches single host accuracy) 2) Adds preliminary aggregation sync call to forward/backward phase: trivial summation of rows of some matrix. Had to add globals to work with current Gluon sync structures (having Katana's refactor would be quite nice at this point in time......) 3) Adds a unit test for the aggregation sync. Pretty much the old conv layer sync call except that it adds logic to deal with distributed rows. --- libdeepgalois/src/DistContext.cpp | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 18 ++ .../graphs/GraphAggregationSyncStructures.h | 66 ++++++ libgnn/src/GNNGraph.cpp | 56 +++-- libgnn/src/GraphConvolutionalLayer.cpp | 12 +- libgnn/test/CMakeLists.txt | 5 + libgnn/test/aggregate-sync-test.cpp | 200 ++++++++++++++++++ libgnn/test/convlayer-test.cpp | 1 + 8 files changed, 338 insertions(+), 22 deletions(-) create mode 100644 libgnn/include/galois/graphs/GraphAggregationSyncStructures.h create mode 100644 libgnn/test/aggregate-sync-test.cpp diff --git a/libdeepgalois/src/DistContext.cpp b/libdeepgalois/src/DistContext.cpp index e9f0ef4214..21bcad0fe3 100644 --- a/libdeepgalois/src/DistContext.cpp +++ b/libdeepgalois/src/DistContext.cpp @@ -136,7 +136,7 @@ size_t DistContext::read_features(std::string dataset_str) { // h_feats[count * feat_len] = fullFeats[i]; std::copy(fullFeats + i * DistContext::feat_len, fullFeats + (i + 1) * DistContext::feat_len, - &this->h_feats[count * DistContext::feat_len]); + &this->h_feats[dGraph->getLID(i) * DistContext::feat_len]); count++; } } diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 79d96d0da5..a0b1430add 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -3,6 +3,7 @@ #include "galois/GNNTypes.h" #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" +#include "galois/graphs/GraphAggregationSyncStructures.h" namespace galois { @@ -38,6 +39,12 @@ class GNNGraph { GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Returns host id + size_t host_id() const { return host_id_; } + + //! Returns host id in brackets to use for printing things + const std::string& host_prefix() const { return host_prefix_; } + //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } @@ -102,10 +109,21 @@ class GNNGraph { return partitioned_graph_->edge_dst_ptr(); } + //! Given a matrix and the column size, do an aggregate sync where each row + //! 
is considered a node's data and sync using the graph's Gluon + //! substrate + //! Note that it's const because the only thing being used is the graph + //! topology of this object; the thing modified is the passed in matrix + void AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const; + private: //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on unsigned host_id_; + //! String header that can be used for debug print statements to get the host + //! this graph is on + std::string host_prefix_; //! Number of classes for a single vertex label size_t num_label_classes_{1}; //! Length of a feature node diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h new file mode 100644 index 0000000000..75a18fd830 --- /dev/null +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -0,0 +1,66 @@ +// defined in GNNGraph.cpp; set in order to control which matrix +// gets synchronized +#include "galois/GNNTypes.h" +#include "galois/BufferWrapper.h" + +namespace galois { +namespace graphs { + +extern GNNFloat* gnn_matrix_to_sync_; +extern size_t gnn_matrix_to_sync_column_length_; + +struct GNNSumAggregate { + using ValTy = galois::BufferWrapper; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + ValTy extracted_vec( + &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], + gnn_matrix_to_sync_column_length_); + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += + y[i]; + } + return true; + } + + //! do nothing (waste of a write) + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/GNNGraph.cpp index 38a78d68dc..a327dfe641 100644 --- a/libgnn/src/GNNGraph.cpp +++ b/libgnn/src/GNNGraph.cpp @@ -28,6 +28,13 @@ LoadPartition(const std::string& dataset_name, } // end namespace +namespace galois { +namespace graphs { +GNNFloat* gnn_matrix_to_sync_ = nullptr; +size_t gnn_matrix_to_sync_column_length_ = 0; +} // namespace graphs +} // namespace galois + galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { @@ -35,6 +42,10 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; + host_prefix_ = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); // load partition partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); @@ -90,6 +101,19 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } } +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + + // XXX bitset setting + + // call sync + sync_substrate_->sync( + "GraphAggregateSync"); +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -164,31 +188,32 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); // read in dimensions of features, specifically node feature length - size_t num_vertices; + size_t num_global_vertices; std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); - ifs >> num_vertices >> node_feature_length_; + ifs >> num_global_vertices >> node_feature_length_; ifs.close(); - GALOIS_LOG_ASSERT(num_vertices == partitioned_graph_->globalSize()); - GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_vertices, + GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, node_feature_length_); // memory for all features of all nodes in graph // TODO read features without loading entire feature file into memory; this // is quite inefficient std::unique_ptr full_feature_set = - 
std::make_unique(num_vertices * node_feature_length_); + std::make_unique(num_global_vertices * node_feature_length_); // read in all features std::ifstream file_stream; std::string feature_file = galois::gnn_dataset_path + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); - file_stream.read((char*)full_feature_set.get(), - sizeof(GNNFloat) * num_vertices * node_feature_length_); + file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * + num_global_vertices * + node_feature_length_); file_stream.close(); // allocate memory for local features @@ -196,18 +221,19 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( node_feature_length_); // copy over features for local nodes only - size_t local_vertex = 0; - for (size_t i = 0; i < num_vertices; i++) { - if (partitioned_graph_->isLocal(i)) { + size_t num_kept_vertices = 0; + for (size_t gid = 0; gid < num_global_vertices; gid++) { + if (partitioned_graph_->isLocal(gid)) { // copy over feature vector - std::copy(full_feature_set.get() + i * node_feature_length_, - full_feature_set.get() + (i + 1) * node_feature_length_, - &local_node_features_[local_vertex * node_feature_length_]); - local_vertex++; + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices++; } } full_feature_set.reset(); - GALOIS_LOG_ASSERT(local_vertex++ == partitioned_graph_->size()); + GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } //! Helper function to read masks from file into the appropriate structures diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 5a222ced62..75125e6fe2 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -54,9 +54,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( // TODO synchronization of aggregation functions - // TODO if input columns > output columns do update first then aggregate for - // efficiency - if (config_.do_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); @@ -116,9 +113,9 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( layer_weight_gradients_.data()); } - // TODO sync agg/update - - // TODO sync weights + // sync weight gradients; note aggregation sync occurs in the function call + // already + // XXX if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); @@ -176,6 +173,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( } }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 7ad7bf1888..70dc3c2b65 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -30,4 +30,9 @@ add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) +# TODO figure out how to make this test run in parallel +add_executable(aggregate-sync-test aggregate-sync-test.cpp) +target_link_libraries(aggregate-sync-test galois_gnn) +#add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + # TODO multi host tests? 
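For orientation, a minimal sketch of how a layer drives this sync, assuming it runs inside a layer that holds a GNNGraph reference named graph_ (the column width is illustrative; AggregateSync, GNNSumAggregate, and the two globals are the pieces added above). The aggregate-sync test added next exercises this same path across hosts.

    // Row-major output: one row of column_length floats per local node,
    // filled with the sum of each node's neighbor embeddings.
    size_t column_length = 16;  // illustrative feature width
    std::vector<galois::GNNFloat> aggregate_output(
        graph_.size() * column_length, 0.0);
    // ... aggregation loop fills the rows ...
    // AggregateSync points gnn_matrix_to_sync_ and
    // gnn_matrix_to_sync_column_length_ at this matrix, then runs the Gluon
    // sync with GNNSumAggregate so every proxy of a node ends up with the
    // same summed row.
    graph_.AggregateSync(aggregate_output.data(), column_length);
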
diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp new file mode 100644 index 0000000000..a6ca42e963 --- /dev/null +++ b/libgnn/test/aggregate-sync-test.cpp @@ -0,0 +1,200 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); + exit(1); + } + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // print edges for sanity + for (size_t node = 0; node < test_graph->size(); node++) { + for (auto e = test_graph->EdgeBegin(node); e != test_graph->EdgeEnd(node); + e++) { + galois::gPrint(test_graph->host_prefix(), "Edge ", + test_graph->GetGID(node), " ", + test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); + } + } + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), + .input_columns = 3, + .output_columns = 2}; + + // create the layer, no norm factor + // note layer number is 1 so that it does something in backward phase + std::unique_ptr layer_0 = + std::make_unique( + 0, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph->GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check output + ////////////////////////////////////////////////////////////////////////////// + + // check each row on each host: convert row into GID, and based on GID we + // know what the ground truth is + // row 0 = 3 + // row 1 = 6 + // row 2 = 12 + // row 3 = 18 + // row 4 = 24 + // row 5 = 30 + // row 6 = 15 + + // row should correspond to LID + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + } + } + + ////////////////////////////////////////////////////////////////////////////// + + std::vector dummy_ones(test_graph->size() * 2, 1); + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == test_graph->size() * 3); + for (size_t i = 
0; i < layer_0_backward_output->size(); i++) { + GALOIS_LOG_ASSERT((*layer_0_backward_output)[i] == 0); + } + + ////////////////////////////////////////////////////////////////////////////// + // layer 1 to check backward output + ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr layer_1 = + std::make_unique( + 1, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph->GetLocalFeatures()); + + // same check for forward as before + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones.assign(test_graph->size() * 2, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT((*layer_1_backward_output)[row * 3 + c] == + ground_truth); + } + } +} diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 3f46fb84d2..0bb6c25a4c 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -20,6 +20,7 @@ int main() { std::vector feats = test_graph.GetLocalFeatures(); ////////////////////////////////////////////////////////////////////////////// // doubles as a test for reading as well + GALOIS_LOG_ASSERT(7 == test_graph.size()); GALOIS_LOG_ASSERT(21 == feats.size()); GALOIS_LOG_ASSERT(0.0 == feats[0]); GALOIS_LOG_ASSERT(0.0 == feats[1]); From 4acdd1f5050e3fc4c19d1822c78d5bd986e0c9ee Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 13 Oct 2020 19:19:35 -0500 Subject: [PATCH 370/660] GNN weight gradient synchronization + unit test Adds a wrapper over any vector to make it possible to sync the vector using Gluon: assumes the vector is replicated present on all hosts. Use this wrapper to synchronize weight gradients calculated on each host during the backward phase of computation. This commit includes a sum and an average sync function with the average one being used by default. TODO later is to make this user configurable. Adds a unit test to make sure the weight gradient works as expected. With this commit an end to end training framework should work. There are still some accuracy issues to resolve, however (not matching old code accuracy). 
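The ownership model behind the wrapper is straightforward: every host keeps the full gradient vector, but each host masters only one contiguous block of it and mirrors the rest. A rough sketch of that split, using the same block_range helper the new GluonGradientInterface constructor uses (the weight count here is made up for illustration):

    // Illustrative: a 4 x 7 weight matrix flattened into 28 gradient entries.
    size_t num_weights = 28;
    size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num;
    size_t my_host = galois::runtime::getSystemNetworkInterface().ID;
    // Entries in [first, second) are this host's masters; every other entry
    // is a mirror that Gluon reduces into its owner and broadcasts back when
    // the gradient sync runs.
    std::pair<size_t, size_t> my_block =
        galois::block_range((size_t)0, num_weights, my_host, num_hosts);
    size_t begin_master = my_block.first;   // e.g. host 0 of 4 owns [0, 7)
    size_t end_master = my_block.second;
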
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/layers/GNNLayer.h | 13 ++++ .../galois/layers/GluonGradientInterface.h | 70 +++++++++++++++++++ .../galois/layers/GradientSyncStructures.h | 37 ++++++++++ libgnn/src/GNNLayer.cpp | 33 +++++++++ libgnn/src/GluonGradientInterface.cpp | 49 +++++++++++++ libgnn/src/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/aggregate-sync-test.cpp | 3 +- libgnn/test/convlayer-test.cpp | 1 - libgnn/test/weight-sync-test.cpp | 42 +++++++++++ 11 files changed, 251 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/layers/GluonGradientInterface.h create mode 100644 libgnn/include/galois/layers/GradientSyncStructures.h create mode 100644 libgnn/src/GluonGradientInterface.cpp create mode 100644 libgnn/test/weight-sync-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 88398c3d60..e8c109df6c 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -14,6 +14,7 @@ set(sources src/SoftmaxLayer.cpp src/GraphNeuralNetwork.cpp src/GNNOptimizers.cpp + src/GluonGradientInterface.cpp ) add_library(galois_gnn STATIC ${sources}) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ac6cc9dd0e..2232e82b5c 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,6 +3,7 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" +#include "galois/layers/GluonGradientInterface.h" namespace galois { @@ -141,6 +142,12 @@ class GNNLayer { std::vector layer_weights_; //! Gradients used to update the weights of this layer std::vector layer_weight_gradients_; + //! Wrapper over gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_; + //! Synchronization substrate for the weight gradients + std::unique_ptr> + gradient_sync_substrate_; + // There is a forward and a backward as their sizes will differ and we only // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. @@ -188,6 +195,12 @@ class GNNLayer { void Activation(); //! Calculate derivative of activation function based on config on the matrix void ActivationDerivative(std::vector* matrix); + + //! Synchronize weight gradients with a summation + void WeightGradientSyncSum(); + //! Synchronize weight gradients with a summation, then locally divide all + //! weights to get an average + void WeightGradientSyncAverage(); }; } // namespace galois diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h new file mode 100644 index 0000000000..92c0a5eb69 --- /dev/null +++ b/libgnn/include/galois/layers/GluonGradientInterface.h @@ -0,0 +1,70 @@ +#pragma once + +#include "galois/GNNTypes.h" +#include "galois/gstl.h" +#include "galois/runtime/Network.h" + +namespace galois { + +// TODO figure out which function calls can be removed without causing compiler +// to complain + +//! Wraps a matrix and allows it to be synchronized via Gluon as it provides +//! all the functions Gluon needs. +//! Assumes the matrix is initialized the same way across all hosts (if not +//! they'll all see the same values after the first round of sync anyways) +class GluonGradientInterface { +public: + //! Save reference to weight gradients. + //! Then setup mirror metadata for Gluon to use during setup. + GluonGradientInterface(std::vector& gradients); + + //! 
Size is number of weights since all hosts own everything + size_t size() const { return num_weights_; } + //! Global size is number of weights + size_t globalSize() const { return num_weights_; } + //! Return the weights owned by this host + size_t numMasters() const { return num_owned_; } + //! GID is same as LID since all hosts have all weights + uint32_t getGID(const uint32_t node_id) const { return node_id; } + //! LID is same as GID since all hosts have all weights + uint32_t getLID(const uint32_t node_id) const { return node_id; } + //! Return weight w + GNNFloat& getData(uint32_t w) const { return gradients_[w]; } + //! Return ranges for mirrors (unowned nodes) + const std::vector>& getMirrorRanges() const { + return mirror_ranges_; + } + //! Return mirror nodes for each host from this host's point of view + std::vector>& getMirrorNodes() { return mirror_nodes_; } + + ////////////////////////////////////////////////////////////////////////////// + + // for all that follow, no edges in this sync so most of this returns what + // you expect + // size_t getNumNodesWithEdges() const { return 0; } + bool is_vertex_cut() const { return false; } + unsigned edge_begin(uint32_t) const { return 0; } + unsigned edge_end(uint32_t) const { return 0; } + unsigned getEdgeDst(uint32_t) const { return 0; } + unsigned getEdgeData(uint32_t) const { return 0; } + void deallocate() const {}; + +private: + //! Reference to gradients that can get synchronized + std::vector& gradients_; + //! number of weight gradients + size_t num_weights_; + //! number of single gradients this host is responsible for + size_t num_owned_; + //! First weight that's a master + size_t begin_master_; + //! Last weight that's a master + size_t end_master_; + //! My nodes whose's masters are on other hosts; global ids + std::vector> mirror_nodes_; + //! nodes that are mirrors on this host + std::vector> mirror_ranges_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GradientSyncStructures.h b/libgnn/include/galois/layers/GradientSyncStructures.h new file mode 100644 index 0000000000..32b7a85b82 --- /dev/null +++ b/libgnn/include/galois/layers/GradientSyncStructures.h @@ -0,0 +1,37 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Simple summation of values +struct WeightGradientSummation { + using ValTy = GNNFloat; + static ValTy extract(uint32_t, ValTy& weight) { return weight; } + static bool reduce(uint32_t, ValTy& weight, ValTy y) { + weight += y; + return true; + } + + //! reset weight to 0 + static void reset(uint32_t, ValTy& weight) { weight = 0.0; } + + //! 
save weight + static void setVal(uint32_t, ValTy& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + +} // namespace galois diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/GNNLayer.cpp index bcefd42efe..838bf45905 100644 --- a/libgnn/src/GNNLayer.cpp +++ b/libgnn/src/GNNLayer.cpp @@ -1,5 +1,6 @@ #include "galois/Logging.h" #include "galois/layers/GNNLayer.h" +#include "galois/layers/GradientSyncStructures.h" galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, @@ -18,6 +19,15 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); GlorotBengioInit(&layer_weights_); + + // initialize sync substrate + gradient_sync_interface_ = + std::make_unique(layer_weight_gradients_); + gradient_sync_substrate_ = std::make_unique< + galois::graphs::GluonSubstrate>( + *gradient_sync_interface_, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_output_elements = @@ -124,3 +134,26 @@ void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, trainable_layer_number); } + +void galois::GNNLayer::WeightGradientSyncSum() { + // XXX bitset + gradient_sync_substrate_->sync( + "WeightGradientsSync"); +} + +void galois::GNNLayer::WeightGradientSyncAverage() { + size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; + if (num_hosts > 1) { + // XXX bitset + // sum, then average by dividing all by num hosts (every host participates + // in sync) + gradient_sync_substrate_->sync( + "WeightGradientsSyncAverage"); + galois::do_all( + galois::iterate(static_cast(0), layer_weight_gradients_.size()), + [&](size_t weight_index) { + layer_weight_gradients_[weight_index] /= num_hosts; + }, + galois::loopname("WeightGradientSyncAverageDivide")); + } +} diff --git a/libgnn/src/GluonGradientInterface.cpp b/libgnn/src/GluonGradientInterface.cpp new file mode 100644 index 0000000000..31770afb4e --- /dev/null +++ b/libgnn/src/GluonGradientInterface.cpp @@ -0,0 +1,49 @@ +#include "galois/layers/GluonGradientInterface.h" + +galois::GluonGradientInterface::GluonGradientInterface( + std::vector& gradients) + : gradients_(gradients), num_weights_(gradients_.size()) { + size_t my_host = galois::runtime::getSystemNetworkInterface().ID; + size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; + + // allocate a vector for each host + mirror_nodes_.resize(num_hosts); + + // loop through distribution of weights to hosts + for (unsigned h = 0; h < num_hosts; h++) { + std::pair cur_range = + galois::block_range((size_t)0, num_weights_, h, num_hosts); + + if (h != my_host) { + // setup mirrors for the host h which is just the list of IDs + size_t current_weight = cur_range.first; + size_t last_weight = cur_range.second; + size_t num_host_weights = last_weight - 
current_weight; + + // set mirrors for host h + mirror_nodes_[h].reserve(num_host_weights); + for (; current_weight < last_weight; current_weight++) { + mirror_nodes_[h].push_back(current_weight); + } + } else { + // these belong to this host; save, then mirror ranges can be + // calculated from this + begin_master_ = cur_range.first; + end_master_ = cur_range.second; + num_owned_ = end_master_ - begin_master_; + + // first range is 0 to begin master + if (begin_master_ > 0) { + mirror_ranges_.emplace_back(0, begin_master_); + } + + // second range is endMaster to end + if (end_master_ < num_weights_) { + mirror_ranges_.emplace_back(end_master_, num_weights_); + } + } + } + + galois::gInfo("[", my_host, "] Weight gradients: this host owns ", + begin_master_, " to ", end_master_); +} diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/GraphConvolutionalLayer.cpp index 75125e6fe2..d02a2bf0ca 100644 --- a/libgnn/src/GraphConvolutionalLayer.cpp +++ b/libgnn/src/GraphConvolutionalLayer.cpp @@ -115,7 +115,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already - // XXX + WeightGradientSyncAverage(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 70dc3c2b65..69ef29b43f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -35,4 +35,7 @@ add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) +add_executable(weight-sync-test weight-sync-test.cpp) +target_link_libraries(weight-sync-test galois_gnn) + # TODO multi host tests? 
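Numerically, the average path is just the sum sync followed by a local divide by the host count, so every host finishes with identical gradients. A small worked sketch using the per-host values the weight-sync test below asserts on (taken from the 4-host "tester" partition):

    // One weight-gradient entry as computed locally on each of the 4 hosts.
    galois::GNNFloat per_host_grads[] = {18.0, 21.0, 12.0, 0.0};
    galois::GNNFloat sum = 0.0;
    for (galois::GNNFloat g : per_host_grads)
      sum += g;                              // after the sum sync: 51
    galois::GNNFloat averaged = sum / 4.0;   // divide by Num on each host: 12.75
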
diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index a6ca42e963..6b67c65bfe 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -30,7 +30,6 @@ int main() { .output_columns = 2}; // create the layer, no norm factor - // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = std::make_unique( 0, *(test_graph.get()), dimension_0, @@ -197,4 +196,6 @@ int main() { ground_truth); } } + + // XXX TODO CVC } diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 0bb6c25a4c..ffe3bb6513 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -49,7 +49,6 @@ int main() { .input_rows = 7, .input_columns = 3, .output_columns = 2}; // create the layer, no norm factor - // note layer number is 1 so that it does something in backward phase std::unique_ptr layer_0 = std::make_unique( 0, test_graph, dimension_0, diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp new file mode 100644 index 0000000000..33c08df29b --- /dev/null +++ b/libgnn/test/weight-sync-test.cpp @@ -0,0 +1,42 @@ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 4) { + GALOIS_LOG_ERROR("This test should be run with 4 hosts/processes"); + exit(1); + } + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), + .input_columns = 3, + .output_columns = 2}; + + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique( + 0, *(test_graph.get()), dimension_0, + galois::GNNConfig{.allow_aggregate_after_update = false}); + layer_0->InitAllWeightsTo1(); + + // backward pass checking; check the gradients out + std::vector dummy_ones(test_graph->size() * 2, 1); + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + + // gradient verification; average + // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 + const std::vector& grads = + layer_0->GetLayerWeightGradients(); + for (size_t i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(grads[i] == 12.75); + } + + // XXX CVC +} From b7c9fc176e547942a84702b4c663697d49126231 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 14:26:26 -0500 Subject: [PATCH 371/660] disabled ReLU in the old gnn code It hurts more than helps from my experience --- libdeepgalois/src/layers/graph_conv_layer.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index f6741f4b6d..da9b01dbae 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -76,6 +76,8 @@ void graph_conv_layer::malloc_and_init() { norm_scores_grad.resize(ne); epsilon = 0.2; // LeakyReLU angle of negative slope #endif + dropout_ = true; + act_ = false; if (dropout_) dropout_mask = new mask_t[x * y]; From 8f289eaa68435f6db54bfe604d1eabe2de10dfbb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 19:23:28 -0500 Subject: [PATCH 372/660] libgnn needs MKL; non-blas dot product 1) Move over to use MKL instead of OpenBLAS for libgnn because MKL is 
way easier to find on other machines. 2) Dot product use in GNN math is now a regular for loop rather than a call to cblas (to avoid nested parallelism problem; some prelim testing shows it doesn't affect performance). --- CMakeLists.txt | 4 +--- libgnn/CMakeLists.txt | 18 +++++++----------- libgnn/include/galois/GNNMath.h | 2 +- libgnn/src/GNNMath.cpp | 16 +++++++++++++--- 4 files changed, 22 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fc01f4a1ef..937251376c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -143,11 +143,9 @@ endif() if(USE_MKL_BLAS) SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) - SET(INTEL_LIBS_DIR ${INTEL_ROOT}/lib/intel64_lin) - find_package(MKL) + find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) - include_directories(${MKL_INCLUDE_DIRS}) else() message(WARNING "MKL not found") endif() diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e8c109df6c..9d1b18b682 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,11 +1,3 @@ -set(BLAS_INC_DIR ${OPENBLAS_ROOT}/include/openblas) -set(BLAS_LIB_DIR ${OPENBLAS_ROOT}/lib64) -# blas library -include_directories(${BLAS_INC_DIR}) -link_directories(${BLAS_LIB_DIR}) - -set(BLAS_LIB "-lopenblas") - set(sources src/GNNGraph.cpp src/GNNLayer.cpp @@ -17,15 +9,19 @@ set(sources src/GluonGradientInterface.cpp ) +set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) +set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") + add_library(galois_gnn STATIC ${sources}) +target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) -target_link_libraries(galois_gnn ${MPI_CXX_LIBRARIES}) -target_link_libraries(galois_gnn ${BLAS_LIB} ${BOOST_LIBRARIES}) +target_link_libraries(galois_gnn ${INTEL_LIBS}) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include + ${MKL_INCLUDE_DIRS} ) -set_target_properties(galois_gnn PROPERTIES EXPORT_NAME gluon) +set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) add_subdirectory(test) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 2cf913d5de..488b538d75 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -1,7 +1,7 @@ #pragma once #include "galois/GNNTypes.h" -#include +#include namespace galois { diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 0087c7340b..5e9fb8d050 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -68,9 +68,18 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) : -prev_output[j] * prev_output[i]; } - // TODO is sdot using threads? if so this is a nested parallelism problem - output[i] = - cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); + GNNFloat sdot_result = 0; + // TODO use vector instructions? would need another loop to add everything + // together + a temp vector to store results so probably about the same? 
+ for (size_t k = 0; k < vector_length; k++) { + sdot_result += prev_output_derivative[k] * temp_vector[k]; + } + output[i] = sdot_result; + + // TODO this is currently disabled because of a nested parallelism problem + // (cblas may use more threads) + // output[i] = + // cblas_sdot(vector_length, prev_output_derivative, 1, temp_vector, 1); } } @@ -113,6 +122,7 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; // do the MM + // TODO roll our own sgemm rather than use 3rd party? cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, output_columns); From 0ed623fba833fc6a026837f877e2cf4592e1a674 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Oct 2020 19:50:59 -0500 Subject: [PATCH 373/660] Removed structure initializer lists from libgnn Some compilers don't like structure init lists and will fail to compile. This commit makes the code way less compact and manually declares structures to get around this annoying limitation. --- libgnn/include/galois/GraphNeuralNetwork.h | 6 +---- libgnn/test/adam-test.cpp | 8 +++++-- libgnn/test/aggregate-sync-test.cpp | 19 ++++++++------- libgnn/test/convlayer-test.cpp | 28 ++++++++++++---------- libgnn/test/epoch-test.cpp | 12 ++++++---- libgnn/test/gnnfb-test.cpp | 6 +++-- libgnn/test/softmaxlayer-test.cpp | 8 +++---- libgnn/test/weight-sync-test.cpp | 14 ++++++----- 8 files changed, 56 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 80f7b07916..1762cda8da 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -30,11 +30,7 @@ class GraphNeuralNetworkConfig { const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, - GNNConfig{.do_dropout = true, - .dropout_rate = 0.3, - .do_activation = true, - .do_normalization = true}) {} + output_layer_type, GNNConfig()) {} //! 
Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp index e01368ce87..dfdfcdad00 100644 --- a/libgnn/test/adam-test.cpp +++ b/libgnn/test/adam-test.cpp @@ -14,8 +14,12 @@ int main() { num_threads); // create sample config that is easy to trace - galois::AdamOptimizer::AdamConfiguration config = { - .alpha = 1, .beta1 = 0.5, .beta2 = 0.5, .epsilon = 0}; + galois::AdamOptimizer::AdamConfiguration config; + config.alpha = 1; + config.beta1 = 0.5; + config.beta2 = 0.5; + config.epsilon = 0; + std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 6b67c65bfe..ea184e3e2a 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -25,15 +25,17 @@ int main() { // create same layer from convlayer-test and make sure result is the same even // in multi-host environment - galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), - .input_columns = 3, - .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNConfig l_config; + l_config.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, *(test_graph.get()), + dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -115,9 +117,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique( - 1, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(1, *(test_graph.get()), + dimension_0, l_config); layer_1->InitAllWeightsTo1(); const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index ffe3bb6513..1d89cf198a 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -45,14 +45,18 @@ int main() { GALOIS_LOG_ASSERT(6.0 == feats[20]); ////////////////////////////////////////////////////////////////////////////// - galois::GNNLayerDimensions dimension_0{ - .input_rows = 7, .input_columns = 3, .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, test_graph, dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, test_graph, + dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const std::vector& layer_0_forward_output = @@ -134,9 +138,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique( - 1, test_graph, dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(1, test_graph, + dimension_0, dcon); 
layer_1->InitAllWeightsTo1(); const std::vector& layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -201,10 +204,11 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config = {.do_dropout = true, - .do_activation = true, - .do_normalization = true, - .allow_aggregate_after_update = false}; + galois::GNNConfig config; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.allow_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index c37a7d2e34..20c987be60 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,19 +14,21 @@ int main() { // load graph auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kOEC, true); + "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNConfig layer_config; + layer_config.do_dropout = true; + layer_config.do_activation = false; + layer_config.do_normalization = true; // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.do_dropout = true, - .do_activation = false, - .do_normalization = true}); + layer_config); std::vector adam_sizes = {16 * test_graph->node_feature_length(), 16 * test_graph->GetNumLabelClasses()}; @@ -41,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 20; epoch++) { const std::vector* predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 50c40ff2c1..692cbfd30c 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -23,11 +23,13 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // note GNNConfig is passed in; use a config that does not do anything extra // like dropout or activation and the like so that input is easier to verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.allow_aggregate_after_update = false}); + dcon); // input is 7 x 3, layers are then 3 x 4 and 4 x 7 and 7 x 7 // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; @@ -171,7 +173,7 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig{.allow_aggregate_after_update = false}); + dcon); auto adam2 = std::make_unique(adam_sizes, 2); auto gnn2 = std::make_unique( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); diff --git 
a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index bd3cd8c5e3..5d9fa87728 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -17,10 +17,10 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kOEC, true); // input/output columns must be same in softmax - galois::GNNLayerDimensions dimension_0{ - .input_rows = 7, - .input_columns = test_graph.GetNumLabelClasses(), - .output_columns = test_graph.GetNumLabelClasses()}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 33c08df29b..37314fb59a 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -15,15 +15,17 @@ int main() { // create same layer from convlayer-test and make sure result is the same even // in multi-host environment - galois::GNNLayerDimensions dimension_0{.input_rows = test_graph->size(), - .input_columns = 3, - .output_columns = 2}; + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNConfig dcon; + dcon.allow_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique( - 0, *(test_graph.get()), dimension_0, - galois::GNNConfig{.allow_aggregate_after_update = false}); + std::make_unique(0, *(test_graph.get()), + dimension_0, dcon); layer_0->InitAllWeightsTo1(); // backward pass checking; check the gradients out From 866f25c0517936fdad94cc198a04934db61cc5a4 Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 10:59:30 -0500 Subject: [PATCH 374/660] example --- libgnn/src/SoftmaxLayer.cu | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 libgnn/src/SoftmaxLayer.cu diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/SoftmaxLayer.cu new file mode 100644 index 0000000000..4a9bce5b26 --- /dev/null +++ b/libgnn/src/SoftmaxLayer.cu @@ -0,0 +1,24 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" // Please add GPU functions +#include "galois/layers/SoftmaxLayer.h" + +// Allocate memory and initialize +void galois::SoftmaxLayer::Init() { +} + +// Input: in_tensor +// Output: out_tensor +void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, + galois::GNNFloat* out_tensor) { +} + +// Input: in_tensor +// Input: out_tensor +// Input: out_gradients +// Output: in_gradients +void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, + const galois::GNNFloat* out_tensor, + galois::GNNFloat* in_gradients, + galois::GNNFloat* out_gradients) { +} + From fdb4aa19d3ffa6a5937d9d8193041a570ca36c7f Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 11:06:40 -0500 Subject: [PATCH 375/660] ass Init --- libgnn/include/galois/layers/SoftmaxLayer.h | 3 +++ libgnn/src/SoftmaxLayer.cpp | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3b5ace94c8..e410337964 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -21,6 +21,7 @@ class SoftmaxLayer : public GNNLayer { GALOIS_LOG_ASSERT(dimensions.input_columns == 
dimensions.output_columns); // output needs to match number of possible classes GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); + Init(); } //! Creates probability distribution of each row of input @@ -48,6 +49,8 @@ class SoftmaxLayer : public GNNLayer { //! derivative calculation; each is the size of a feature vector galois::substrate::PerThreadStorage> softmax_temp_vectors_; + + void Init(); }; } // namespace galois diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/SoftmaxLayer.cpp index 30dc476965..1262555a36 100644 --- a/libgnn/src/SoftmaxLayer.cpp +++ b/libgnn/src/SoftmaxLayer.cpp @@ -2,6 +2,10 @@ #include "galois/GNNMath.h" #include "galois/layers/SoftmaxLayer.h" +// Allocate memory and initialize +void galois::SoftmaxLayer::Init() { +} + const std::vector& galois::SoftmaxLayer::ForwardPhase( const std::vector& input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); From d013d3593db41dd784b108924f9f078e699ab36a Mon Sep 17 00:00:00 2001 From: chenxuhao Date: Sat, 17 Oct 2020 11:40:55 -0500 Subject: [PATCH 376/660] add comments --- libgnn/src/SoftmaxLayer.cu | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/SoftmaxLayer.cu index 4a9bce5b26..d9ed5fc0ff 100644 --- a/libgnn/src/SoftmaxLayer.cu +++ b/libgnn/src/SoftmaxLayer.cu @@ -16,6 +16,10 @@ void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, // Input: out_tensor // Input: out_gradients // Output: in_gradients +// Note: although out_gradients is an input data, +// it is not const because it can be reused +// to hold intermediate data inside this function, +// to avoid allocating more memory void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, const galois::GNNFloat* out_tensor, galois::GNNFloat* in_gradients, From e4fa27b7640f6f196b0552f92b028e76972a8c1e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 17 Oct 2020 11:05:44 -0500 Subject: [PATCH 377/660] libgnn src directory follows include dir structure Adds subdirectories to the src directory in libgnn to follow the same structure as the include directory. 
--- libgnn/CMakeLists.txt | 12 ++++++------ libgnn/src/{ => graphs}/GNNGraph.cpp | 0 libgnn/src/{ => layers}/GNNLayer.cpp | 0 libgnn/src/{ => layers}/GluonGradientInterface.cpp | 0 libgnn/src/{ => layers}/GraphConvolutionalLayer.cpp | 0 libgnn/src/{ => layers}/SoftmaxLayer.cpp | 0 libgnn/src/{ => layers}/SoftmaxLayer.cu | 0 7 files changed, 6 insertions(+), 6 deletions(-) rename libgnn/src/{ => graphs}/GNNGraph.cpp (100%) rename libgnn/src/{ => layers}/GNNLayer.cpp (100%) rename libgnn/src/{ => layers}/GluonGradientInterface.cpp (100%) rename libgnn/src/{ => layers}/GraphConvolutionalLayer.cpp (100%) rename libgnn/src/{ => layers}/SoftmaxLayer.cpp (100%) rename libgnn/src/{ => layers}/SoftmaxLayer.cu (100%) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 9d1b18b682..d635781ba6 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,12 +1,12 @@ set(sources - src/GNNGraph.cpp - src/GNNLayer.cpp src/GNNMath.cpp - src/GraphConvolutionalLayer.cpp - src/SoftmaxLayer.cpp - src/GraphNeuralNetwork.cpp src/GNNOptimizers.cpp - src/GluonGradientInterface.cpp + src/GraphNeuralNetwork.cpp + src/graphs/GNNGraph.cpp + src/layers/GNNLayer.cpp + src/layers/GluonGradientInterface.cpp + src/layers/GraphConvolutionalLayer.cpp + src/layers/SoftmaxLayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/src/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp similarity index 100% rename from libgnn/src/GNNGraph.cpp rename to libgnn/src/graphs/GNNGraph.cpp diff --git a/libgnn/src/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp similarity index 100% rename from libgnn/src/GNNLayer.cpp rename to libgnn/src/layers/GNNLayer.cpp diff --git a/libgnn/src/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp similarity index 100% rename from libgnn/src/GluonGradientInterface.cpp rename to libgnn/src/layers/GluonGradientInterface.cpp diff --git a/libgnn/src/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp similarity index 100% rename from libgnn/src/GraphConvolutionalLayer.cpp rename to libgnn/src/layers/GraphConvolutionalLayer.cpp diff --git a/libgnn/src/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp similarity index 100% rename from libgnn/src/SoftmaxLayer.cpp rename to libgnn/src/layers/SoftmaxLayer.cpp diff --git a/libgnn/src/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu similarity index 100% rename from libgnn/src/SoftmaxLayer.cu rename to libgnn/src/layers/SoftmaxLayer.cu From 67eb0a29400d5d6dd0809788306ca7d12efd2056 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 17 Oct 2020 11:50:42 -0500 Subject: [PATCH 378/660] GNNConfig -> GNNLayerConfig GNNConfig is too general a name when it was only used for layers: changed the name accorindingly. 
--- libgnn/include/galois/GraphNeuralNetwork.h | 8 ++++---- libgnn/include/galois/layers/GNNLayer.h | 8 ++++---- libgnn/include/galois/layers/GraphConvolutionalLayer.h | 5 +++-- libgnn/include/galois/layers/SoftmaxLayer.h | 2 +- libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 +- libgnn/test/accuracy-test.cpp | 2 +- libgnn/test/aggregate-sync-test.cpp | 2 +- libgnn/test/convlayer-test.cpp | 4 ++-- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gnnfb-test.cpp | 7 ++++--- libgnn/test/weight-sync-test.cpp | 2 +- 12 files changed, 24 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 1762cda8da..919c11046a 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -30,14 +30,14 @@ class GraphNeuralNetworkConfig { const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, GNNConfig()) {} + output_layer_type, GNNLayerConfig()) {} //! Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type, - const GNNConfig& default_layer_config) + const GNNLayerConfig& default_layer_config) : num_intermediate_layers_(num_layers), layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), output_layer_type_(output_layer_type), @@ -71,7 +71,7 @@ class GraphNeuralNetworkConfig { return layer_column_sizes_[num_intermediate_layers_]; } //! Get the default layer config of layers in this GNN - const GNNConfig& default_layer_config() { return default_layer_config_; } + const GNNLayerConfig& default_layer_config() { return default_layer_config_; } private: //! Number of layers to construct in the GNN not including the output @@ -86,7 +86,7 @@ class GraphNeuralNetworkConfig { //! Output layer type GNNOutputLayerType output_layer_type_; //! Default config to use for layers - GNNConfig default_layer_config_; + GNNLayerConfig default_layer_config_; }; //////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2232e82b5c..2473de7229 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -33,7 +33,7 @@ struct GNNLayerDimensions { }; //! Config options for operations that can occur in a layer -struct GNNConfig { +struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; //! True if dropout is to be done at beginning of forward phase @@ -61,12 +61,12 @@ class GNNLayer { //! the input/output dimensions of the MxM that occurs in the layer; config //! as well GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, const GNNConfig& config); + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); //! Uses a default config GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, GNNConfig()) {} + : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase @@ -137,7 +137,7 @@ class GNNLayer { //! 
Dimensions (input/output sizes) of this layer GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer - GNNConfig config_; + GNNLayerConfig config_; //! Weights used by this layer. Dimensions: input columns by output columns std::vector layer_weights_; //! Gradients used to update the weights of this layer diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 6a99682b8a..123a8d774a 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -11,12 +11,13 @@ class GraphConvolutionalLayer : public GNNLayer { GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, - const GNNConfig& config); + const GNNLayerConfig& config); GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) - : GraphConvolutionalLayer(layer_num, graph, dimensions, GNNConfig()) {} + : GraphConvolutionalLayer(layer_num, graph, dimensions, + GNNLayerConfig()) {} // Parent functions const std::vector& diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index e410337964..815f2401ff 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -11,7 +11,7 @@ class SoftmaxLayer : public GNNLayer { SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, - GNNConfig{.allocate_weights = false}), + GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 838bf45905..dc81a9ca2b 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -5,7 +5,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, - const GNNConfig& config) + const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d02a2bf0ca..57a5d9505b 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -4,7 +4,7 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, const GNNConfig& config) + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : GNNLayer(layer_num, graph, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp index 61d449255f..e1fc17702e 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -21,7 +21,7 @@ int main() { std::vector layer_output_sizes = {7, 7}; galois::GraphNeuralNetworkConfig gnn_config( 1, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, - galois::GNNConfig()); + galois::GNNLayerConfig()); std::vector adam_sizes = {21}; auto adam = std::make_unique(adam_sizes, 1); diff --git 
a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index ea184e3e2a..432a546448 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -29,7 +29,7 @@ int main() { dimension_0.input_rows = test_graph->size(); dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig l_config; + galois::GNNLayerConfig l_config; l_config.allow_aggregate_after_update = false; // create the layer, no norm factor diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 1d89cf198a..00825cf6f8 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -50,7 +50,7 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; // create the layer, no norm factor @@ -204,7 +204,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - galois::GNNConfig config; + galois::GNNLayerConfig config; config.do_dropout = true; config.do_activation = true; config.do_normalization = true; diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 20c987be60..21d5249fd1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -21,7 +21,7 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; - galois::GNNConfig layer_config; + galois::GNNLayerConfig layer_config; layer_config.do_dropout = true; layer_config.do_activation = false; layer_config.do_normalization = true; diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 692cbfd30c..d43e1b0e2e 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -23,10 +23,11 @@ int main() { galois::GNNLayerType::kGraphConvolutional}; // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; - // note GNNConfig is passed in; use a config that does not do anything extra - // like dropout or activation and the like so that input is easier to verify + // note GNNLayerConfig is passed in; use a config that does not do anything + // extra like dropout or activation and the like so that input is easier to + // verify galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, dcon); diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 37314fb59a..561aa95370 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -19,7 +19,7 @@ int main() { dimension_0.input_rows = test_graph->size(); dimension_0.input_columns = 3; dimension_0.output_columns = 2; - galois::GNNConfig dcon; + galois::GNNLayerConfig dcon; dcon.allow_aggregate_after_update = false; // create the layer, no norm factor From 434637650e1ef0cd7f605f5f6fee926752b142f1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 13:09:06 -0500 Subject: [PATCH 379/660] Command line options for GNN apps (libgnnbench) Added a new static library in the lonestar directory called libgnnbench that will be the backend for the (distributed) gnn benchmarks added in the future. At the moment it contains command line declarations for user configurable things in the GNN. 
Also changed the variable name of the variable containing the default path for gnn inputs (preparation for letting it get passed in as a user defined parameter). --- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/src/graphs/GNNGraph.cpp | 17 +++--- lonestar/CMakeLists.txt | 1 + lonestar/libgnnbench/CMakeLists.txt | 5 ++ lonestar/libgnnbench/include/GNNBench/Input.h | 23 ++++++++ lonestar/libgnnbench/src/Input.cpp | 59 +++++++++++++++++++ 6 files changed, 99 insertions(+), 8 deletions(-) create mode 100644 lonestar/libgnnbench/CMakeLists.txt create mode 100644 lonestar/libgnnbench/include/GNNBench/Input.h create mode 100644 lonestar/libgnnbench/src/Input.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index a0b1430add..2a7e20b445 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -9,7 +9,7 @@ namespace galois { // TODO remove the need to hardcode this path //! Path to location of all gnn files -static const std::string gnn_dataset_path = +static const std::string default_gnn_dataset_path = "/net/ohm/export/iss/inputs/Learning/"; //! Helper struct to maintain start/end/size of any particular range. Mostly diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a327dfe641..fe57784b30 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -9,7 +9,8 @@ std::unique_ptr LoadPartition(const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path - std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = + galois::default_gnn_dataset_path + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition @@ -118,7 +119,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); std::string filename = - galois::gnn_dataset_path + dataset_name + "-labels.txt"; + galois::default_gnn_dataset_path + dataset_name + "-labels.txt"; // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); @@ -190,7 +191,8 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in dimensions of features, specifically node feature length size_t num_global_vertices; - std::string file_dims = galois::gnn_dataset_path + dataset_name + "-dims.txt"; + std::string file_dims = + galois::default_gnn_dataset_path + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); ifs >> num_global_vertices >> node_feature_length_; @@ -209,7 +211,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in all features std::ifstream file_stream; std::string feature_file = - galois::gnn_dataset_path + dataset_name + "-feats.bin"; + galois::default_gnn_dataset_path + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * num_global_vertices * @@ -245,8 +247,8 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( size_t range_end; // read mask range - std::string mask_filename = - galois::gnn_dataset_path + dataset_name + "-" + mask_type + "_mask.txt"; + std::string mask_filename = galois::default_gnn_dataset_path + dataset_name + + "-" + mask_type + "_mask.txt"; std::ifstream mask_stream; 
mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -349,7 +351,8 @@ void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { } void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = galois::gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = + galois::default_gnn_dataset_path + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, input_file); galois::graphs::readGraph(whole_graph_, input_file); diff --git a/lonestar/CMakeLists.txt b/lonestar/CMakeLists.txt index e00c61eb89..a0efe7bae7 100644 --- a/lonestar/CMakeLists.txt +++ b/lonestar/CMakeLists.txt @@ -226,4 +226,5 @@ add_subdirectory(scientific) if(USE_DEEPGALOIS) add_subdirectory(gnn) + add_subdirectory(libgnnbench) endif(USE_DEEPGALOIS) diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt new file mode 100644 index 0000000000..31d174d581 --- /dev/null +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -0,0 +1,5 @@ +add_library(gnnbench STATIC src/Input.cpp) +target_include_directories(gnnbench PUBLIC + "${CMAKE_CURRENT_SOURCE_DIR}/include" +) +target_link_libraries(gnnbench galois_gnn LLVMSupport) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h new file mode 100644 index 0000000000..e9885026df --- /dev/null +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -0,0 +1,23 @@ +#pragma once + +#include "galois/graphs/GNNGraph.h" +#include + +//! Directory where all files used for GNN training are found +extern llvm::cl::opt input_directory; +//! Base graph name (used to find the csgr, features, masks, etc.) +extern llvm::cl::opt input_file; +//! Scheme used to partition the graph +extern llvm::cl::opt partition_scheme; +// Control layer count and size +extern llvm::cl::opt num_layers; +extern llvm::cl::list layer_sizes; +// Control dropout +extern llvm::cl::opt do_dropout; +extern llvm::cl::opt dropout_rate; +// Control activation +extern llvm::cl::opt do_activation; +// TODO activation layer type once more are supported +//! 
Controls weight normalization based on degree +extern llvm::cl::opt do_normalization; +// TODO output layer type diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp new file mode 100644 index 0000000000..fe167e24b6 --- /dev/null +++ b/lonestar/libgnnbench/src/Input.cpp @@ -0,0 +1,59 @@ +#include "GNNBench/Input.h" + +namespace cll = llvm::cl; + +// Self documented via the desc argument + +llvm::cl::opt input_directory( + "inputDirectory", + cll::desc("Base directory to find all files required for doing GNN " + "training (features, graph topology, masks, etc.)"), + cll::init(galois::default_gnn_dataset_path)); + +llvm::cl::opt input_file( + cll::Positional, + cll::desc("Base name of graph: used to find csgr, features, etc."), + cll::Required); + +llvm::cl::opt partition_scheme( + "partition", cll::desc("Type of partitioning."), + cll::values(clEnumValN(galois::graphs::GNNPartitionScheme::kOEC, "oec", + "Outgoing Edge-Cut (default)"), + clEnumValN(galois::graphs::GNNPartitionScheme::kCVC, "cvc", + "Cartesian Vertex-Cut")), + cll::init(galois::graphs::GNNPartitionScheme::kOEC)); + +llvm::cl::opt num_layers( + "numLayers", + cll::desc( + "Number of intermediate layers in the neural network (default 2))"), + cll::init(2)); + +llvm::cl::list layer_sizes( + "layerSizes", + cll::desc( + "Comma separated list of numbers specifying intermediate layer sizes"), + cll::CommaSeparated); + +llvm::cl::opt do_dropout( + "doDropout", + cll::desc("If true (on by default), does dropout of input during training"), + cll::init(true)); + +llvm::cl::opt dropout_rate( + "dropoutRate", + cll::desc("Specifies probability that any one weight is DROPPED (e.g., if " + "0.1, then 10 percent chance of dropping) (default 0.5)"), + cll::init(0.5)); + +llvm::cl::opt + do_activation("doActivation", + cll::desc("If true (off by default), does activation at the " + "end of an intermediate layer"), + cll::init(false)); + +llvm::cl::opt + do_normalization("doNormalization", + cll::desc("If true (on by default), normalizes vertex " + "features based on their degree"), + cll::init(true)); From e891594d6a1f0ca8adeb9614620ba9e2a07c6658 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 14:20:03 -0500 Subject: [PATCH 380/660] Disabled gDebug huge page message in PageAlloc Makes debug mode prints way more than necessary --- libgalois/src/PageAlloc.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/libgalois/src/PageAlloc.cpp b/libgalois/src/PageAlloc.cpp index e051a6431d..a45e72e93d 100644 --- a/libgalois/src/PageAlloc.cpp +++ b/libgalois/src/PageAlloc.cpp @@ -60,7 +60,6 @@ void* galois::substrate::allocPages(unsigned num, bool preFault) { void* ptr = trymmap(num * hugePageSize, preFault ? _MAP_HUGE_POP : _MAP_HUGE); if (!ptr) { - gDebug("Huge page alloc failed, falling back"); ptr = trymmap(num * hugePageSize, preFault ? _MAP_POP : _MAP); } From cb97c91519192213977100badba42add2fe11b42 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 14:33:34 -0500 Subject: [PATCH 381/660] Start code for GNNBench, new dist gcn app Adds the commandline parsing/stats setup code to GNNBench (based on DistBench). Adds a new gcn-dist app under lonestart/gnn/distributed that will be the main executable for gcn from now on. Successfully compiles using the GNNBench code. Next step is the GNN construction code in the lonestar end of the code. 
--- lonestar/gnn/CMakeLists.txt | 3 +- lonestar/gnn/distributed/CMakeLists.txt | 1 + lonestar/gnn/distributed/gcn/CMakeLists.txt | 3 + lonestar/gnn/distributed/gcn/gcn-dist.cpp | 10 +++ lonestar/libgnnbench/CMakeLists.txt | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 9 ++- lonestar/libgnnbench/include/GNNBench/Start.h | 22 ++++++ lonestar/libgnnbench/src/Input.cpp | 14 +++- lonestar/libgnnbench/src/Start.cpp | 77 +++++++++++++++++++ 9 files changed, 137 insertions(+), 4 deletions(-) create mode 100644 lonestar/gnn/distributed/CMakeLists.txt create mode 100644 lonestar/gnn/distributed/gcn/CMakeLists.txt create mode 100644 lonestar/gnn/distributed/gcn/gcn-dist.cpp create mode 100644 lonestar/libgnnbench/include/GNNBench/Start.h create mode 100644 lonestar/libgnnbench/src/Start.cpp diff --git a/lonestar/gnn/CMakeLists.txt b/lonestar/gnn/CMakeLists.txt index f718db4942..d07810f48e 100644 --- a/lonestar/gnn/CMakeLists.txt +++ b/lonestar/gnn/CMakeLists.txt @@ -25,4 +25,5 @@ endif() add_subdirectory(gcn) #add_subdirectory(sage) -add_subdirectory(gat) +#add_subdirectory(gat) +add_subdirectory(distributed) diff --git a/lonestar/gnn/distributed/CMakeLists.txt b/lonestar/gnn/distributed/CMakeLists.txt new file mode 100644 index 0000000000..7863ee29db --- /dev/null +++ b/lonestar/gnn/distributed/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(gcn) diff --git a/lonestar/gnn/distributed/gcn/CMakeLists.txt b/lonestar/gnn/distributed/gcn/CMakeLists.txt new file mode 100644 index 0000000000..c8c9d10447 --- /dev/null +++ b/lonestar/gnn/distributed/gcn/CMakeLists.txt @@ -0,0 +1,3 @@ +# link libgnn library and all should go well +add_executable(gcn-dist gcn-dist.cpp) +target_link_libraries(gcn-dist galois_gnn gnnbench) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp new file mode 100644 index 0000000000..031ae06d13 --- /dev/null +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -0,0 +1,10 @@ +#include "GNNBench/Start.h" +#include "galois/GraphNeuralNetwork.h" + +constexpr static const char* const name = "Graph Convolutional Network"; + +int main(int argc, char* argv[]) { + galois::DistMemSys G; + GNNBenchStart(argc, argv, name); + return 0; +} diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt index 31d174d581..14d152c8e7 100644 --- a/lonestar/libgnnbench/CMakeLists.txt +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -1,4 +1,4 @@ -add_library(gnnbench STATIC src/Input.cpp) +add_library(gnnbench STATIC src/Input.cpp src/Start.cpp) target_include_directories(gnnbench PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" ) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index e9885026df..1bb2afdf70 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -1,12 +1,14 @@ #pragma once +#include "galois/GraphNeuralNetwork.h" +#include "galois/Logging.h" #include "galois/graphs/GNNGraph.h" #include //! Directory where all files used for GNN training are found extern llvm::cl::opt input_directory; //! Base graph name (used to find the csgr, features, masks, etc.) -extern llvm::cl::opt input_file; +extern llvm::cl::opt input_name; //! Scheme used to partition the graph extern llvm::cl::opt partition_scheme; // Control layer count and size @@ -21,3 +23,8 @@ extern llvm::cl::opt do_activation; //! 
Controls weight normalization based on degree extern llvm::cl::opt do_normalization; // TODO output layer type + +const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); + +//! Using command line args above, create a GNN. +// XXX diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h new file mode 100644 index 0000000000..93fc3ee0b1 --- /dev/null +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -0,0 +1,22 @@ +#pragma once + +#include "galois/Galois.h" +#include "galois/Version.h" +#include "GNNBench/Input.h" + +//////////////////////////////////////////////////////////////////////////////// +// CLI +//////////////////////////////////////////////////////////////////////////////// + +extern llvm::cl::opt num_threads; +extern llvm::cl::opt num_runs; +extern llvm::cl::opt stat_file; + +//////////////////////////////////////////////////////////////////////////////// +// Init functions +//////////////////////////////////////////////////////////////////////////////// + +//! Parses command line + setup some stats +void GNNBenchStart(int argc, char** argv, const char* app); +void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, + const char* url); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index fe167e24b6..0965234b51 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -10,7 +10,7 @@ llvm::cl::opt input_directory( "training (features, graph topology, masks, etc.)"), cll::init(galois::default_gnn_dataset_path)); -llvm::cl::opt input_file( +llvm::cl::opt input_name( cll::Positional, cll::desc("Base name of graph: used to find csgr, features, etc."), cll::Required); @@ -57,3 +57,15 @@ llvm::cl::opt cll::desc("If true (on by default), normalizes vertex " "features based on their degree"), cll::init(true)); + +const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { + switch (s) { + case galois::graphs::GNNPartitionScheme::kOEC: + return "oec"; + case galois::graphs::GNNPartitionScheme::kCVC: + return "cvc"; + default: + GALOIS_LOG_FATAL("Invalid partitioning scheme"); + return ""; + } +} diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp new file mode 100644 index 0000000000..6276b373ee --- /dev/null +++ b/lonestar/libgnnbench/src/Start.cpp @@ -0,0 +1,77 @@ +#include "GNNBench/Start.h" + +namespace cll = llvm::cl; + +cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), + cll::init(1)); +cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), + cll::init(1)); +cll::opt + stat_file("statFile", cll::desc("Optional output file to print stats to")); + +//////////////////////////////////////////////////////////////////////////////// + +static void PrintVersion(llvm::raw_ostream& out) { + out << "D-Galois Benchmark Suite v" << galois::getVersion() << " (" + << galois::getRevision() << ")\n"; + out.flush(); +} + +//////////////////////////////////////////////////////////////////////////////// + +void GNNBenchStart(int argc, char** argv, const char* app) { + GNNBenchStart(argc, argv, app, nullptr, nullptr); +} + +void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, + const char* url) { + llvm::cl::SetVersionPrinter(PrintVersion); + llvm::cl::ParseCommandLineOptions(argc, argv); + num_threads = galois::setActiveThreads(num_threads); + galois::runtime::setStatFile(stat_file); + + auto& net = galois::runtime::getSystemNetworkInterface(); 
+ + if (net.ID == 0) { + PrintVersion(llvm::outs()); + llvm::outs() << "Copyright (C) " << galois::getCopyrightYear() + << " The University of Texas at Austin\n"; + llvm::outs() << "http://iss.ices.utexas.edu/galois/\n\n"; + llvm::outs() << "application: " << (app ? app : "unspecified") << "\n"; + + if (desc) { + llvm::outs() << desc << "\n"; + } + if (url) { + llvm::outs() + << "http://iss.ices.utexas.edu/?p=projects/galois/benchmarks/" << url + << "\n"; + } + llvm::outs() << "\n"; + llvm::outs().flush(); + + std::ostringstream cmdout; + + for (int i = 0; i < argc; ++i) { + cmdout << argv[i]; + if (i != argc - 1) + cmdout << " "; + } + + galois::runtime::reportParam("GNNBench", "CommandLine", cmdout.str()); + galois::runtime::reportParam("GNNBench", "Threads", num_threads); + galois::runtime::reportParam("GNNBench", "Hosts", net.Num); + galois::runtime::reportParam("GNNBench", "Runs", num_runs); + galois::runtime::reportParam("GNNBench", "Run_UUID", + galois::runtime::getRandUUID()); + galois::runtime::reportParam("GNNBench", "InputDirectory", input_directory); + galois::runtime::reportParam("GNNBench", "Input", input_name); + galois::runtime::reportParam("GNNBench", "PartitionScheme", + GNNPartitionToString(partition_scheme)); + // XXX report the rest of the command line options + } + + char name[256]; + gethostname(name, 256); + galois::runtime::reportParam("GNNBench", "Hostname", name); +} From 0f6cc119a5eab2ff57a1370db8e4fb2608090972 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:04:09 -0500 Subject: [PATCH 382/660] GNNGraph takes an input directory argument Adds an input directory argument to the constructor of a GNNGraph in order to allow a caller to use an input directory that differs from the default hard-coded directory. Done in prep for the new GCN app being able to specify whatever input directory it wants. --- libgnn/include/galois/graphs/GNNGraph.h | 6 ++++- libgnn/src/graphs/GNNGraph.cpp | 34 +++++++++++++++---------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2a7e20b445..8ce85092ac 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -35,9 +35,11 @@ class GNNGraph { using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; - //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) + GNNGraph(const std::string& input_directory, const std::string& dataset_name, + GNNPartitionScheme partition_scheme, bool has_single_class_label); //! Returns host id size_t host_id() const { return host_id_; } @@ -118,6 +120,8 @@ class GNNGraph { const size_t matrix_column_size) const; private: + //! Directory for input data + const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph //! is currently running on unsigned host_id_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fe57784b30..b77e5df0a8 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -6,11 +6,11 @@ namespace { //! 
Partitions a particular dataset given some partitioning scheme std::unique_ptr -LoadPartition(const std::string& dataset_name, +LoadPartition(const std::string& input_directory, + const std::string& dataset_name, galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path - std::string input_file = - galois::default_gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = input_directory + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); // load partition @@ -39,6 +39,15 @@ size_t gnn_matrix_to_sync_column_length_ = 0; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label) { + GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, + has_single_class_label); +} + +galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, + const std::string& dataset_name, + GNNPartitionScheme partition_scheme, + bool has_single_class_label) + : input_directory_(input_directory) { GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, dataset_name); // save host id @@ -48,7 +57,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, std::to_string(galois::runtime::getSystemNetworkInterface().ID) + std::string("] "); // load partition - partitioned_graph_ = LoadPartition(dataset_name, partition_scheme); + partitioned_graph_ = + LoadPartition(input_directory_, dataset_name, partition_scheme); // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); @@ -118,8 +128,7 @@ void galois::graphs::GNNGraph::AggregateSync( void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename = - galois::default_gnn_dataset_path + dataset_name + "-labels.txt"; + std::string filename = input_directory_ + dataset_name + "-labels.txt"; // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); @@ -191,8 +200,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in dimensions of features, specifically node feature length size_t num_global_vertices; - std::string file_dims = - galois::default_gnn_dataset_path + dataset_name + "-dims.txt"; + std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; std::ifstream ifs; ifs.open(file_dims, std::ios::in); ifs >> num_global_vertices >> node_feature_length_; @@ -210,8 +218,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( // read in all features std::ifstream file_stream; - std::string feature_file = - galois::default_gnn_dataset_path + dataset_name + "-feats.bin"; + std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; file_stream.open(feature_file, std::ios::binary | std::ios::in); file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * num_global_vertices * @@ -247,8 +254,8 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( size_t range_end; // read mask range - std::string mask_filename = galois::default_gnn_dataset_path + dataset_name + - "-" + mask_type + "_mask.txt"; + std::string mask_filename = + input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; std::ifstream mask_stream; mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -351,8 +358,7 @@ void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { } void 
galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = - galois::default_gnn_dataset_path + dataset_name + ".csgr"; + std::string input_file = input_directory_ + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, input_file); galois::graphs::readGraph(whole_graph_, input_file); From fa8a3c543c3ab237f5c2f733293319a9ac548932 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:05:57 -0500 Subject: [PATCH 383/660] Adds GNN user-side construction func to GNNBench GNNBench now provides a GNN construction function that uses the user specified command line options to create a GCN for use in an app. --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 18 ++- lonestar/libgnnbench/include/GNNBench/Input.h | 11 +- lonestar/libgnnbench/src/Input.cpp | 126 +++++++++++++++++- 3 files changed, 146 insertions(+), 9 deletions(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index 031ae06d13..b2c1888c7a 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -1,10 +1,26 @@ #include "GNNBench/Start.h" -#include "galois/GraphNeuralNetwork.h" constexpr static const char* const name = "Graph Convolutional Network"; int main(int argc, char* argv[]) { galois::DistMemSys G; GNNBenchStart(argc, argv, name); + + galois::StatTimer init_timer("InitializationTime"); + init_timer.start(); + std::unique_ptr gnn = + InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional); + gnn->SetLayerPhases(galois::GNNPhase::kTrain); + init_timer.stop(); + + galois::StatTimer compute_timer("Timer_0"); + compute_timer.start(); + + galois::StatTimer train_timer("TrainingTime"); + train_timer.start(); + + train_timer.stop(); + compute_timer.stop(); + return 0; } diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 1bb2afdf70..fc5059bb0c 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -1,7 +1,6 @@ #pragma once #include "galois/GraphNeuralNetwork.h" -#include "galois/Logging.h" #include "galois/graphs/GNNGraph.h" #include @@ -23,8 +22,14 @@ extern llvm::cl::opt do_activation; //! Controls weight normalization based on degree extern llvm::cl::opt do_normalization; // TODO output layer type +// TODO optimizer type +//! Toggles an optimization that flips aggregate/update step if it would be +//! beneficial +extern llvm::cl::opt agg_after_update; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); -//! Using command line args above, create a GNN. -// XXX +//! Using command line args above, create a GNN using some specified layer type +//! as the intermediate layer. 
+std::unique_ptr +InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 0965234b51..97ef7a6fc3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -1,3 +1,4 @@ +#include "galois/Logging.h" #include "GNNBench/Input.h" namespace cll = llvm::cl; @@ -29,11 +30,11 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( - "layerSizes", - cll::desc( - "Comma separated list of numbers specifying intermediate layer sizes"), - cll::CommaSeparated); +llvm::cl::list + layer_sizes("layerSizes", + cll::desc("Comma separated list of numbers specifying " + "intermediate layer sizes (does not include output)"), + cll::CommaSeparated); llvm::cl::opt do_dropout( "doDropout", @@ -58,6 +59,12 @@ llvm::cl::opt "features based on their degree"), cll::init(true)); +llvm::cl::opt + agg_after_update("allowAggregationAfterUpdate", + cll::desc("If true (on by default), allows aggregate to " + "be done after update as an optimization"), + cll::init(true)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -69,3 +76,112 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { return ""; } } + +//! Initializes the vector of layer sizes from command line args + graph +std::vector +CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { + // set layer sizes for intermdiate and output layers + std::vector layer_sizes_vector; + if (layer_sizes.size()) { + GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + for (size_t i = 0; i < num_layers; i++) { + layer_sizes_vector.emplace_back(layer_sizes[i]); + } + // verify user satisfies last intermediate layer needing to have same size + // as # label classes + GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + gnn_graph->GetNumLabelClasses()); + } else { + // default 16 for everything until last 2 + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(16); + } + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + } + + // TODO + // for now only softmax layer which dictates the output size of the last + // intermediate layer + size of the output layer + // output layer at the moment required to be same as # label classes + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + + return layer_sizes_vector; +} + +//! Setup layer config struct based on cli args +galois::GNNLayerConfig CreateLayerConfig() { + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = do_dropout; + layer_config.dropout_rate = dropout_rate; + layer_config.do_activation = do_activation; + layer_config.do_normalization = do_normalization; + layer_config.allow_aggregate_after_update = agg_after_update; + return layer_config; +} + +std::unique_ptr +CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { + std::vector opt_sizes; + + // optimizer sizes are based on intermediate layer sizes, input feats, and + // # label classes + if (layer_sizes.size()) { + GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // assumption here is that if it reached this point then layer sizes were + // already sanity checked previously (esp. 
last layer) + for (size_t i = 1; i < num_layers; i++) { + opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + } + } else { + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); + } else { + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(16 * 16); + } + // last + opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + } + } + GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + + // TODO only adam works right now, add the others later + return std::make_unique(opt_sizes, num_layers); +} + +std::unique_ptr +InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { + // partition/load graph + auto gnn_graph = std::make_unique( + input_directory, input_name, partition_scheme, true); + + // create layer types vector + std::vector layer_types; + for (size_t i = 0; i < num_layers; i++) { + layer_types.push_back(layer_type); + } + // sizes + std::vector layer_sizes_vector = + CreateLayerSizesVector(gnn_graph.get()); + // layer config object + galois::GNNLayerConfig layer_config = CreateLayerConfig(); + // GNN config object + // TODO output type should be configurable + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, + galois::GNNOutputLayerType::kSoftmax, layer_config); + // optimizer + std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); + + // create the gnn + return std::make_unique( + std::move(gnn_graph), std::move(opt), std::move(gnn_config)); +} From e81eb03a143f0d8a2555a1d4625b6a9ef68f509d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:47:05 -0500 Subject: [PATCH 384/660] Train function to GNN Added a high level train function for callers to do end to end training for some specified number of epochs for the graph neural network class. --- libgnn/include/galois/GraphNeuralNetwork.h | 5 ++++ libgnn/src/GraphNeuralNetwork.cpp | 29 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 919c11046a..725e3a69d1 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -13,6 +13,7 @@ namespace galois { //////////////////////////////////////////////////////////////////////////////// +// TODO validation and testing intervals //! Configuration object passed into constructor of a GraphNeuralNetwork to //! determine how the network gets constructed. class GraphNeuralNetworkConfig { @@ -131,6 +132,10 @@ class GraphNeuralNetwork { //! Returns the output layer const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + //! Do training for a specified # of epochs and return test accuracy at the + //! end of it + float Train(size_t num_epochs); + //! Propogates the graph's feature vectors through the network to get a new //! vector representation. //! 
Also known as the forward phase in most literature diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 3424c2b3e3..82c1d40c07 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -59,6 +59,35 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } } +float galois::GraphNeuralNetwork::Train(size_t num_epochs) { + const size_t this_host = graph_->host_id(); + // TODO incorporate validation/test intervals + for (size_t epoch = 0; epoch < num_epochs; epoch++) { + const std::vector* predictions = DoInference(); + GradientPropagation(); + float train_accuracy = GetGlobalAccuracy(*predictions); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, + "\n"); + } + // TODO validation and test as necessary + } + + // check test accuracy + galois::StatTimer acc_timer("FinalAccuracyTest"); + acc_timer.start(); + SetLayerPhases(galois::GNNPhase::kTest); + const std::vector* predictions = DoInference(); + float global_accuracy = GetGlobalAccuracy(*predictions); + acc_timer.stop(); + + if (this_host == 0) { + galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + } + + return global_accuracy; +} + const std::vector* galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network const std::vector* layer_input = &(graph_->GetLocalFeatures()); From c7e9c02a47fc6a522a4121bbfca3e6d1a03caa89 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Oct 2020 18:48:08 -0500 Subject: [PATCH 385/660] Epoch CLI to GNNBench + gcn-dist app complete Adds another command line option to GNNBench to specify the number of epochs to train for. The initial gcn app is also now done and will do training end to end. 
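Editor's note: since the gcn-dist main has been built up piecemeal across patches 381, 383, and this one, here is the whole thing in one place for orientation. It is reconstructed from those diffs rather than copied from the source file, so treat it as a close sketch; the diff that follows only adds the Train() call and the new epochs flag.

#include "GNNBench/Start.h"

constexpr static const char* const name = "Graph Convolutional Network";

int main(int argc, char* argv[]) {
  galois::DistMemSys G;
  // parse GNNBench command line flags, set thread count, report run stats
  GNNBenchStart(argc, argv, name);

  galois::StatTimer init_timer("InitializationTime");
  init_timer.start();
  // build the GCN from the libgnnbench options (partition, layer sizes, ...)
  auto gnn =
      InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional);
  gnn->SetLayerPhases(galois::GNNPhase::kTrain);
  init_timer.stop();

  galois::StatTimer compute_timer("Timer_0");
  compute_timer.start();
  galois::StatTimer train_timer("TrainingTime");
  train_timer.start();
  // num_epochs is the new flag declared in GNNBench/Start.h below
  gnn->Train(num_epochs);
  train_timer.stop();
  compute_timer.stop();

  return 0;
}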
--- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Start.h | 1 + lonestar/libgnnbench/src/Start.cpp | 4 ++++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index b2c1888c7a..a7eb0a4bae 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -18,7 +18,7 @@ int main(int argc, char* argv[]) { galois::StatTimer train_timer("TrainingTime"); train_timer.start(); - + gnn->Train(num_epochs); train_timer.stop(); compute_timer.stop(); diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 93fc3ee0b1..c17ddecadc 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -10,6 +10,7 @@ extern llvm::cl::opt num_threads; extern llvm::cl::opt num_runs; +extern llvm::cl::opt num_epochs; extern llvm::cl::opt stat_file; //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 6276b373ee..1a178c583d 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -6,6 +6,10 @@ cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), cll::init(1)); cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), cll::init(1)); +cll::opt num_epochs("epochs", + cll::desc("Number of epochs (default 50)"), + cll::init(50)); + cll::opt stat_file("statFile", cll::desc("Optional output file to print stats to")); From f69569e26c7c0066791820cbc95c3ffe5b4a2136 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:12:07 -0500 Subject: [PATCH 386/660] Disable GALOIS_SUPPORT_ASYNC in libdist The SUPPORT_ASYNC flag makes messages use issend instead of isend. issend was causing issues with distributed execution where under certain interleavings of send/recv a send would get presumably corrupted. For now, disable issend until a fix is found (or maybe remove it forever depending on if it's really necessary or not). Performance wise it does not seem to show much of an impact on GNN apps; mostly noise and large variance has been observed. --- libdist/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 138a4edabd..57e6aa1750 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -21,7 +21,7 @@ target_include_directories(galois_dist_async PUBLIC target_link_libraries(galois_dist_async PUBLIC MPI::MPI_CXX) target_link_libraries(galois_dist_async PUBLIC galois_shmem) -target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) +#target_compile_definitions(galois_dist_async PRIVATE GALOIS_SUPPORT_ASYNC=1) if (GALOIS_USE_BARE_MPI) target_compile_definitions(galois_dist_async PRIVATE GALOIS_USE_BARE_MPI=1) From 4035e46aeb5fa85c994d37a477a494d79c782f7f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:26:34 -0500 Subject: [PATCH 387/660] GNNGraph loading: turn off unnecessary CuSP features Add args to get rid of async partitioning, multi-round partitioning. 
--- libgnn/src/graphs/GNNGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b77e5df0a8..3b0c79a628 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -17,10 +17,10 @@ LoadPartition(const std::string& input_directory, switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; From b8c6a90a926101a29278bd1b538532ce788efb10 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 27 Oct 2020 17:28:31 -0500 Subject: [PATCH 388/660] Various Katana changes to Serialize/PODArray Take some changes made to clean up the serialize/POD code from the Katana repo and add them here. --- libdist/include/galois/runtime/Serialize.h | 10 +++++++--- libgalois/CMakeLists.txt | 1 + libgalois/include/galois/PODResizeableArray.h | 12 +++++++++--- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 1060721ed2..94517e34ca 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -79,13 +79,17 @@ class SerializeBuffer { //! Insert characters from a buffer into the serialize buffer void insert(const uint8_t* c, size_t bytes) { - bufdata.insert(bufdata.end(), c, c + bytes); + if (bytes > 0) { + bufdata.insert(bufdata.end(), c, c + bytes); + } } //! Insert characters from a buffer into the serialize buffer at a particular //! 
offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { - std::copy_n(c, bytes, bufdata.begin() + offset); + if (bytes > 0) { + std::copy_n(c, bytes, bufdata.begin() + offset); + } } /** @@ -237,7 +241,7 @@ class DeSerializeBuffer { */ void extract(uint8_t* dst, size_t num) { if (num > 0) { - memcpy(dst, &bufdata[offset], num); + std::copy_n(&bufdata[offset], num, dst); offset += num; } } diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt index 8e9d56d48e..76161160f6 100644 --- a/libgalois/CMakeLists.txt +++ b/libgalois/CMakeLists.txt @@ -86,6 +86,7 @@ endif() target_link_libraries(galois_shmem INTERFACE pygalois) target_link_libraries(galois_shmem PRIVATE Threads::Threads) +target_link_libraries(galois_shmem PUBLIC galois_support) if (CMAKE_HAVE_PTHREAD_H) target_compile_definitions(galois_shmem PRIVATE GALOIS_HAVE_PTHREAD) diff --git a/libgalois/include/galois/PODResizeableArray.h b/libgalois/include/galois/PODResizeableArray.h index a37a0b598c..dc1cabdb48 100644 --- a/libgalois/include/galois/PODResizeableArray.h +++ b/libgalois/include/galois/PODResizeableArray.h @@ -28,6 +28,7 @@ #include #include "galois/config.h" +#include "galois/Logging.h" namespace galois { @@ -136,6 +137,9 @@ class PODResizeableArray { } data_ = static_cast<_Tp*>( realloc(reinterpret_cast(data_), capacity_ * sizeof(_Tp))); + if (!data_) { + GALOIS_LOG_FATAL("Out of memory for a PODResizableArray"); + } } } @@ -183,10 +187,12 @@ class PODResizeableArray { void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first, InputIterator last) { assert(position == end()); - size_t old_size = size_; size_t to_add = last - first; - resize(old_size + to_add); - std::copy_n(first, to_add, begin() + old_size); + if (to_add > 0) { + size_t old_size = size_; + resize(old_size + to_add); + std::copy_n(first, to_add, begin() + old_size); + } } void swap(PODResizeableArray& v) { From 65a3bb32817886a83cd824a1d3d696d52d8385ed Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Oct 2020 19:19:20 -0500 Subject: [PATCH 389/660] Disable DIE for partitioning of non-gnn datasets Adding a DIE in else clause for the check of break points for GNN datasets unintentionally broke all datasets for other apps. TODO: add a check to only call the function if a GNN dataset is used (i.e. add a flag or add a hardcoded list) --- libcusp/include/galois/graphs/NewGeneric.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 33a618c62f..f4837ff1de 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -95,8 +95,9 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(0); bps.push_back(5); } else { - GALOIS_DIE("invalid input for gnn partitioning ", filename, - " hardcode needed"); + // XXX only die under certain conditions + //GALOIS_DIE("invalid input for gnn partitioning ", filename, + // " hardcode needed"); } // TODO hardcode the rest From 64dfd096e98a6024d491e3c4f5749b66d6e99200 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Oct 2020 19:23:56 -0500 Subject: [PATCH 390/660] GNNGraph cleanup; prep for GPU additions Remove some unused vars/functions originally added to use MKL functions that were eventually dropped. Rearranged some functions and added placeholders for the GPU graph build. 
--- libgnn/include/galois/graphs/GNNGraph.h | 112 +++++++++--------------- libgnn/src/graphs/GNNGraph.cpp | 19 ---- 2 files changed, 41 insertions(+), 90 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 8ce85092ac..3f73aff510 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -5,6 +5,10 @@ #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/graphs/GNNGraph.cuh" +#endif + namespace galois { // TODO remove the need to hardcode this path @@ -47,6 +51,16 @@ class GNNGraph { //! Returns host id in brackets to use for printing things const std::string& host_prefix() const { return host_prefix_; } + //! Length of a node feature + size_t node_feature_length() const { return node_feature_length_; } + + //! Return the number of label classes (i.e. number of possible outputs) + size_t GetNumLabelClasses() const { return num_label_classes_; }; + + ////////////////////////////////////////////////////////////////////////////// + // Graph accessors + ////////////////////////////////////////////////////////////////////////////// + //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } @@ -67,22 +81,7 @@ class GNNGraph { return partitioned_graph_->masterNodesRange().end(); } - //! Given an LID and the current phase of GNN computation, determine if the - //! lid in question is valid for the current phase (i.e., it is part of - //! a training, validation, or test phase mask) - bool IsValidForPhase(const unsigned lid, - const galois::GNNPhase current_phase) const; - //! Returns the label of some local id assuming labels are single class - //! labels. - GNNFloat GetSingleClassLabel(const unsigned lid) const { - assert(using_single_class_labels_); - return local_ground_truth_labels_[lid]; - } - - //! Return the number of label classes - size_t GetNumLabelClasses() const { return num_label_classes_; }; - - // All following functions take a local id + // All following functions take a local node id EdgeIterator EdgeBegin(GraphNode n) const { return partitioned_graph_->edge_begin(n); }; @@ -94,22 +93,25 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } - size_t node_feature_length() const { return node_feature_length_; } + //! Returns the ground truth label of some local id assuming labels are single + //! class labels. + GNNFloat GetSingleClassLabel(const unsigned lid) const { + assert(using_single_class_labels_); + return local_ground_truth_labels_[lid]; + } + //! Return matrix of the local node features const std::vector& GetLocalFeatures() const { return local_node_features_; } - //! Returns a pointer to the CSR indices where the first element starts at - //! 0 (used with MKL) - const uint32_t* GetZeroBasedRowPointer() const { - return zero_start_graph_indices_.data(); - } + //! Given an LID and the current phase of GNN computation, determine if the + //! lid in question is valid for the current phase (i.e., it is part of + //! a training, validation, or test phase mask) + bool IsValidForPhase(const unsigned lid, + const galois::GNNPhase current_phase) const; - //! Return pointer to all edge destinations; used with MKL - const uint32_t* GetEdgeDestPointer() const { - return partitioned_graph_->edge_dst_ptr(); - } + ////////////////////////////////////////////////////////////////////////////// //! 
Given a matrix and the column size, do an aggregate sync where each row //! is considered a node's data and sync using the graph's Gluon @@ -137,9 +139,6 @@ class GNNGraph { //! The entire topology of the dataset: used for things like norm factor //! calculation or sampling WholeGraph whole_graph_; - //! The indices pointer from the partitioned graph except with a 0 - //! prepended to it; needed for MKL calls - std::vector zero_start_graph_indices_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; //! True if labels are single class @@ -173,6 +172,10 @@ class GNNGraph { // TODO vars for subgraphs as necessary + ////////////////////////////////////////////////////////////////////////////// + // Initialization + ////////////////////////////////////////////////////////////////////////////// + //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label); @@ -185,54 +188,21 @@ class GNNGraph { GNNRange* mask_range, GNNLabel* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); - //! Init the node start indices that have a 0 at the beginning; straight - //! copy of the array from the partitioned graph save for the 0 at the - //! first element. - void InitZeroStartGraphIndices(); //! Reads the entire graph topology in (but nothing else) void ReadWholeGraph(const std::string& dataset_name); //! Initializes the norm factors using the entire graph's topology for global //! degree access void InitNormFactor(); - // public: - // void saveDistGraph(DGraph* a); - // galois::graphs::GluonSubstrate* getSyncSubstrate(); - // float_t* get_feats_ptr() { return h_feats; } - // float_t* get_feats_subg_ptr() { return h_feats_subg.data(); } - // label_t* get_labels_ptr() { return h_labels; } - // label_t* get_labels_subg_ptr() { return h_labels_subg.data(); } - // float_t* get_norm_factors_ptr() { return normFactors.data(); } - // float_t* get_norm_factors_subg_ptr() { return &normFactorsSub[0]; } - // - // //! allocate the norm factor vector - // void allocNormFactor(); - // void allocNormFactorSub(int subID); - // //! construct norm factor vector by using data from global graph - // void constructNormFactor(deepgalois::Context* globalContext); - // void constructNormFactorSub(int subgraphID); - // - // void constructSubgraphLabels(size_t m, const mask_t* masks); - // void constructSubgraphFeatures(size_t m, const mask_t* masks); - // - // //! return label for some node - // //! NOTE: this is LID, not GID - // label_t get_label(size_t lid) { return h_labels[lid]; } - // - // //! returns pointer to the features of each local node - // float_t* get_in_ptr(); - // - // //! allocate memory for subgraphs (don't actually build them) - // void allocateSubgraphs(int num_subgraphs, unsigned max_size); - // - // //! return if a vertex is owned by the partitioned graph this context - // contains bool isOwned(unsigned gid); - // //! return if part graph has provided vertex for given gid locally - // bool isLocal(unsigned gid); - // //! get GID of an lid for a vertex - // unsigned getGID(unsigned lid); - // //! get local id of a vertex given a global id for that vertex - // unsigned getLID(unsigned gid); + ////////////////////////////////////////////////////////////////////////////// + // GPU things + ////////////////////////////////////////////////////////////////////////////// + +#ifdef GALOIS_ENABLE_GPU + //! 
This satisfies the cuda context forward declaration in host decls: + //! context fields + GNNGraphGPUAllocations gpu_memory_; +#endif }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3b0c79a628..fe997ed228 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -71,8 +71,6 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false); - // create the 0 based row indices for MKL use - InitZeroStartGraphIndices(); // read in entire graph topology ReadWholeGraph(dataset_name); // init norm factors using the whole graph topology @@ -340,23 +338,6 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } -void galois::graphs::GNNGraph::InitZeroStartGraphIndices() { - GALOIS_LOG_VERBOSE("[{}] Initializing node indices with 0 prepended", - host_id_); - // size is num nodes + 1 - zero_start_graph_indices_.resize(partitioned_graph_->size() + 1); - // first element is zero - zero_start_graph_indices_[0] = 0; - // the rest is a straight copy from partitioned graph (use edge_end to access - // it) - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t i) { - zero_start_graph_indices_[i + 1] = *(partitioned_graph_->edge_end(i)); - }, - galois::loopname("InitZeroStartGraphIndices")); -} - void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { std::string input_file = input_directory_ + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, From 04bd6c51ab238c4d61d48d61b8740d35b6da15e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 10:43:05 -0600 Subject: [PATCH 391/660] Fix GNNGraph default directory init bug The constructor for GNN Graph that does not take an input directory was constructing a GNNGraph in its function scope and not constructing the actual caller of the constructor. This led to most of the unit tests failing as the graph was not constructed correctly. Moral: run unit tests before every commit. --- libgnn/src/graphs/GNNGraph.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fe997ed228..2292614b70 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -38,10 +38,9 @@ size_t gnn_matrix_to_sync_column_length_ = 0; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) { - GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label); -} + bool has_single_class_label) + : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, + has_single_class_label) {} galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, const std::string& dataset_name, From 10943ca906a6a2ac50d18c758a5e530f5e8fda8f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 10:48:55 -0600 Subject: [PATCH 392/660] Allow getMarshalGraph to not deallocate graph Added a version of getMarshalGraph in the Gluon substrate that allows a user to not deallocate the source graph. Added in anticipation of GNN work where it might be beneficial to keep the CPU version of the graph around in memory. 
--- libgluon/include/galois/graphs/GluonSubstrate.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 11a89157e7..f79427af89 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -3301,7 +3301,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } public: - void getMarshalGraph(MarshalGraph& m) { + void getMarshalGraph(MarshalGraph& m) { getMarshalGraph(m, true); } + + void getMarshalGraph(MarshalGraph& m, bool deallocate_graph) { m.nnodes = userGraph.size(); m.nedges = userGraph.sizeEdges(); m.numOwned = userGraph.numMasters(); @@ -3389,7 +3391,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // user needs to provide method of freeing up graph (it can do nothing // if they wish) - userGraph.deallocate(); + if (deallocate_graph) { + userGraph.deallocate(); + } } #endif // het galois def From bd9dea1a0042ceb627dd7702a266da5320400a9c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 13:03:16 -0600 Subject: [PATCH 393/660] Allocate features of graph on GPU (first test) Adds initial CUDA headers/impl to GNNGraph, starting with a struct that is meant to hold all GPU based allocations for the graph. This is compiled separately from the main GNN library using nvcc and linked into the main GNN library. This commit adds an Init GPU call that is meant to initialize all GPU memory used for GNN training. For now, it only allocates/copies over the features of the graph. The rest are incoming in later commits. Also contains a change to the Gluon gradient interface which adds a few dummy typedefs so that Gluon doesn't complain (GPU build expects certain typedefs to exist due to the marshal graph function). 
--- CMakeLists.txt | 7 +++- libgnn/CMakeLists.txt | 24 +++++++++++- libgnn/include/galois/CUDAUtil.h | 17 +++++++++ libgnn/include/galois/GNNTypes.h | 4 ++ libgnn/include/galois/graphs/GNNGraph.cuh | 37 +++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 3 ++ .../galois/layers/GluonGradientInterface.h | 9 +++++ libgnn/src/graphs/GNNGraph.cpp | 15 +++++++- libgnn/src/graphs/GNNGraph.cu | 11 ++++++ 9 files changed, 124 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/CUDAUtil.h create mode 100644 libgnn/include/galois/graphs/GNNGraph.cuh create mode 100644 libgnn/src/graphs/GNNGraph.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 937251376c..41f318b828 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -236,7 +236,6 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) - add_subdirectory(libgnn) endif() # TODO(loc) prefix with GALOIS @@ -279,6 +278,12 @@ if (GALOIS_ENABLE_GPU) #find_package(OpenCL REQUIRED) endif() endif() + +if (GALOIS_ENABLE_DIST) + # here because I need the GPU declarations above + add_subdirectory(libgnn) +endif() + add_subdirectory(libpangolin) # Applications (apps) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index d635781ba6..1ca05c8632 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -23,5 +23,27 @@ target_include_directories(galois_gnn PUBLIC ) set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) - add_subdirectory(test) + +if (GALOIS_ENABLE_GPU) + target_compile_definitions(galois_gnn PUBLIC GALOIS_ENABLE_GPU=1) + + # create the galois_gnn_gpu library to get linked into galois_gnn + set(gpusources + src/graphs/GNNGraph.cu + ) + add_library(galois_gnn_gpu STATIC ${gpusources}) + target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) + target_compile_options(galois_gnn_gpu PUBLIC "$<$:--expt-extended-lambda>") + set_property(TARGET galois_gnn_gpu PROPERTY CUDA_STANDARD 14) + target_compile_definitions(galois_gnn_gpu PUBLIC GALOIS_ENABLE_GPU=1) + target_include_directories(galois_gnn_gpu PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ) + + # link to gpu lib (which takes care of moderngpu and cub) + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support) + + # gpu -> cpu lib + target_link_libraries(galois_gnn galois_gnn_gpu) +endif() diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h new file mode 100644 index 0000000000..dabd1638b4 --- /dev/null +++ b/libgnn/include/galois/CUDAUtil.h @@ -0,0 +1,17 @@ +#ifdef GALOIS_ENABLE_GPU +//! @file CUDAUtil.h +//! Contains various utility functions for CUDA. +#pragma once +#include +#include "galois/Logging.h" + +#define CUDA_CHECK(condition) \ + do { \ + cudaError_t error = condition; \ + if (error != cudaSuccess) { \ + GALOIS_LOG_ERROR("CUDA error: {}", cudaGetErrorString(error)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + +#endif diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index a04fa14687..99cb700cb4 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -13,6 +13,10 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; +//! Type of node index on gpus +using GPUNodeIndex = uint32_t; +//! Type of edge index on gpus +using GPUEdgeIndex = uint64_t; //! 
Phase of GNN computation enum class GNNPhase { kTrain, kValidate, kTest }; diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh new file mode 100644 index 0000000000..3d8bd45d58 --- /dev/null +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -0,0 +1,37 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { +namespace graphs { + +//! Class to hold everything allocated on the GPU that has to do with GNNGraph. +//! Similar in nature to the CUDAContext class in existing D-IrGL +class GNNGraphGPUAllocations { +public: + // XXX getters for everything, the rest of the setters, etc. + + // XXX destructor for allocated memory + + //! Host side function that allocates memory for the features on the vertices + //! and copies them over to the GPU. + void SetFeatures(const std::vector& features); + +private: + // Note: no graph object, similar to Xuhao's LGraph in older code + //! edge_index[n] gets the first edge index for node n (i.e. edge_index_[0] + //! = 0) + GPUEdgeIndex* edge_index_{nullptr}; + //! edge_destinations_[i] = destination for edge i + GPUNodeIndex* edge_destinations_{nullptr}; + //! (Local) feature vector + GNNFeature* feature_vector_{nullptr}; + //! (Local) ground truth vector + GNNFloat* ground_truth_{nullptr}; + //! (Local) norm factors + GNNFloat* norm_factors_{nullptr}; + + // XXX masks? other things I haven't considered yet? +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3f73aff510..81b2830e9d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -202,6 +202,9 @@ class GNNGraph { //! This satisfies the cuda context forward declaration in host decls: //! context fields GNNGraphGPUAllocations gpu_memory_; + //! Call this to setup GPU memory for this graph: allocates necessary GPU + //! memory and copies things over + void InitGPUMemory(); #endif }; diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h index 92c0a5eb69..473151efcd 100644 --- a/libgnn/include/galois/layers/GluonGradientInterface.h +++ b/libgnn/include/galois/layers/GluonGradientInterface.h @@ -15,6 +15,15 @@ namespace galois { //! they'll all see the same values after the first round of sync anyways) class GluonGradientInterface { public: + // typedefs required by GPU end to build; not actually used anywhere in this + // class (...at the moment) + // as such, dummy declarations that don't particularly make sense + // TODO will likely need to revisit once GPU substrate for this needs to be + // setup + using GraphNode = uint32_t; + using edge_iterator = boost::counting_iterator; + using EdgeType = char; + //! Save reference to weight gradients. //! Then setup mirror metadata for Gluon to use during setup. 
GluonGradientInterface(std::vector& gradients); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 2292614b70..804486a6bb 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -47,7 +47,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, GNNPartitionScheme partition_scheme, bool has_single_class_label) : input_directory_(input_directory) { - GALOIS_LOG_VERBOSE("[{}] Constructing partitiong for {}", host_id_, + GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); // save host id host_id_ = galois::runtime::getSystemNetworkInterface().ID; @@ -74,6 +74,12 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, ReadWholeGraph(dataset_name); // init norm factors using the whole graph topology InitNormFactor(); + +#ifdef GALOIS_ENABLE_GPU + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); +#endif } bool galois::graphs::GNNGraph::IsValidForPhase( @@ -364,3 +370,10 @@ void galois::graphs::GNNGraph::InitNormFactor() { }, galois::loopname("InitNormFactor")); } + +#ifdef GALOIS_ENABLE_GPU +void galois::graphs::GNNGraph::InitGPUMemory() { + // XXX finish up GPU memory allocation; currently just testing the build + gpu_memory_.SetFeatures(local_node_features_); +} +#endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu new file mode 100644 index 0000000000..4282ba753d --- /dev/null +++ b/libgnn/src/graphs/GNNGraph.cu @@ -0,0 +1,11 @@ +#include "galois/CUDAUtil.h" +#include "galois/graphs/GNNGraph.cuh" + +void galois::graphs::GNNGraphGPUAllocations::SetFeatures( + const std::vector& features) { + CUDA_CHECK(cudaMalloc((void**)(&feature_vector_), + features.size() * sizeof(GNNFeature))); + CUDA_CHECK(cudaMemcpy(feature_vector_, features.data(), + features.size() * sizeof(GNNFeature), + cudaMemcpyHostToDevice)); +} From 50f8de139a53521f8cf0943c436e25fd509468b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 13:52:51 -0600 Subject: [PATCH 394/660] epoch-test: changed input to cora for faster test --- libgnn/test/epoch-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 21d5249fd1..2486269ccd 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + "cora", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, From 6824c288e8f06fa47356d09fe55e6d53b7b12ee8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 3 Nov 2020 17:45:06 -0600 Subject: [PATCH 395/660] Copy over int graph topology onto GPU for GNN Adds code to cast the CSR of a partitioned graph into ints and copy it over to the GPU. The use of ints is because BLAS CSR standard is apparently ints, so this is a future-proofing step for using GPU BLAS calls. Also adds the destructor for GPU graph allocations which frees up allocated memory + saves more variables onto GPU memory like feature length, node count, etc. 
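The int narrowing described above is motivated by the GPU sparse BLAS interfaces: the CSR-based routines index their row-offset and column arrays with 32-bit integers, so keeping the topology as int avoids a conversion at call time. As a hedged illustration of the eventual consumer (this is not code from this commit, and it assumes the generic cuSPARSE API is what the GCN aggregation would ultimately call), an int-indexed CSR would be wrapped in a matrix descriptor roughly like this:

    #include <cstdint>
    #include <cusparse.h>

    // Sketch only: registers device-resident int CSR arrays with cuSPARSE.
    // d_row_offsets has num_nodes + 1 entries; d_edge_dests and d_edge_vals
    // have num_edges entries each; all three already live on the GPU.
    cusparseSpMatDescr_t MakeCsrDescriptor(int64_t num_nodes, int64_t num_edges,
                                           int* d_row_offsets, int* d_edge_dests,
                                           float* d_edge_vals) {
      cusparseSpMatDescr_t descr;
      cusparseCreateCsr(&descr, num_nodes, num_nodes, num_edges,
                        d_row_offsets, d_edge_dests, d_edge_vals,
                        CUSPARSE_INDEX_32I, CUSPARSE_INDEX_32I,
                        CUSPARSE_INDEX_BASE_ZERO, CUDA_R_32F);
      return descr;
    }
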
--- libgnn/include/galois/CUDAUtil.h | 8 ++++ libgnn/include/galois/graphs/GNNGraph.cuh | 28 +++++++++++--- libgnn/src/graphs/GNNGraph.cpp | 45 ++++++++++++++++++++++- libgnn/src/graphs/GNNGraph.cu | 45 ++++++++++++++++++++++- 4 files changed, 119 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index dabd1638b4..f8d7a03b80 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -14,4 +14,12 @@ } \ } while (0) +#define CUDA_FREE(ptr) \ + do { \ + if (ptr) { \ + CUDA_CHECK(cudaFree(ptr)); \ + ptr = nullptr; \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 3d8bd45d58..4e0bb8d193 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -10,27 +10,45 @@ class GNNGraphGPUAllocations { public: // XXX getters for everything, the rest of the setters, etc. - // XXX destructor for allocated memory + //! CUDA frees all allocated memory (i.e. non-nullptr) + ~GNNGraphGPUAllocations(); + + //! Copies graph topology over to GPU; using ints because cuSparse lib + //! expects ints for the CSR arrays + void SetGraphTopology(const std::vector& edge_index, + const std::vector& edge_dests); //! Host side function that allocates memory for the features on the vertices //! and copies them over to the GPU. - void SetFeatures(const std::vector& features); + void SetFeatures(const std::vector& features, + unsigned num_features); private: + // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS + + //! Number of features (which is equivalent to number of nodes) + unsigned* num_features_{nullptr}; + //! Length of a feature vector + unsigned* feature_length_{nullptr}; + //! Number of edges in graph + unsigned* num_edges_{nullptr}; + // Note: no graph object, similar to Xuhao's LGraph in older code //! edge_index[n] gets the first edge index for node n (i.e. edge_index_[0] //! = 0) - GPUEdgeIndex* edge_index_{nullptr}; + int* edge_index_{nullptr}; //! edge_destinations_[i] = destination for edge i - GPUNodeIndex* edge_destinations_{nullptr}; + int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; + // TODO need these on GPU? //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; //! (Local) norm factors GNNFloat* norm_factors_{nullptr}; - // XXX masks? other things I haven't considered yet? + // TODO masks? other things I haven't considered yet? will determine if they + // are needed }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 804486a6bb..42eb645c43 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -2,6 +2,7 @@ #include "galois/Logging.h" #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" +#include namespace { //! 
Partitions a particular dataset given some partitioning scheme @@ -374,6 +375,48 @@ void galois::graphs::GNNGraph::InitNormFactor() { #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // XXX finish up GPU memory allocation; currently just testing the build - gpu_memory_.SetFeatures(local_node_features_); + + // create int casted CSR + uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); + uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); + + // + 1 because first element is 0 in BLAS CSRs + std::vector e_index(partitioned_graph_->size() + 1); + std::vector e_dest(partitioned_graph_->sizeEdges()); + + // set in parallel + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size() + 1), + [&](size_t index) { + if (index != 0) { + if (e_index_ptr[index - 1] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_index_ptr[index - 1]); + } + e_index[index] = static_cast(e_index_ptr[index - 1]); + } else { + e_index[index] = 0; + } + }, + galois::loopname("GPUEdgeIndexConstruction")); + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->sizeEdges()), + [&](size_t edge) { + if (e_dest_ptr[edge] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_dest_ptr[edge]); + } + + e_dest[edge] = static_cast(e_dest_ptr[edge]); + }, + galois::loopname("GPUEdgeDestConstruction")); + + gpu_memory_.SetGraphTopology(e_index, e_dest); + e_index.clear(); + e_dest.clear(); + + gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index 4282ba753d..aae729c015 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -1,8 +1,51 @@ #include "galois/CUDAUtil.h" #include "galois/graphs/GNNGraph.cuh" +galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GPU graph allocations"); + CUDA_FREE(num_features_); + CUDA_FREE(feature_length_); + CUDA_FREE(num_edges_); + CUDA_FREE(edge_index_); + CUDA_FREE(edge_destinations_); + CUDA_FREE(feature_vector_); + CUDA_FREE(ground_truth_); + CUDA_FREE(norm_factors_); +} + +void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( + const std::vector& edge_index, const std::vector& edge_dests) { + // num edges variable + CUDA_CHECK(cudaMalloc((void**)(&num_edges_), sizeof(unsigned))); + unsigned num_edges = edge_dests.size(); + CUDA_CHECK(cudaMemcpy(num_edges_, &num_edges, sizeof(unsigned), + cudaMemcpyHostToDevice)); + + // topology; assumes caller already setup vectors accordingly + CUDA_CHECK( + cudaMalloc((void**)(&edge_index_), edge_index.size() * sizeof(int))); + CUDA_CHECK(cudaMalloc((void**)(&edge_destinations_), + edge_dests.size() * sizeof(int))); + CUDA_CHECK(cudaMemcpy(edge_index_, edge_index.data(), + edge_index.size() * sizeof(int), + cudaMemcpyHostToDevice)); + CUDA_CHECK(cudaMemcpy(edge_destinations_, edge_dests.data(), + edge_dests.size() * sizeof(int), + cudaMemcpyHostToDevice)); +} + void galois::graphs::GNNGraphGPUAllocations::SetFeatures( - const std::vector& features) { + const std::vector& features, unsigned num_features) { + // feature count & length + CUDA_CHECK(cudaMalloc((void**)(&num_features_), sizeof(unsigned))); + CUDA_CHECK(cudaMalloc((void**)(&feature_length_), sizeof(unsigned))); + CUDA_CHECK(cudaMemcpy(num_features_, &num_features, sizeof(unsigned), + cudaMemcpyHostToDevice)); + 
unsigned feature_length = features.size() / num_features; + CUDA_CHECK(cudaMemcpy(feature_length_, &feature_length, sizeof(unsigned), + cudaMemcpyHostToDevice)); + + // features themselves CUDA_CHECK(cudaMalloc((void**)(&feature_vector_), features.size() * sizeof(GNNFeature))); CUDA_CHECK(cudaMemcpy(feature_vector_, features.data(), From ccb21dc1446b775c966f02c71de3c2fdbf504583 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 4 Nov 2020 15:41:16 -0600 Subject: [PATCH 396/660] Training set specification for citeseer and pubmed Adds training set specification for citeseer and pubmed datasets to the CuSP partitioner so that they can be run without crashes. g# --- libcusp/include/galois/graphs/NewGeneric.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index f4837ff1de..048cfa4bc2 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -88,6 +88,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("reddit") != std::string::npos) { bps.push_back(0); bps.push_back(153431); + } else if (filename.find("citeseer") != std::string::npos) { + bps.push_back(0); + bps.push_back(120); + } else if (filename.find("pubmed") != std::string::npos) { + bps.push_back(0); + bps.push_back(60); } else if (filename.find("ppi") != std::string::npos) { bps.push_back(0); bps.push_back(9716); @@ -96,7 +102,7 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(5); } else { // XXX only die under certain conditions - //GALOIS_DIE("invalid input for gnn partitioning ", filename, + // GALOIS_DIE("invalid input for gnn partitioning ", filename, // " hardcode needed"); } // TODO hardcode the rest From d7792b3b0cab40a514d5caf8a88dc78943425eb8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 4 Nov 2020 19:00:29 -0600 Subject: [PATCH 397/660] GNNGraph: copy node ground truth labels to GPU Adds a function to copy ground truth for vertices to the GPU and adds it to the GPU memory init call. --- libgnn/include/galois/graphs/GNNGraph.cuh | 5 ++++- libgnn/include/galois/graphs/GNNGraph.h | 3 +-- libgnn/src/graphs/GNNGraph.cpp | 1 + libgnn/src/graphs/GNNGraph.cu | 9 +++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 4e0bb8d193..9e047b6a0f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,6 +23,9 @@ public: void SetFeatures(const std::vector& features, unsigned num_features); + //! Copy over ground truth for the graph to GPU + void SetLabels(const std::vector& ground_truth); + private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS @@ -41,9 +44,9 @@ private: int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; - // TODO need these on GPU? //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; + // TODO need this? //! (Local) norm factors GNNFloat* norm_factors_{nullptr}; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 81b2830e9d..76ac693cf5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -199,8 +199,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU - //! 
This satisfies the cuda context forward declaration in host decls: - //! context fields + //! Object that holds all GPU allocated pointers to memory related to graphs. GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU //! memory and copies things over diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 42eb645c43..918ce3d735 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -418,5 +418,6 @@ void galois::graphs::GNNGraph::InitGPUMemory() { e_dest.clear(); gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); + gpu_memory_.SetLabels(local_ground_truth_labels_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index aae729c015..f13bbf4089 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -52,3 +52,12 @@ void galois::graphs::GNNGraphGPUAllocations::SetFeatures( features.size() * sizeof(GNNFeature), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetLabels( + const std::vector& ground_truth) { + CUDA_CHECK(cudaMalloc((void**)(&ground_truth_), + ground_truth.size() * sizeof(GNNLabel))); + CUDA_CHECK(cudaMemcpy(ground_truth_, ground_truth.data(), + ground_truth.size() * sizeof(GNNLabel), + cudaMemcpyHostToDevice)); +} From ede6c109fb39959e53517879408de97b6ba414eb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 5 Nov 2020 19:38:39 -0600 Subject: [PATCH 398/660] Massive signature overhaul for vectors in GNNs Previously in most of the GNN code std::vectors were being passed around for the main calls to training/inferencing. The problem with this is that GPUs do not have std::vector, but only pointers. To avoid losing out on the size call, a new class called PointerWithSize has been added that wraps a pointer and attaches a size to it that can be accessed via size. It also has a similar interface to a std::vector so as little of the code has to be changed as possible. This class is now used instead of std::vector in the main Forward/Backward Phase calls in the GNN in preparation for the GPU support. This also required a few related changes such as making some functions non-const (because the PointerWithSize is not a const construction since it grabs the raw pointer). Most of the tests have been edited as well to account for the new return type. 
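In practice the wrapper behaves like a read/write view: a std::vector converts to it implicitly (the converting constructor grabs .data() and .size()), GPU code can hand over a raw pointer plus an element count, and downstream code indexes it exactly as it indexed the old vectors. A small usage sketch against the PointerWithSize interface added below (SumAll and the literal sizes are illustrative only):

    #include <cstddef>
    #include <vector>
    #include "galois/GNNTypes.h" // defines galois::PointerWithSize

    // Works for anything that can be viewed as a (pointer, length) pair.
    float SumAll(galois::PointerWithSize<float> data) {
      float total = 0.0f;
      for (size_t i = 0; i < data.size(); ++i) {
        total += data[i]; // operator[] mirrors std::vector indexing
      }
      return total;
    }

    int main() {
      std::vector<float> host_values(16, 1.0f);
      float from_vector  = SumAll(host_values); // implicit conversion from vector
      float from_pointer = SumAll({host_values.data(), host_values.size()});
      return (from_vector == from_pointer) ? 0 : 1;
    }
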
--- libgnn/include/galois/GNNTypes.h | 32 +++ libgnn/include/galois/GraphNeuralNetwork.h | 8 +- libgnn/include/galois/graphs/GNNGraph.h | 4 +- libgnn/include/galois/layers/GNNLayer.h | 28 +- .../galois/layers/GraphConvolutionalLayer.h | 11 +- libgnn/include/galois/layers/SoftmaxLayer.h | 11 +- libgnn/src/GraphNeuralNetwork.cpp | 28 +- libgnn/src/layers/GNNLayer.cpp | 8 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 13 +- libgnn/src/layers/SoftmaxLayer.cpp | 16 +- libgnn/test/accuracy-test.cpp | 12 +- libgnn/test/aggregate-sync-test.cpp | 22 +- libgnn/test/convlayer-test.cpp | 156 ++++++------ libgnn/test/epoch-test.cpp | 8 +- libgnn/test/gnnfb-test.cpp | 10 +- libgnn/test/gpu-convlayer-test.cpp | 239 ++++++++++++++++++ libgnn/test/softmaxlayer-test.cpp | 6 +- libgnn/test/weight-sync-test.cpp | 5 +- 18 files changed, 449 insertions(+), 168 deletions(-) create mode 100644 libgnn/test/gpu-convlayer-test.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 99cb700cb4..40f19da7b0 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -4,6 +4,7 @@ #include #include +#include namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier @@ -21,4 +22,35 @@ using GPUEdgeIndex = uint64_t; //! Phase of GNN computation enum class GNNPhase { kTrain, kValidate, kTest }; +//! Vector like wrapper over a pointer and size; exists solely to pass around +//! raw pointers with size (because vectors are a no-go due to the code +//! handling both CPU and GPU.) +template +class PointerWithSize { +public: + //! Default is empty + PointerWithSize() : ptr_{nullptr}, num_elements_{0} {} + //! Generic constructor which takes 2 fields to initialize + PointerWithSize(PointerType* ptr, size_t num_elements) + : ptr_{ptr}, num_elements_{num_elements} {} + //! Grab vector pointer + size + PointerWithSize(std::vector& v) + : ptr_{v.data()}, num_elements_{v.size()} {} + //! Alias to return pointer data + PointerType* data() { return ptr_; } + //! Alias to return pointer data (const version) + const PointerType* data() const { return ptr_; } + //! # elements that pointer should contain + size_t size() const { return num_elements_; } + // accessors; one lets you mess with the array + PointerType& operator[](size_t i) { return ptr_[i]; } + const PointerType& operator[](size_t i) const { return ptr_[i]; } + +private: + //! Pointer to data + PointerType* ptr_; + //! # elements that I should be able to access from pointer + size_t num_elements_; +}; + } // end namespace galois diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 725e3a69d1..9e7e2266d0 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -106,7 +106,7 @@ class GraphNeuralNetwork { size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } //! Returns pointer to intermediate layer i - const galois::GNNLayer* GetIntermediateLayer(size_t i) { + galois::GNNLayer* GetIntermediateLayer(size_t i) { if (i < gnn_layers_.size() - 1) { return gnn_layers_[i].get(); } else { @@ -130,7 +130,7 @@ class GraphNeuralNetwork { } //! Returns the output layer - const galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } //! Do training for a specified # of epochs and return test accuracy at the //! end of it @@ -140,9 +140,9 @@ class GraphNeuralNetwork { //! 
vector representation. //! Also known as the forward phase in most literature //! @returns Output layer's output - const std::vector* DoInference(); + const PointerWithSize DoInference(); - float GetGlobalAccuracy(const std::vector& predictions); + float GetGlobalAccuracy(const PointerWithSize predictions); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 76ac693cf5..2b55d17b7a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -101,8 +101,8 @@ class GNNGraph { } //! Return matrix of the local node features - const std::vector& GetLocalFeatures() const { - return local_node_features_; + const PointerWithSize GetLocalFeatures() { + return PointerWithSize(local_node_features_); } //! Given an LID and the current phase of GNN computation, determine if the diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2473de7229..e738bdacca 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -80,16 +80,17 @@ class GNNLayer { } } - const std::vector& GetForwardOutput() const { - return forward_output_matrix_; + const PointerWithSize GetForwardOutput() { + return PointerWithSize(forward_output_matrix_); } - const std::vector& GetBackwardOutput() const { - return backward_output_matrix_; + + const PointerWithSize GetBackwardOutput() { + return PointerWithSize(backward_output_matrix_); } //! Returns the weight gradients - const std::vector& GetLayerWeightGradients() const { - return layer_weight_gradients_; + const PointerWithSize GetLayerWeightGradients() { + return PointerWithSize(layer_weight_gradients_); } //! Returns dimensions of this layer @@ -106,8 +107,9 @@ class GNNLayer { //! ultimately leads to an output (classfication of node labels) at the end //! of the GNN. //! @returns Output of the forward phase (i.e. input to next layer) - virtual const std::vector& - ForwardPhase(const std::vector& input_embeddings) = 0; + // XXX size of embeddings + virtual const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) = 0; //! Conducts the backward phase given the input to this layer; the backward //! phase calculates the gradients to update the weights of trainable //! parts of the layer (e.g., weights, trainable params for aggregate, etc.). @@ -117,9 +119,9 @@ class GNNLayer { //! one; takes a pointer to save space by writing intermediate results to it //! @returns Output of the backward phase (i.e. input to previous layer); note //! it's a pointer because layer can mess with it - virtual std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) = 0; + virtual PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) = 0; //! Given an optimizer, update the weights in this layer based on gradients //! stored in the layer @@ -185,7 +187,7 @@ class GNNLayer { //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(const std::vector& input_to_drop, + void DoDropout(const PointerWithSize input_to_drop, std::vector* output_matrix); //! 
Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); @@ -194,7 +196,7 @@ class GNNLayer { //! matrix void Activation(); //! Calculate derivative of activation function based on config on the matrix - void ActivationDerivative(std::vector* matrix); + void ActivationDerivative(PointerWithSize* matrix); //! Synchronize weight gradients with a summation void WeightGradientSyncSum(); diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 123a8d774a..196fa752c8 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -20,11 +20,12 @@ class GraphConvolutionalLayer : public GNNLayer { GNNLayerConfig()) {} // Parent functions - const std::vector& - ForwardPhase(const std::vector& input_embeddings) final; - std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) final; + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; private: // 2 temporaries the size of the forward input; used for dropout and diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 815f2401ff..5c412f6bf3 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -25,14 +25,13 @@ class SoftmaxLayer : public GNNLayer { } //! Creates probability distribution of each row of input - const std::vector& - ForwardPhase(const std::vector& input_embeddings) final; - + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. - std::vector* - BackwardPhase(const std::vector& prev_layer_input, - std::vector* input_gradient) final; + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; private: //! 
Loss for each row of the input diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 82c1d40c07..eb419ba26c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -63,9 +63,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::vector* predictions = DoInference(); + const PointerWithSize predictions = DoInference(); GradientPropagation(); - float train_accuracy = GetGlobalAccuracy(*predictions); + float train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, "\n"); @@ -77,8 +77,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer acc_timer("FinalAccuracyTest"); acc_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(*predictions); + const PointerWithSize predictions = DoInference(); + float global_accuracy = GetGlobalAccuracy(predictions); acc_timer.stop(); if (this_host == 0) { @@ -88,17 +88,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { return global_accuracy; } -const std::vector* galois::GraphNeuralNetwork::DoInference() { +const galois::PointerWithSize +galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network - const std::vector* layer_input = &(graph_->GetLocalFeatures()); + galois::PointerWithSize layer_input = + graph_->GetLocalFeatures(); for (std::unique_ptr& ptr : gnn_layers_) { - layer_input = &(ptr->ForwardPhase(*layer_input)); + layer_input = ptr->ForwardPhase(layer_input); } return layer_input; } float galois::GraphNeuralNetwork::GetGlobalAccuracy( - const std::vector& predictions) { + const PointerWithSize predictions) { // check owned nodes' accuracy size_t num_labels = graph_->GetNumLabelClasses(); assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); @@ -143,7 +145,7 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // from output layer get initial gradients std::vector dummy; std::unique_ptr& output_layer = gnn_layers_.back(); - std::vector* current_gradients = + galois::PointerWithSize current_gradients = output_layer->BackwardPhase(dummy, nullptr); // loops through intermediate layers in a backward fashion @@ -153,16 +155,16 @@ void galois::GraphNeuralNetwork::GradientPropagation() { size_t layer_index = gnn_layers_.size() - 2 - i; // get the input to the layer before this one - const std::vector* prev_layer_input; + galois::PointerWithSize prev_layer_input; if (layer_index != 0) { - prev_layer_input = &(gnn_layers_[layer_index - 1]->GetForwardOutput()); + prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); } else { - prev_layer_input = &(graph_->GetLocalFeatures()); + prev_layer_input = graph_->GetLocalFeatures(); } // backward prop and get a new set of gradients current_gradients = gnn_layers_[layer_index]->BackwardPhase( - *prev_layer_input, current_gradients); + prev_layer_input, ¤t_gradients); // if not output do optimization/gradient descent // at this point in the layer the gradients exist; use the gradients to // update the weights of the layer diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index dc81a9ca2b..a0ead51e10 100644 --- 
a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -61,8 +61,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { // XXX Something is wrong with dropout; accuracy suffers, figure out what // it is -void galois::GNNLayer::DoDropout(const std::vector& input_to_dropout, - std::vector* output_matrix) { +void galois::GNNLayer::DoDropout( + const PointerWithSize input_to_dropout, + std::vector* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); assert(num_elements == input_to_dropout.size()); @@ -114,7 +115,8 @@ void galois::GNNLayer::Activation() { galois::loopname("ReLU")); } -void galois::GNNLayer::ActivationDerivative(std::vector* gradient) { +void galois::GNNLayer::ActivationDerivative( + PointerWithSize* gradient) { // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output is greater than 0 diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 57a5d9505b..e2e80ce8b1 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -21,9 +21,9 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( layer_type_ = galois::GNNLayerType::kGraphConvolutional; } -const std::vector& +const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( - const std::vector& input_embeddings) { + const galois::PointerWithSize input_embeddings) { assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(in_temp_1_.size() == input_embeddings.size()); @@ -64,9 +64,10 @@ galois::GraphConvolutionalLayer::ForwardPhase( return forward_output_matrix_; } -std::vector* galois::GraphConvolutionalLayer::BackwardPhase( - const std::vector& prev_layer_input, - std::vector* input_gradient) { +galois::PointerWithSize +galois::GraphConvolutionalLayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation if (config_.do_activation) { @@ -121,7 +122,7 @@ std::vector* galois::GraphConvolutionalLayer::BackwardPhase( DoDropoutDerivative(); } - return &backward_output_matrix_; + return PointerWithSize(backward_output_matrix_); } void galois::GraphConvolutionalLayer::AggregateAll( diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 1262555a36..07e78d3c1f 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -3,11 +3,11 @@ #include "galois/layers/SoftmaxLayer.h" // Allocate memory and initialize -void galois::SoftmaxLayer::Init() { -} +void galois::SoftmaxLayer::Init() {} -const std::vector& galois::SoftmaxLayer::ForwardPhase( - const std::vector& input_embeddings) { +const galois::PointerWithSize +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; @@ -42,9 +42,9 @@ const std::vector& galois::SoftmaxLayer::ForwardPhase( return forward_output_matrix_; } -std::vector* -galois::SoftmaxLayer::BackwardPhase(const std::vector&, - std::vector*) { +galois::PointerWithSize +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { const size_t feature_length = 
layer_dimensions_.input_columns; galois::do_all( @@ -83,7 +83,7 @@ galois::SoftmaxLayer::BackwardPhase(const std::vector&, // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxBackward")); - return &backward_output_matrix_; + return PointerWithSize(backward_output_matrix_); } // TODO function for getting loss diff --git a/libgnn/test/accuracy-test.cpp b/libgnn/test/accuracy-test.cpp index e1fc17702e..6d26284325 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -33,23 +33,23 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - const std::vector* distributions = gnn->DoInference(); + galois::PointerWithSize distributions = gnn->DoInference(); // accuracy will be 0.2: everything chooses the first 1 as the entire row // is the same - float pred_accuracy = gnn->GetGlobalAccuracy(*distributions); + float pred_accuracy = gnn->GetGlobalAccuracy(distributions); GALOIS_LOG_VERBOSE("{}", pred_accuracy); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.2)); // validation mode gnn->SetLayerPhases(galois::GNNPhase::kValidate); - const std::vector* dist2 = gnn->DoInference(); - pred_accuracy = gnn->GetGlobalAccuracy(*dist2); + galois::PointerWithSize dist2 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(dist2); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); // test mode gnn->SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* dist3 = gnn->DoInference(); - pred_accuracy = gnn->GetGlobalAccuracy(*dist3); + galois::PointerWithSize dist3 = gnn->DoInference(); + pred_accuracy = gnn->GetGlobalAccuracy(dist3); GALOIS_LOG_ASSERT(pred_accuracy == static_cast(0.0)); // manufactured predictions to make sure it predicts things correctly based diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 432a546448..600ac42018 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -38,7 +38,7 @@ int main() { dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph->GetLocalFeatures()); ////////////////////////////////////////////////////////////////////////////// @@ -97,20 +97,21 @@ int main() { ////////////////////////////////////////////////////////////////////////////// - std::vector dummy_ones(test_graph->size() * 2, 1); + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything - std::vector* layer_0_backward_output = + galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// // sanity check layer 0 backward output; all 0 because layer 0 ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == test_graph->size() * 3); - for (size_t i = 0; i < layer_0_backward_output->size(); i++) { - GALOIS_LOG_ASSERT((*layer_0_backward_output)[i] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); + for (size_t i = 0; i < 
layer_0_backward_output.size(); i++) { + GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); } ////////////////////////////////////////////////////////////////////////////// @@ -120,7 +121,7 @@ int main() { std::make_unique(1, *(test_graph.get()), dimension_0, l_config); layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = + galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); // same check for forward as before @@ -164,8 +165,8 @@ int main() { } // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(test_graph->size() * 2, 1); - std::vector* layer_1_backward_output = + dummy_ones_v.assign(test_graph->size() * 2, 1); + galois::PointerWithSize layer_1_backward_output = layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); for (size_t row = 0; row < test_graph->size(); row++) { @@ -193,8 +194,7 @@ int main() { // size 3 columns for (size_t c = 0; c < 3; c++) { - GALOIS_LOG_ASSERT((*layer_1_backward_output)[row * 3 + c] == - ground_truth); + GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); } } diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 00825cf6f8..ae23fa4f23 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -17,7 +17,8 @@ int main() { galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - std::vector feats = test_graph.GetLocalFeatures(); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); ////////////////////////////////////////////////////////////////////////////// // doubles as a test for reading as well GALOIS_LOG_ASSERT(7 == test_graph.size()); @@ -59,7 +60,7 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + const galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph.GetLocalFeatures()); ////////////////////////////////////////////////////////////////////////////// @@ -85,12 +86,13 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // dummy 1 matrix - std::vector dummy_ones(14, 1); + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything - std::vector* layer_0_backward_output = + galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// @@ -98,30 +100,30 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); - 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - const std::vector layer_0_weight_gradients = + galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); @@ -141,7 +143,7 @@ int main() { std::make_unique(1, test_graph, dimension_0, dcon); layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = + galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); // same check as before for sanity purposes GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); @@ -161,36 +163,36 @@ int main() { GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(14, 1); - std::vector* layer_1_backward_output = + dummy_ones_v.assign(14, 1); + galois::PointerWithSize layer_1_backward_output = layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// - GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); - 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - const std::vector layer_1_weight_gradients = + galois::PointerWithSize layer_1_weight_gradients = layer_1->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); @@ -217,7 +219,7 @@ int main() { std::unique_ptr layer_2 = std::make_unique(1, test_graph, dimension_0, config); - const std::vector l2_fo = + galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); GALOIS_LOG_VERBOSE("{}", l2_fo[0]); @@ -235,31 +237,31 @@ int main() { GALOIS_LOG_VERBOSE("{}", l2_fo[12]); GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - std::vector* l2_bo = + galois::PointerWithSize l2_bo = layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - GALOIS_LOG_ASSERT(l2_bo->size() == 21); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); - 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 2486269ccd..da2a9e1be2 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -44,16 +44,16 @@ int main() { galois::StatTimer main_timer("Timer_0"); main_timer.start(); for (size_t epoch = 0; epoch < 20; epoch++) { - const std::vector* predictions = gnn->DoInference(); + galois::PointerWithSize predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", - gnn->GetGlobalAccuracy(*predictions), "\n"); + gnn->GetGlobalAccuracy(predictions), "\n"); } // check test accuracy gnn->SetLayerPhases(galois::GNNPhase::kTest); - const std::vector* predictions = gnn->DoInference(); - galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(*predictions), + galois::PointerWithSize predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), "\n"); main_timer.stop(); } diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index d43e1b0e2e..e7232ca108 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -46,7 +46,7 @@ int main() { gnn->DoInference(); // check output for layers to make sure it's as expected - const std::vector& lf0_out = + galois::PointerWithSize lf0_out = gnn->GetIntermediateLayer(0)->GetForwardOutput(); GALOIS_LOG_ASSERT(lf0_out.size() == 28); for (size_t i = 0; i < 4; i++) { @@ -71,7 +71,7 @@ int main() { GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); } - const std::vector& lf1_out = + const galois::PointerWithSize lf1_out = gnn->GetIntermediateLayer(1)->GetForwardOutput(); GALOIS_LOG_ASSERT(lf1_out.size() == 49); for (size_t i = 0; i < 7; i++) { @@ -96,7 +96,7 @@ int main() { GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); } - const std::vector& fo_out = + const galois::PointerWithSize fo_out = gnn->GetOutputLayer()->GetForwardOutput(); GALOIS_LOG_ASSERT(fo_out.size() == 49); // since row all same, prob distribution across row should be same @@ -127,7 +127,7 @@ int main() { gnn->SetLayerPhases(galois::GNNPhase::kValidate); gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); - const std::vector& fo_out_val = + const galois::PointerWithSize fo_out_val = gnn->GetOutputLayer()->GetForwardOutput(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { @@ -150,7 +150,7 @@ int main() { 
gnn->SetLayerPhases(galois::GNNPhase::kTest); gnn->SetAllLayerWeightsTo1(); gnn->DoInference(); - const std::vector& fo_out_test = + galois::PointerWithSize fo_out_test = gnn->GetOutputLayer()->GetForwardOutput(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp new file mode 100644 index 0000000000..d51a3bb54c --- /dev/null +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -0,0 +1,239 @@ +//! @file gpu-convlayer-test.cpp +//! Conv layer test with a test graph on gpus + +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + std::vector feats = test_graph.GetLocalFeatures(); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.allow_aggregate_after_update = false; + + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, + dimension_0, dcon); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + const std::vector& layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones(14, 1); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + std::vector* layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); + 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); + + const std::vector layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + const std::vector& layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones.assign(14, 1); + std::vector* layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); + 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); + + const std::vector layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.allow_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(1, test_graph, + dimension_0, config); + const std::vector l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + std::vector* l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo->size() == 21); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", 
(*l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 5d9fa87728..f7baab24fd 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -38,7 +38,7 @@ int main() { // train mode auto output_layer = std::make_unique(3, test_graph, dimension_0); - const std::vector& prediction_distribution = + galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); @@ -60,7 +60,7 @@ int main() { // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); - const std::vector& pd2 = + galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); // validate vertex is index 5 @@ -86,7 +86,7 @@ int main() { // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); - const std::vector& pd3 = + galois::PointerWithSize pd3 = output_layer->ForwardPhase(softmax_input); output_layer->BackwardPhase(softmax_input, nullptr); // validate vertex is index 6 diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 561aa95370..3ea524e4a7 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -29,12 +29,13 @@ int main() { layer_0->InitAllWeightsTo1(); // backward pass checking; check the gradients out - std::vector dummy_ones(test_graph->size() * 2, 1); + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); // gradient verification; average // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 - const std::vector& grads = + const galois::PointerWithSize& grads = layer_0->GetLayerWeightGradients(); for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(grads[i] == 12.75); From 5931edc464f9de1d72c9de5081a8c4a9fb93b4fc Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Nov 2020 19:03:35 -0600 Subject: [PATCH 399/660] Allocate GPU memory for parent GNNLayer Allocates memory for layer weights, gradients, forward output, and backward output of generic GNNLayers on the GPU. Layer specific matrices such as those on a GCN layer will need to be handled separately in later commits. 
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.cuh | 6 ---- libgnn/include/galois/layers/GNNLayer.cuh | 26 +++++++++++++++ libgnn/include/galois/layers/GNNLayer.h | 11 +++++++ libgnn/src/layers/GNNLayer.cpp | 14 ++++++++ libgnn/src/layers/GNNLayer.cu | 40 +++++++++++++++++++++++ 6 files changed, 92 insertions(+), 6 deletions(-) create mode 100644 libgnn/include/galois/layers/GNNLayer.cuh create mode 100644 libgnn/src/layers/GNNLayer.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 1ca05c8632..bf84358393 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -31,6 +31,7 @@ if (GALOIS_ENABLE_GPU) # create the galois_gnn_gpu library to get linked into galois_gnn set(gpusources src/graphs/GNNGraph.cu + src/layers/GNNLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 9e047b6a0f..c44fba7b9a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -8,24 +8,18 @@ namespace graphs { //! Similar in nature to the CUDAContext class in existing D-IrGL class GNNGraphGPUAllocations { public: - // XXX getters for everything, the rest of the setters, etc. - //! CUDA frees all allocated memory (i.e. non-nullptr) ~GNNGraphGPUAllocations(); - //! Copies graph topology over to GPU; using ints because cuSparse lib //! expects ints for the CSR arrays void SetGraphTopology(const std::vector& edge_index, const std::vector& edge_dests); - //! Host side function that allocates memory for the features on the vertices //! and copies them over to the GPU. void SetFeatures(const std::vector& features, unsigned num_features); - //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); - private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh new file mode 100644 index 0000000000..3a89c97d61 --- /dev/null +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -0,0 +1,26 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds pointers to GNN layer weights/gradient on GPU +class GNNLayerGPUAllocations { +public: + //! CUDA frees all allocated memory (i.e. non-nullptr) + ~GNNLayerGPUAllocations(); + //! Initializes forward and backward output matrices of this layer on GPU + void InitInOutMemory(size_t forward_size, size_t backward_size); + //! Initializes memory for weight and weight gradients on GPU + void InitWeightMemory(size_t num_weights); + //! Copy provided data in vector to GPU weights + void CopyToWeights(const std::vector& cpu_layer_weights); + +private: + size_t* num_weights_{nullptr}; + GNNFloat* forward_output_matrix_{nullptr}; + GNNFloat* backward_output_matrix_{nullptr}; + GNNFloat* layer_weights_{nullptr}; + GNNFloat* layer_weight_gradients_{nullptr}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e738bdacca..9636d4f8d6 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -5,6 +5,10 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GluonGradientInterface.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/GNNLayer.cuh" +#endif + namespace galois { //! Supported layer types in the GNN @@ -203,6 +207,13 @@ class GNNLayer { //! 
Synchronize weight gradients with a summation, then locally divide all //! weights to get an average void WeightGradientSyncAverage(); + +#ifdef GALOIS_ENABLE_GPU + //! Object that holds all GPU allocated pointers to memory related to layers + GNNLayerGPUAllocations gpu_memory_; + //! Copies over layer weights to GPU + void CopyLayerWeightsToGPU(); +#endif }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index a0ead51e10..a42d593a22 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -28,6 +28,9 @@ galois::GNNLayer::GNNLayer(size_t layer_num, *gradient_sync_interface_, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.InitWeightMemory(num_weight_elements); +#endif } size_t num_output_elements = @@ -35,6 +38,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, forward_output_matrix_.resize(num_output_elements, 0); backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); +#endif } void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { @@ -159,3 +167,9 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } + +#ifdef GALOIS_ENABLE_GPU +void galois::GNNLayer::CopyLayerWeightsToGPU() { + gpu_memory_.CopyToWeights(layer_weights_); +} +#endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu new file mode 100644 index 0000000000..424df92e26 --- /dev/null +++ b/libgnn/src/layers/GNNLayer.cu @@ -0,0 +1,40 @@ +#include "galois/CUDAUtil.h" +#include "galois/layers/GNNLayer.cuh" + +galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GPU layer allocations"); + CUDA_FREE(num_weights_); + CUDA_FREE(forward_output_matrix_); + CUDA_FREE(backward_output_matrix_); + CUDA_FREE(layer_weights_); + CUDA_FREE(layer_weight_gradients_); +} + +void galois::GNNLayerGPUAllocations::InitInOutMemory(size_t forward_size, + size_t backward_size) { + CUDA_CHECK(cudaMalloc((void**)(&forward_output_matrix_), + forward_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&backward_output_matrix_), + backward_size * sizeof(GNNFloat))); +} + +void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { + // num weights + CUDA_CHECK(cudaMalloc((void**)(&num_weights_), sizeof(size_t))); + CUDA_CHECK(cudaMemcpy(num_weights_, &num_weights, sizeof(size_t), + cudaMemcpyHostToDevice)); + // memory + CUDA_CHECK( + cudaMalloc((void**)(&layer_weights_), num_weights * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&layer_weight_gradients_), + num_weights * sizeof(GNNFloat))); +} + +void galois::GNNLayerGPUAllocations::CopyToWeights( + const std::vector& cpu_layer_weights) { + CUDA_CHECK(cudaMemcpy(layer_weights_, cpu_layer_weights.data(), + cpu_layer_weights.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +// TODO copy from gpu function as well just in case I need to check From 483fda6703859a96a96da30612949fcb97d1e1ad Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 6 Nov 2020 19:35:39 -0600 Subject: [PATCH 400/660] GCN layer: allocate GPU memory for matrices Allocates memory on the GPU for the intermediate matrices used in the GCN layer of the graph neural network. 
--- libgnn/CMakeLists.txt | 1 + .../galois/layers/GraphConvolutionalLayer.cuh | 19 +++++++++++++++++++ .../galois/layers/GraphConvolutionalLayer.h | 8 ++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +++ libgnn/src/layers/GraphConvolutionalLayer.cu | 19 +++++++++++++++++++ 5 files changed, 50 insertions(+) create mode 100644 libgnn/include/galois/layers/GraphConvolutionalLayer.cuh create mode 100644 libgnn/src/layers/GraphConvolutionalLayer.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index bf84358393..f556ec6ca4 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -32,6 +32,7 @@ if (GALOIS_ENABLE_GPU) set(gpusources src/graphs/GNNGraph.cu src/layers/GNNLayer.cu + src/layers/GraphConvolutionalLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh new file mode 100644 index 0000000000..6b567eab2e --- /dev/null +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -0,0 +1,19 @@ +#pragma once +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds pointers for GPU memory for GCN layer +class GCNGPUAllocations { +public: + // free memory + ~GCNGPUAllocations(); + // allocate the 3 temp arrays + void Allocate(size_t input_elements, size_t output_elements); +private: + GNNFloat* in_temp_1_{nullptr}; + GNNFloat* in_temp_2_{nullptr}; + GNNFloat* out_temp_{nullptr}; +}; + +} diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 196fa752c8..19c4e6c68c 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -1,6 +1,10 @@ #pragma once #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/GraphConvolutionalLayer.cuh" +#endif + namespace galois { class GraphConvolutionalLayer : public GNNLayer { @@ -55,6 +59,10 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! 
Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + GCNGPUAllocations gpu_memory_; +#endif }; } // namespace galois diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index e2e80ce8b1..c10c59c383 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -19,6 +19,9 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + gpu_memory_.Allocate(num_input_elements, num_output_elements); +#endif } const galois::PointerWithSize diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu new file mode 100644 index 0000000000..20e96d9777 --- /dev/null +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -0,0 +1,19 @@ +#include "galois/CUDAUtil.h" +#include "galois/layers/GraphConvolutionalLayer.cuh" + +galois::GCNGPUAllocations::~GCNGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing GCN layer allocations"); + CUDA_FREE(in_temp_1_); + CUDA_FREE(in_temp_2_); + CUDA_FREE(out_temp_); +} + +void galois::GCNGPUAllocations::Allocate(size_t input_elements, + size_t output_elements) { + CUDA_CHECK( + cudaMalloc((void**)(&in_temp_1_), input_elements * sizeof(GNNFloat))); + CUDA_CHECK( + cudaMalloc((void**)(&in_temp_2_), input_elements * sizeof(GNNFloat))); + CUDA_CHECK( + cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); +} From deb50bceb176ba4bc54883ebc5f9ba36053f6563 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:02:35 -0600 Subject: [PATCH 401/660] Disable majority GNN tests for GPU build Turn off a majority of the GNN tests for the GPU build because the data is on the GPU so doing a check of the data would involve copying it back to the CPU: this will be done later. This commit is unlikely to build as it does not include a few changes coming in later commits that make things work. 
--- libgnn/test/CMakeLists.txt | 76 +++--- libgnn/test/gpu-convlayer-test.cpp | 405 +++++++++++++++-------------- 2 files changed, 247 insertions(+), 234 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 69ef29b43f..8bec96c4d6 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,40 +2,46 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) -add_executable(convlayer-test convlayer-test.cpp) -target_link_libraries(convlayer-test galois_gnn) -add_test(NAME convlayer-test COMMAND convlayer-test) - -add_executable(softmaxlayer-test softmaxlayer-test.cpp) -target_link_libraries(softmaxlayer-test galois_gnn) -add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - -add_executable(gnnconstruct-test gnnconstruct-test.cpp) -target_link_libraries(gnnconstruct-test galois_gnn) -add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - -add_executable(gnnfb-test gnnfb-test.cpp) -target_link_libraries(gnnfb-test galois_gnn) -add_test(NAME gnnfb-test COMMAND gnnfb-test) - -add_executable(adam-test adam-test.cpp) -target_link_libraries(adam-test galois_gnn) -add_test(NAME adam-test COMMAND adam-test) - -add_executable(accuracy-test accuracy-test.cpp) -target_link_libraries(accuracy-test galois_gnn) -add_test(NAME accuracy-test COMMAND accuracy-test) - -add_executable(epoch-test epoch-test.cpp) -target_link_libraries(epoch-test galois_gnn) -add_test(NAME epoch-test COMMAND epoch-test) - -# TODO figure out how to make this test run in parallel -add_executable(aggregate-sync-test aggregate-sync-test.cpp) -target_link_libraries(aggregate-sync-test galois_gnn) -#add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) - -add_executable(weight-sync-test weight-sync-test.cpp) -target_link_libraries(weight-sync-test galois_gnn) +if (NOT GALOIS_ENABLE_GPU) + add_executable(convlayer-test convlayer-test.cpp) + target_link_libraries(convlayer-test galois_gnn) + add_test(NAME convlayer-test COMMAND convlayer-test) + + add_executable(softmaxlayer-test softmaxlayer-test.cpp) + target_link_libraries(softmaxlayer-test galois_gnn) + add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + + add_executable(gnnconstruct-test gnnconstruct-test.cpp) + target_link_libraries(gnnconstruct-test galois_gnn) + add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) + + add_executable(gnnfb-test gnnfb-test.cpp) + target_link_libraries(gnnfb-test galois_gnn) + add_test(NAME gnnfb-test COMMAND gnnfb-test) + + add_executable(adam-test adam-test.cpp) + target_link_libraries(adam-test galois_gnn) + add_test(NAME adam-test COMMAND adam-test) + + add_executable(accuracy-test accuracy-test.cpp) + target_link_libraries(accuracy-test galois_gnn) + add_test(NAME accuracy-test COMMAND accuracy-test) + + add_executable(epoch-test epoch-test.cpp) + target_link_libraries(epoch-test galois_gnn) + add_test(NAME epoch-test COMMAND epoch-test) + + # TODO figure out how to make this test run in parallel + add_executable(aggregate-sync-test aggregate-sync-test.cpp) + target_link_libraries(aggregate-sync-test galois_gnn) + #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + + add_executable(weight-sync-test weight-sync-test.cpp) + target_link_libraries(weight-sync-test galois_gnn) +else() + add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) + 
target_link_libraries(gpu-convlayer-test galois_gnn) + add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) +endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index d51a3bb54c..0123a35b17 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -17,7 +17,13 @@ int main() { galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - std::vector feats = test_graph.GetLocalFeatures(); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); + ////////////////////////////////////////////////////////////////////////////// + // doubles as a test for reading as well + GALOIS_LOG_ASSERT(7 == test_graph.size()); + GALOIS_LOG_ASSERT(21 == feats.size()); + ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -33,207 +39,208 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const std::vector& layer_0_forward_output = + const galois::PointerWithSize layer_0_forward_output = layer_0->ForwardPhase(test_graph.GetLocalFeatures()); - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 output - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 2 + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 output + //////////////////////////////////////////////////////////////////////////////// + //// since norm factors aren't invovled it is possible to do full assertions + //// 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); - GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); - GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); - GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); - GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); - GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); - GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); - GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); - GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); - GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); - GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); - GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); - ////////////////////////////////////////////////////////////////////////////// - - // dummy 1 matrix - std::vector dummy_ones(14, 1); - - // backward pass checking - // layer 0 means that an empty weight matrix is returned since there is no - // point passing back anything - std::vector* layer_0_backward_output = - layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[1] == 0); - 
GALOIS_LOG_ASSERT((*layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((*layer_0_backward_output)[20] == 0); - - const std::vector layer_0_weight_gradients = - layer_0->GetLayerWeightGradients(); - // make sure they are sane - GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - - layer_0.reset(); - - ////////////////////////////////////////////////////////////////////////////// - - // create layer 1 for testing backward prop actually giving weights back - - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, - dimension_0, dcon); - layer_1->InitAllWeightsTo1(); - const std::vector& layer_1_forward_output = - layer_1->ForwardPhase(test_graph.GetLocalFeatures()); - // same check as before for sanity purposes - GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); - GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); - GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); - GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); - GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); - GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); - GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); - GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); - GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); - GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); - GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); - GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); - GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); - - // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones.assign(14, 1); - std::vector* layer_1_backward_output = - layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - ////////////////////////////////////////////////////////////////////////////// - // check that multiplies go as expected - ////////////////////////////////////////////////////////////////////////////// - GALOIS_LOG_ASSERT(layer_1_backward_output->size() == 21); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[0] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[1] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[2] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[3] == 4); - 
GALOIS_LOG_ASSERT((*layer_1_backward_output)[4] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[5] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[6] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[7] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[8] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[9] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[10] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[11] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[12] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[13] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[14] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[15] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[16] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[17] == 4); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[18] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[19] == 2); - GALOIS_LOG_ASSERT((*layer_1_backward_output)[20] == 2); - - const std::vector layer_1_weight_gradients = - layer_1->GetLayerWeightGradients(); - // make sure they are sane - GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - - layer_1.reset(); - - ////////////////////////////////////////////////////////////////////////////// - - galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; - config.allow_aggregate_after_update = false; - - // finally, just make sure dropout and activation run without crashes - // (verification requires floating point accuracy or setting a seed which I - // don't have time for at the moment - // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(1, test_graph, - dimension_0, config); - const std::vector l2_fo = - layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - GALOIS_LOG_ASSERT(l2_fo.size() == 14); - GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - std::vector* l2_bo = - layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - GALOIS_LOG_ASSERT(l2_bo->size() == 21); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[0]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[1]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[2]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[3]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[4]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[5]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[6]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[7]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[8]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[9]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[10]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[11]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[12]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[13]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[14]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[15]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[16]); - 
GALOIS_LOG_VERBOSE("{}", (*l2_bo)[17]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[18]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[19]); - GALOIS_LOG_VERBOSE("{}", (*l2_bo)[20]); + //GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + //GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + //GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + //GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + //GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + //GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + //GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + //GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + //GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + //GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + //GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + //GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + //GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + //GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + //////////////////////////////////////////////////////////////////////////////// + + //// dummy 1 matrix + //std::vector dummy_ones_v(14, 1); + //galois::PointerWithSize dummy_ones(dummy_ones_v); + + //// backward pass checking + //// layer 0 means that an empty weight matrix is returned since there is no + //// point passing back anything + //galois::PointerWithSize layer_0_backward_output = + // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 backward output; all 0 because layer 0 + //////////////////////////////////////////////////////////////////////////////// + //// since norm factors aren't invovled it is possible to do full assertions + //// 7 x 3 + //GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + //GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + //galois::PointerWithSize layer_0_weight_gradients = + // layer_0->GetLayerWeightGradients(); + //// make sure they are sane + //GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + //GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + //layer_0.reset(); + + 
//////////////////////////////////////////////////////////////////////////////// + + //// create layer 1 for testing backward prop actually giving weights back + + //std::unique_ptr layer_1 = + // std::make_unique(1, test_graph, + // dimension_0, dcon); + //layer_1->InitAllWeightsTo1(); + //galois::PointerWithSize layer_1_forward_output = + // layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + //// same check as before for sanity purposes + //GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + //GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + //GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + //GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + //GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + //GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + //GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + //GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + //GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + //GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + //GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + //GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + //GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + //GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + //GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + //// since layer isn't 0 anymore, backward phase will actually return something + //dummy_ones_v.assign(14, 1); + //galois::PointerWithSize layer_1_backward_output = + // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + //////////////////////////////////////////////////////////////////////////////// + //// check that multiplies go as expected + //////////////////////////////////////////////////////////////////////////////// + //GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + //GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + //galois::PointerWithSize layer_1_weight_gradients = + // layer_1->GetLayerWeightGradients(); + //// make sure they are sane + //GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + //GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + //layer_1.reset(); + + 
//////////////////////////////////////////////////////////////////////////////// + + //galois::GNNLayerConfig config; + //config.do_dropout = true; + //config.do_activation = true; + //config.do_normalization = true; + //config.allow_aggregate_after_update = false; + + //// finally, just make sure dropout and activation run without crashes + //// (verification requires floating point accuracy or setting a seed which I + //// don't have time for at the moment + //// TODO in future maybe add better unit test for this + //std::unique_ptr layer_2 = + // std::make_unique(1, test_graph, + // dimension_0, config); + //galois::PointerWithSize l2_fo = + // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + //GALOIS_LOG_ASSERT(l2_fo.size() == 14); + //GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + //GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + //galois::PointerWithSize l2_bo = + // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //GALOIS_LOG_ASSERT(l2_bo.size() == 21); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + //GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } From 606750b85e131b08da97ff60f9ea621ee041a45e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:06:42 -0600 Subject: [PATCH 402/660] GPU arch related defs, CUDA_TEST Adds compile time constants for various GPU things like warp size, block size, etc. Also adds CUDA_TEST which is a macro for checking if a CUDA call returned successfully. 
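As a quick illustration of how these checking macros are meant to be used (a minimal sketch only; the kernel, buffer, and sizes below are hypothetical and not code from this patch): CUDA_CHECK guards a single CUDA API call, while CUDA_TEST synchronizes and reports any error a preceding kernel launch left behind.

    #include "galois/CUDAUtil.h"

    // hypothetical kernel: zero-fill a device buffer
    __global__ void ZeroFillKernel(galois::GNNFloat* data, size_t n) {
      size_t i = blockIdx.x * blockDim.x + threadIdx.x;
      if (i < n) {
        data[i] = 0.0f;
      }
    }

    // hypothetical helper showing the two macros in context
    void ZeroFillExample(size_t n) {
      galois::GNNFloat* buffer;
      // CUDA_CHECK asserts the API call returned cudaSuccess
      CUDA_CHECK(cudaMalloc((void**)(&buffer), n * sizeof(galois::GNNFloat)));
      ZeroFillKernel<<<(n + BLOCK_SIZE - 1) / BLOCK_SIZE, BLOCK_SIZE>>>(buffer, n);
      // CUDA_TEST synchronizes, then checks cudaGetLastError for the launch
      CUDA_TEST("zero fill kernel failure");
      CUDA_FREE(buffer);
    }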
--- libgnn/include/galois/CUDAUtil.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index f8d7a03b80..d479efe64d 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -5,6 +5,14 @@ #include #include "galois/Logging.h" +// TODO check these +#define CHUNK_SIZE 256 +#define TB_SIZE 256 +#define BLOCK_SIZE 256 +#define WARP_SIZE 32 +#define MAX_NUM_CLASSES 128 +#define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) + #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ @@ -22,4 +30,15 @@ } \ } while (0) +#define CUDA_TEST(msg) \ + do { \ + cudaError_t e; \ + cudaDeviceSynchronize(); \ + if (cudaSuccess != (e = cudaGetLastError())) { \ + GALOIS_LOG_ERROR("{}: {}", msg, e); \ + GALOIS_LOG_ERROR("{}", cudaGetErrorString(e)); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif From 4d218ccc2f769c4c369e42d6fec185c9c5ba100e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:18:25 -0600 Subject: [PATCH 403/660] GPU structs: getter functions Adds getter functions to the memory objects for layers and the graph. The GCN layer also declares a AggregateAllGPU function for a later commit to use + to do aggregation on the GPU. --- libgnn/include/galois/graphs/GNNGraph.cuh | 8 ++++++++ libgnn/include/galois/layers/GNNLayer.cuh | 5 +++++ .../galois/layers/GraphConvolutionalLayer.cuh | 12 +++++++++++- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index c44fba7b9a..33093d2ebc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -20,6 +20,14 @@ public: unsigned num_features); //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); + + GNNFeature* feature_vector() { return feature_vector_; }; + const GNNFeature* feature_vector() const { return feature_vector_; }; + int* edge_index() { return edge_index_; } + const int* edge_index() const { return edge_index_; } + int* edge_destinations() { return edge_destinations_; } + const int* edge_destinations() const { return edge_destinations_; } + private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 3a89c97d61..81fa9e2026 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -15,6 +15,11 @@ public: //! 
Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + GNNFloat* forward_output() { return forward_output_matrix_; } + GNNFloat* backward_output() { return backward_output_matrix_; } + GNNFloat* layer_weights() { return layer_weights_; } + GNNFloat* layer_weight_gradients() { return layer_weight_gradients_; } + private: size_t* num_weights_{nullptr}; GNNFloat* forward_output_matrix_{nullptr}; diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 6b567eab2e..993b6f39cb 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -1,5 +1,6 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" namespace galois { @@ -10,10 +11,19 @@ public: ~GCNGPUAllocations(); // allocate the 3 temp arrays void Allocate(size_t input_elements, size_t output_elements); + GNNFloat* in_temp_1() { return in_temp_1_; } + GNNFloat* in_temp_2() { return in_temp_2_; } + GNNFloat* out_temp() { return out_temp_; } + + void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, + size_t num_nodes, size_t column_length, + const GNNFloat* node_embeddings, + GNNFloat* aggregate_output); + private: GNNFloat* in_temp_1_{nullptr}; GNNFloat* in_temp_2_{nullptr}; GNNFloat* out_temp_{nullptr}; }; -} +} // namespace galois From 6629963b45b5879935c3aa596e414985c5078a08 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:22:37 -0600 Subject: [PATCH 404/660] GNNGraph: GPU object access + return GPU features Return GPU features if GPU build is on + adds function to get the GPU pointer object. --- libgnn/include/galois/graphs/GNNGraph.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2b55d17b7a..5383b325d3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -102,7 +102,13 @@ class GNNGraph { //! Return matrix of the local node features const PointerWithSize GetLocalFeatures() { +#ifndef GALOIS_ENABLE_GPU return PointerWithSize(local_node_features_); +#else + // TODO remove reliance on local_node_features + return PointerWithSize(gpu_memory_.feature_vector(), + local_node_features_.size()); +#endif } //! Given an LID and the current phase of GNN computation, determine if the @@ -121,6 +127,9 @@ class GNNGraph { void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const; +#ifdef GALOIS_ENABLE_GPU + const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } +#endif private: //! Directory for input data const std::string input_directory_; From 15de1cf298c5957e0b043b2435f159bb0e7789ea Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:33:46 -0600 Subject: [PATCH 405/660] Use PointerWithSize for functions in layers Since the code now needs to run with both CPUs and GPUs, the "orchestration" code cannot use vectors anymore. This commit replaces them with PointerWithSize which can be used by both CPUs and GPUs. This commit adds initialization code for these objects depending on the build. This commit also begins to split the calls in CPU and GPU variants: the orchestration code remains the same, but depending on the build, the code will either call a CPU version or a GPU version. 
This split is done for dropout and aggregation so far (though this commit does not include the implementation of the GPU code; just the call). --- libgnn/include/galois/layers/GNNLayer.h | 30 +++++++--- .../galois/layers/GraphConvolutionalLayer.h | 14 ++++- libgnn/src/layers/GNNLayer.cpp | 44 ++++++++++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 55 +++++++++++++++---- 4 files changed, 115 insertions(+), 28 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9636d4f8d6..b1c8ae55a5 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -144,22 +144,32 @@ class GNNLayer { GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer GNNLayerConfig config_; + //! Weights used by this layer. Dimensions: input columns by output columns std::vector layer_weights_; //! Gradients used to update the weights of this layer std::vector layer_weight_gradients_; - //! Wrapper over gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_; - //! Synchronization substrate for the weight gradients - std::unique_ptr> - gradient_sync_substrate_; - // There is a forward and a backward as their sizes will differ and we only // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. std::vector forward_output_matrix_; //! The output of the backward phase for this layer. std::vector backward_output_matrix_; + + // These are wrapper around the pointer for the data associated with + // any GNN layer: takes a CPU or GPU pointer depending on configuration + // Needed to allow both CPU/GPU runs with same code + PointerWithSize p_layer_weights_; + PointerWithSize p_layer_weight_gradients_; + PointerWithSize p_forward_output_matrix_; + PointerWithSize p_backward_output_matrix_; + + //! Wrapper over gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_; + //! Synchronization substrate for the weight gradients + std::unique_ptr> + gradient_sync_substrate_; + //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; //! RNG for dropout @@ -188,11 +198,15 @@ class GNNLayer { //! Randomly init a float vector using the class's random init RNG void RandomInitVector(std::vector* vector_to_init); + //! CPU variant of dropout + void DoDropoutCPU(const PointerWithSize input_to_drop, + PointerWithSize* output_matrix); + //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate void DoDropout(const PointerWithSize input_to_drop, - std::vector* output_matrix); + PointerWithSize* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); @@ -210,7 +224,7 @@ class GNNLayer { #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers - GNNLayerGPUAllocations gpu_memory_; + GNNLayerGPUAllocations gpu_object_; //! 
Copies over layer weights to GPU void CopyLayerWeightsToGPU(); #endif diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 19c4e6c68c..c677389df7 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -39,6 +39,12 @@ class GraphConvolutionalLayer : public GNNLayer { // Temporary matrix the size of the output of the forward pass; used if // an intermediate op occurs before writing to the final output matrix std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + // Each thread has a vector of size # input columns or # output columns for // storing intermediate results during aggregation. // The one used depeneds on if aggregation occurs before or after the mxm. @@ -47,6 +53,12 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage> output_column_intermediates_; + //! CPU aggregation + void AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + //! Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor @@ -61,7 +73,7 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); #ifdef GALOIS_ENABLE_GPU - GCNGPUAllocations gpu_memory_; + GCNGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index a42d593a22..8bf9c42dfd 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -29,7 +29,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); #ifdef GALOIS_ENABLE_GPU - gpu_memory_.InitWeightMemory(num_weight_elements); + gpu_object_.InitWeightMemory(num_weight_elements); #endif } @@ -39,10 +39,31 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - gpu_memory_.InitInOutMemory(num_output_elements, + gpu_object_.InitInOutMemory(num_output_elements, layer_dimensions_.input_rows * layer_dimensions_.input_columns); #endif + + // initialize the PointerWithSize wrappers +#ifndef GALOIS_ENABLE_GPU + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = + PointerWithSize(backward_output_matrix_); +#else + p_layer_weights_ = PointerWithSize(gpu_object_.layer_weights(), + layer_weights_.size()); + p_layer_weight_gradients_ = PointerWithSize( + gpu_object_.layer_weight_gradients(), layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + gpu_object_.backward_output(), backward_output_matrix_.size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally they + // aren't initialized +#endif } void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { @@ -67,11 +88,9 @@ 
void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { galois::loopname("RandomInitVector")); } -// XXX Something is wrong with dropout; accuracy suffers, figure out what -// it is -void galois::GNNLayer::DoDropout( +void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, - std::vector* output_matrix) { + PointerWithSize* output_matrix) { size_t num_elements = output_matrix->size(); assert(num_elements == dropout_mask_.size()); assert(num_elements == input_to_dropout.size()); @@ -96,6 +115,17 @@ void galois::GNNLayer::DoDropout( galois::loopname("LayerDropout")); } +void galois::GNNLayer::DoDropout( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + //#ifdef GALOIS_ENABLE_GPU + // // XXX + // DoDropoutGPU(); + //#else + DoDropoutCPU(input_to_dropout, output_matrix); + //#endif +} + void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -170,6 +200,6 @@ void galois::GNNLayer::WeightGradientSyncAverage() { #ifdef GALOIS_ENABLE_GPU void galois::GNNLayer::CopyLayerWeightsToGPU() { - gpu_memory_.CopyToWeights(layer_weights_); + gpu_object_.CopyToWeights(layer_weights_); } #endif diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index c10c59c383..af2925facc 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -20,39 +20,57 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU - gpu_memory_.Allocate(num_input_elements, num_output_elements); + gpu_object_.Allocate(num_input_elements, num_output_elements); #endif + + // init pointers with size +#ifndef GALOIS_ENABLE_GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#else + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); +#endif + GALOIS_LOG_VERBOSE("Conv layer initialized"); } const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(in_temp_1_.size() == input_embeddings.size()); - assert(in_temp_2_.size() == input_embeddings.size()); - assert(forward_output_matrix_.size() == + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_in_temp_2_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &in_temp_1_); - input_data = in_temp_1_.data(); + galois::PointerWithSize drop_output(in_temp_1_); + DoDropout(input_embeddings, &drop_output); + input_data = drop_output.data(); } // flip aggregate/update if dimensions favor it (do less work) if (!config_.allow_aggregate_after_update || layer_dimensions_.input_columns <= 
layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, in_temp_2_.data(), - &input_column_intermediates_); - UpdateEmbeddings(in_temp_2_.data(), forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, + p_in_temp_2_.data(), &input_column_intermediates_); + UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); } else { // update to aggregate - UpdateEmbeddings(input_data, out_temp_.data()); - AggregateAll(layer_dimensions_.output_columns, out_temp_.data(), - forward_output_matrix_.data(), &output_column_intermediates_); + UpdateEmbeddings(input_data, p_out_temp_.data()); + AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), + p_forward_output_matrix_.data(), + &output_column_intermediates_); } // TODO synchronization of aggregation functions @@ -129,6 +147,19 @@ galois::GraphConvolutionalLayer::BackwardPhase( } void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts) { +#ifndef GALOIS_ENABLE_GPU + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#else + gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), + column_length, node_embeddings, aggregate_output); +#endif +} + +void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts) { From 336b6311ec9e70e0178932701efac10ed531ee21 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 10 Nov 2020 17:40:42 -0600 Subject: [PATCH 406/660] GCN aggregation code on GPU This commit adds the GCN aggregation code on the GPU: each warp gets a node, and the warp splits the summation of that node's feature vector among its threads. Therefore, the implementation works well when the feature vector is large enough to take advantage of the parallelism. This code was taken from the old GPU code, so it may not be fully optimized yet. cuSparse may perform better as well, but at the moment I just want to get something working.
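To make the semantics of the kernel below easier to check, here is a minimal sequential reference of what the aggregation computes. It is only a sketch: it assumes the same CSR-style edge_index/edge_destination arrays and row-major feature buffers the kernel reads, and, like this GPU version, it applies no norm-factor scaling.

    // Sequential reference (sketch only): for every source node, sum the
    // feature vectors of its outgoing neighbors into its output row. The GPU
    // kernel assigns one warp per src and strides the innermost loop across
    // the warp's lanes instead of running it on a single thread.
    for (unsigned src = 0; src < num_nodes; src++) {
      for (int e = edge_index[src]; e < edge_index[src + 1]; e++) {
        int dst = edge_destination[e];
        for (size_t i = 0; i < column_length; i++) {
          aggregate_output[src * column_length + i] +=
              node_embeddings[dst * column_length + i];
        }
      }
    }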
--- libgnn/src/layers/GraphConvolutionalLayer.cu | 62 ++++++++++++++++++++ 1 file changed, 62 insertions(+) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 20e96d9777..e5a34a3c15 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -17,3 +17,65 @@ void galois::GCNGPUAllocations::Allocate(size_t input_elements, CUDA_CHECK( cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); } + +namespace { +// GPU side aggregation call: no matrix multiply, just regular dst accesses +__global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, + const int* edge_index, + const int* edge_destination, + const galois::GNNFloat* node_embeddings, + galois::GNNFloat* aggregate_output) { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + if (thread_lane < 2) { + edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + // essentially what this is doing is making 2 of the threads set edge + // begin/end; all threads wait for sync + __syncthreads(); + + const int row_begin = edge_begin_end[warp_lane][0]; + const int row_end = edge_begin_end[warp_lane][1]; + unsigned base_src_index = src * column_length; + + for (int offset = row_begin; offset < row_end; offset++) { + int dst = edge_destination[offset]; + unsigned base_dst_index = dst * column_length; + + // NOTE: this is where warp diverges + // the feature aggregation is split among thread in a warp + for (int i = 0; i < column_length; i += WARP_SIZE) { + if ((thread_lane + i) < column_length) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } + } + } + } +} + +} // namespace + +void galois::GCNGPUAllocations::AggregateAllGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output) { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), node_embeddings, aggregate_output); + CUDA_TEST("GPU aggregate all failure"); +} From f46332a58ce93c2692e5427e2ce448e07dbcc47f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 12:55:41 -0600 Subject: [PATCH 407/660] Adds CuBLAS SGEMM function This commit adds a function to call CuBLAS's SGEMM for doing matrix multiplies on the GPU using row-major matrices. Note the way matrices are passed into the function: B * A instead of A * B. The idea is that the CuBLAS function assumes column-major, so using row-major matrices makes it so that it's a transpose matrix. (BA)^T = C^T, and C^T in column-major form is a row-major C (exactly what we want). 
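Spelling out the identity the message relies on (a worked restatement, not new code): the goal is C = A * B with row-major A (m x k) and B (k x n). Because cuBLAS assumes column-major storage, it reads those same buffers as A^T and B^T, so calling SGEMM with the operands swapped makes it compute

    B^T * A^T = (A * B)^T = C^T   (column-major),

and a column-major C^T occupies memory exactly like a row-major C. The caller therefore gets the desired product without any explicit transposes or extra copies.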
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/CUDAUtil.h | 14 ++++++++++++-- libgnn/include/galois/GNNMath.cuh | 22 ++++++++++++++++++++++ libgnn/src/GNNMath.cu | 26 ++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/GNNMath.cuh create mode 100644 libgnn/src/GNNMath.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index f556ec6ca4..d5771552ca 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -30,6 +30,7 @@ if (GALOIS_ENABLE_GPU) # create the galois_gnn_gpu library to get linked into galois_gnn set(gpusources + src/GNNMath.cu src/graphs/GNNGraph.cu src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index d479efe64d..7af2f6c1e8 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -1,8 +1,9 @@ -#ifdef GALOIS_ENABLE_GPU +#ifndef GALOIS_CUDA_UTIL +#define GALOIS_CUDA_UTIL //! @file CUDAUtil.h //! Contains various utility functions for CUDA. -#pragma once #include +#include #include "galois/Logging.h" // TODO check these @@ -41,4 +42,13 @@ } \ } while (0) +#define CUBLAS_CHECK(condition) \ + do { \ + cublasStatus_t status = condition; \ + if (status != CUBLAS_STATUS_SUCCESS) { \ + GALOIS_LOG_ERROR("CuBLAS error code : {}", status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh new file mode 100644 index 0000000000..6a7cbbac43 --- /dev/null +++ b/libgnn/include/galois/GNNMath.cuh @@ -0,0 +1,22 @@ +#ifndef GALOIS_GNN_MATH_CUDA +#define GALOIS_GNN_MATH_CUDA +#include "galois/GNNTypes.h" +#include "galois/CUDAUtil.h" + +namespace galois { + +extern bool cublas_is_init; +extern cublasHandle_t global_cublas_handle; + +//! Initializes the cublas handle to use cublas on GPUs. +void InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } + +//! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using +//! CuBLAS. +void CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output); + +} // namespace galois +#endif diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu new file mode 100644 index 0000000000..d4c4108785 --- /dev/null +++ b/libgnn/src/GNNMath.cu @@ -0,0 +1,26 @@ +#include "galois/GNNMath.cuh" + +bool galois::cublas_is_init = false; +cublasHandle_t galois::global_cublas_handle; + +void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output) { + if (!cublas_is_init) { + InitCuBLAS(); + } + size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; + size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; + float dummy0 = 0.0; + float dummy1 = 1.0; + // because cusparse assumes column major even though we're passing in row + // major, the order of multiply is reversed so that it does what we + // want anyways + // https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication + CUBLAS_CHECK(cublasSgemm(global_cublas_handle, trans_b, trans_a, + output_columns, input_rows, input_columns, &dummy1, + b, lead_dim_b, a, lead_dim_a, &dummy0, output, + output_columns)); +} From 771460ec89f2e124b25ba77385db4cf7fd4312c6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:01:52 -0600 Subject: [PATCH 408/660] Renames base gpu alloc object in GNNLayer Before this commit the GPU alloc object in GNNLayer and its children were named the same. This commit changes the name of the base object to something else so that children classes can use it without name conflict. --- libgnn/include/galois/layers/GNNLayer.h | 2 +- libgnn/src/layers/GNNLayer.cpp | 21 +++++++++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index b1c8ae55a5..5e2d4708ba 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -224,7 +224,7 @@ class GNNLayer { #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers - GNNLayerGPUAllocations gpu_object_; + GNNLayerGPUAllocations base_gpu_object_; //! Copies over layer weights to GPU void CopyLayerWeightsToGPU(); #endif diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8bf9c42dfd..198c40985c 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -29,7 +29,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); #ifdef GALOIS_ENABLE_GPU - gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitWeightMemory(num_weight_elements); #endif } @@ -39,9 +39,9 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - gpu_object_.InitInOutMemory(num_output_elements, - layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); #endif // initialize the PointerWithSize wrappers @@ -53,14 +53,15 @@ galois::GNNLayer::GNNLayer(size_t layer_num, p_backward_output_matrix_ = PointerWithSize(backward_output_matrix_); #else - p_layer_weights_ = PointerWithSize(gpu_object_.layer_weights(), + p_layer_weights_ = PointerWithSize(base_gpu_object_.layer_weights(), layer_weights_.size()); - p_layer_weight_gradients_ = PointerWithSize( - gpu_object_.layer_weight_gradients(), layer_weight_gradients_.size()); + p_layer_weight_gradients_ = + PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( - gpu_object_.forward_output(), forward_output_matrix_.size()); + base_gpu_object_.forward_output(), forward_output_matrix_.size()); p_backward_output_matrix_ = PointerWithSize( - gpu_object_.backward_output(), backward_output_matrix_.size()); + base_gpu_object_.backward_output(), backward_output_matrix_.size()); // TODO can clear the cpu side vectors/don't use .size() 
since optimally they // aren't initialized #endif @@ -200,6 +201,6 @@ void galois::GNNLayer::WeightGradientSyncAverage() { #ifdef GALOIS_ENABLE_GPU void galois::GNNLayer::CopyLayerWeightsToGPU() { - gpu_object_.CopyToWeights(layer_weights_); + base_gpu_object_.CopyToWeights(layer_weights_); } #endif From c508c09bc5a9fc3cf234a08e1b8729f729f9513a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:17:14 -0600 Subject: [PATCH 409/660] Link CuBLAS to GPU build, multiple def. fix Links cuBLAS to the GPU GNN library. Moves the definition of the cuBLAS init function to the .cu file because nvcc doesn't seem to work the same way as gcc/g++ in terms of multiple includes (header guards/pragma once don't seem to work). --- libgnn/CMakeLists.txt | 2 +- libgnn/include/galois/GNNMath.cuh | 2 +- libgnn/src/GNNMath.cu | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index d5771552ca..ff7d47a07d 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -45,7 +45,7 @@ if (GALOIS_ENABLE_GPU) ) # link to gpu lib (which takes care of moderngpu and cub) - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support) + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 6a7cbbac43..763799f838 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -9,7 +9,7 @@ extern bool cublas_is_init; extern cublasHandle_t global_cublas_handle; //! Initializes the cublas handle to use cublas on GPUs. -void InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } +void InitCuBLAS(); //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index d4c4108785..be396a153e 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -3,6 +3,8 @@ bool galois::cublas_is_init = false; cublasHandle_t galois::global_cublas_handle; +void galois::InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } + void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From 667e94915916f08d314bd39351c2007025e1a4a8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 13:19:46 -0600 Subject: [PATCH 410/660] UpdateEmbeddingsGPU function added This commit splits the update embeddings call in the GCN layer into a CPU and a GPU version. The GPU version calls a function which at the moment will call into cuBLAS to do the multiplication. Next step is to unit test the forward pass to make sure the results are sane. This commit also includes a clang-format run on SoftmaxLayer (which is currently not used anywhere, as it was an example Xuhao added).
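For quick reference while reading the diff below, the multiply being moved to the GPU is the dense forward update output = node_embeddings x layer_weights. A minimal caller-side sketch follows, using the 7-node, 3-input-column, 2-output-column shapes that the unit tests in this series use; the variable names are assumptions that mirror the layer code, not part of this patch:

    // Sketch only: the forward embedding update is a single SGEMM.
    //   forward_output (7 x 2) = node_embeddings (7 x 3) * layer_weights (3 x 2)
    // The caller keeps thinking in row-major with non-transposed operands;
    // the row-major/column-major swap happens inside CBlasSGEMMGPU.
    CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, /*input_rows=*/7,
                  /*input_columns=*/3, /*output_columns=*/2, node_embeddings,
                  layer_weights, forward_output);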
--- .../galois/layers/GraphConvolutionalLayer.cuh | 5 +++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 9 +++++++++ libgnn/src/layers/GraphConvolutionalLayer.cu | 10 +++++++++- libgnn/src/layers/SoftmaxLayer.cu | 14 +++++--------- 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 993b6f39cb..4b28916db5 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -20,6 +20,11 @@ public: const GNNFloat* node_embeddings, GNNFloat* aggregate_output); + void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, GNNFloat* output); + private: GNNFloat* in_temp_1_{nullptr}; GNNFloat* in_temp_2_{nullptr}; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index af2925facc..1cd7a34f40 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -215,10 +215,19 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + +#ifndef GALOIS_ENABLE_GPU + // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, layer_weights_.data(), output); +#else + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); +#endif } void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index e5a34a3c15..0af2201829 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -1,4 +1,4 @@ -#include "galois/CUDAUtil.h" +#include "galois/GNNMath.cuh" #include "galois/layers/GraphConvolutionalLayer.cuh" galois::GCNGPUAllocations::~GCNGPUAllocations() { @@ -79,3 +79,11 @@ void galois::GCNGPUAllocations::AggregateAllGPU( gpu_graph.edge_destinations(), node_embeddings, aggregate_output); CUDA_TEST("GPU aggregate all failure"); } + +void galois::GCNGPUAllocations::UpdateEmbeddingsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, const GNNFloat* layer_weights, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, + output_columns, node_embeddings, layer_weights, output); +} diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index d9ed5fc0ff..c3f61dcf6f 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -3,26 +3,22 @@ #include "galois/layers/SoftmaxLayer.h" // Allocate memory and initialize -void galois::SoftmaxLayer::Init() { -} +void galois::SoftmaxLayer::Init() {} // Input: in_tensor // Output: out_tensor void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, - galois::GNNFloat* out_tensor) { -} + galois::GNNFloat* out_tensor) {} // Input: in_tensor // Input: out_tensor // Input: out_gradients // Output: in_gradients -// Note: although out_gradients is an input data, +// Note: although 
out_gradients is an input data, // it is not const because it can be reused -// to hold intermediate data inside this function, +// to hold intermediate data inside this function, // to avoid allocating more memory void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, const galois::GNNFloat* out_tensor, galois::GNNFloat* in_gradients, - galois::GNNFloat* out_gradients) { -} - + galois::GNNFloat* out_gradients) {} From 32829a25759a04b9821b5623bc4ea849f586b387 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 15:52:21 -0600 Subject: [PATCH 411/660] Changing weights on CPU copies it to GPU Initialization of weights on CPU did not initialize weights on GPU: this commit adds a call to copy over newly initialized CPU weights over to the GPU. Moves init of GPU memory before the CPU weights are initialized as well to prevent a nullptr copy. Also adds a debug function to print vectors on the GPU. --- libgnn/include/galois/CUDAUtil.h | 3 +-- libgnn/include/galois/layers/GNNLayer.cuh | 5 +++++ libgnn/include/galois/layers/GNNLayer.h | 11 ++++++++++ libgnn/src/layers/GNNLayer.cpp | 20 ++++++++++++++++--- libgnn/src/layers/GNNLayer.cu | 18 +++++++++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 ++--- libgnn/src/layers/GraphConvolutionalLayer.cu | 4 ++++ 7 files changed, 58 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 7af2f6c1e8..6a7e7e9915 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -18,8 +18,7 @@ do { \ cudaError_t error = condition; \ if (error != cudaSuccess) { \ - GALOIS_LOG_ERROR("CUDA error: {}", cudaGetErrorString(error)); \ - exit(EXIT_FAILURE); \ + GALOIS_LOG_FATAL("CUDA error: {}", cudaGetErrorString(error)); \ } \ } while (0) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 81fa9e2026..b1e5290761 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -14,6 +14,11 @@ public: void InitWeightMemory(size_t num_weights); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + //! Copy GPU forward output to the provided vector (assumes vector is already + //! correct size) + void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + //! Prints forward output matrix on gpu + void PrintForwardOutput(size_t num); GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 5e2d4708ba..e6ac1b1497 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -82,6 +82,9 @@ class GNNLayer { if (layer_weights_.size()) { layer_weights_.assign(layer_weights_.size(), 1); } +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } const PointerWithSize GetForwardOutput() { @@ -131,6 +134,14 @@ class GNNLayer { //! stored in the layer void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); +#ifdef GALOIS_ENABLE_GPU + //! Copies over forward output results to CPU + const std::vector& CopyForwardOutputFromGPU(); + void PrintForwardOutputGPU() { + base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); + } +#endif + protected: //! Layer order (starts from 0); used in backward to shortcut output as layer //! 
0 does not need to do some things that other layers need to do diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 198c40985c..7bd591e90a 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,6 +9,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { + // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input dropout_mask_.resize( @@ -18,6 +19,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_dimensions_.input_columns * layer_dimensions_.output_columns; layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + base_gpu_object_.InitWeightMemory(num_weight_elements); +#endif + GlorotBengioInit(&layer_weights_); // initialize sync substrate @@ -28,9 +33,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num, *gradient_sync_interface_, galois::runtime::getSystemNetworkInterface().ID, galois::runtime::getSystemNetworkInterface().Num, false); -#ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitWeightMemory(num_weight_elements); -#endif } size_t num_output_elements = @@ -77,6 +79,9 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { for (size_t i = 0; i < vector_to_init->size(); i++) { (*vector_to_init)[i] = dist(rng); } +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { @@ -87,6 +92,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); }, galois::loopname("RandomInitVector")); +#ifdef GALOIS_ENABLE_GPU + CopyLayerWeightsToGPU(); +#endif } void galois::GNNLayer::DoDropoutCPU( @@ -203,4 +211,10 @@ void galois::GNNLayer::WeightGradientSyncAverage() { void galois::GNNLayer::CopyLayerWeightsToGPU() { base_gpu_object_.CopyToWeights(layer_weights_); } + +const std::vector& +galois::GNNLayer::CopyForwardOutputFromGPU() { + base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); + return forward_output_matrix_; +} #endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 424df92e26..64be961e4b 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -37,4 +37,22 @@ void galois::GNNLayerGPUAllocations::CopyToWeights( cudaMemcpyHostToDevice)); } +void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( + std::vector* cpu_forward_output) { + CUDA_CHECK(cudaMemcpy(cpu_forward_output->data(), forward_output_matrix_, + cpu_forward_output->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +namespace { +__global__ void PrintVector(galois::GNNFloat* v, unsigned size) { + for (unsigned i = 0; i < size; i++) { + printf("%u %f\n", i, v[i]); + } +} +} // namespace + // TODO copy from gpu function as well just in case I need to check +void galois::GNNLayerGPUAllocations::PrintForwardOutput(size_t size) { + PrintVector<<<1, 1>>>(forward_output_matrix_, size); +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1cd7a34f40..61a0abaf4c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -80,9 +80,9 @@ galois::GraphConvolutionalLayer::ForwardPhase( Activation(); } - assert(forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() == 
(layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - return forward_output_matrix_; + return p_forward_output_matrix_; } galois::PointerWithSize @@ -215,7 +215,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { - #ifndef GALOIS_ENABLE_GPU // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 0af2201829..7161580ee3 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -65,6 +65,10 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, } } } + //__syncthreads(); + // if (thread_lane == 0) { + // printf("Agg %d %f\n", src, aggregate_output[base_src_index]); + //} } } From 9527c9013a77010971cb26c5b4223e2a639f9f80 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 15:54:00 -0600 Subject: [PATCH 412/660] GPU conv layer forward pass test Reenable assertions for the GPU GCN layer forward pass which now works. The next step is to get the backward pass test working which involves adding a function for user code to copy things over to CUDA without needing to include the CUDA header + adding the appropriate GPU functions in the backend. --- libgnn/test/gpu-convlayer-test.cpp | 343 +++++++++++++++-------------- 1 file changed, 175 insertions(+), 168 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 0123a35b17..7326c1a911 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -39,208 +39,215 @@ int main() { dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner - const galois::PointerWithSize layer_0_forward_output = - layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + layer_0->PrintForwardOutputGPU(); + const std::vector& layer_0_forward_output = + layer_0->CopyForwardOutputFromGPU(); //////////////////////////////////////////////////////////////////////////////// //// sanity check layer 0 output //////////////////////////////////////////////////////////////////////////////// - //// since norm factors aren't invovled it is possible to do full assertions - //// 7 x 2 + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - //GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); - //GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - //GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); - //GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); - //GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); - //GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); - //GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); - //GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); - //GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); - //GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); - //GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); - //GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); - //GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); - //GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + 
GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 15); //////////////////////////////////////////////////////////////////////////////// - //// dummy 1 matrix - //std::vector dummy_ones_v(14, 1); - //galois::PointerWithSize dummy_ones(dummy_ones_v); + // dummy 1 matrix + // std::vector dummy_ones_v(14, 1); + // galois::PointerWithSize dummy_ones(dummy_ones_v); - //// backward pass checking - //// layer 0 means that an empty weight matrix is returned since there is no - //// point passing back anything - //galois::PointerWithSize layer_0_backward_output = + // XXX TODO copy this over to the GPU + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + // galois::PointerWithSize layer_0_backward_output = // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //////////////////////////////////////////////////////////////////////////////// - //// sanity check layer 0 backward output; all 0 because layer 0 - //////////////////////////////////////////////////////////////////////////////// - //// since norm factors aren't invovled it is possible to do full assertions - //// 7 x 3 - //GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - //GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - - //galois::PointerWithSize layer_0_weight_gradients = + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + // GALOIS_LOG_ASSERT(layer_0_backward_output.size() 
== 21); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + // GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + // galois::PointerWithSize layer_0_weight_gradients = // layer_0->GetLayerWeightGradients(); //// make sure they are sane - //GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - //GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + // GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - //layer_0.reset(); + // layer_0.reset(); //////////////////////////////////////////////////////////////////////////////// - //// create layer 1 for testing backward prop actually giving weights back + // create layer 1 for testing backward prop actually giving weights back - //std::unique_ptr layer_1 = - // std::make_unique(1, test_graph, - // dimension_0, dcon); - //layer_1->InitAllWeightsTo1(); - //galois::PointerWithSize layer_1_forward_output = - // layer_1->ForwardPhase(test_graph.GetLocalFeatures()); - //// same check as before for sanity purposes - //GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); - //GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); - //GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); - //GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); - //GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); - //GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); - //GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); - //GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); - //GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); - //GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); - //GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); - //GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); - //GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); - //GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); - //GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); - - //// since layer isn't 0 anymore, backward phase 
will actually return something - //dummy_ones_v.assign(14, 1); - //galois::PointerWithSize layer_1_backward_output = + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 6); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 30); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); + + // since layer isn't 0 anymore, backward phase will actually return something + // dummy_ones_v.assign(14, 1); + // galois::PointerWithSize layer_1_backward_output = // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //////////////////////////////////////////////////////////////////////////////// - //// check that multiplies go as expected - //////////////////////////////////////////////////////////////////////////////// - //GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); - //GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - - //galois::PointerWithSize layer_1_weight_gradients = + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + // GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + // 
GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + // GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + // galois::PointerWithSize layer_1_weight_gradients = // layer_1->GetLayerWeightGradients(); //// make sure they are sane - //GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - //GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + // GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - //layer_1.reset(); + // layer_1.reset(); //////////////////////////////////////////////////////////////////////////////// - //galois::GNNLayerConfig config; - //config.do_dropout = true; - //config.do_activation = true; - //config.do_normalization = true; - //config.allow_aggregate_after_update = false; + // galois::GNNLayerConfig config; + // config.do_dropout = true; + // config.do_activation = true; + // config.do_normalization = true; + // config.allow_aggregate_after_update = false; //// finally, just make sure dropout and activation run without crashes //// (verification requires floating point accuracy or setting a seed which I //// don't have time for at the moment //// TODO in future maybe add better unit test for this - //std::unique_ptr layer_2 = + // std::unique_ptr layer_2 = // std::make_unique(1, test_graph, // dimension_0, config); - //galois::PointerWithSize l2_fo = + // galois::PointerWithSize l2_fo = // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - //GALOIS_LOG_ASSERT(l2_fo.size() == 14); - //GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - //GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - //galois::PointerWithSize l2_bo = + // GALOIS_LOG_ASSERT(l2_fo.size() == 14); + // GALOIS_LOG_VERBOSE("{}", 
l2_fo[0]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + // GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + // galois::PointerWithSize l2_bo = // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - //GALOIS_LOG_ASSERT(l2_bo.size() == 21); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); - //GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + // GALOIS_LOG_ASSERT(l2_bo.size() == 21); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + // GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } From 818b68564850a5202adae75653c9f63fd2a85ad5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 16:29:02 -0600 Subject: [PATCH 413/660] When init CuBLAS, set var to true --- libgnn/src/GNNMath.cu | 1 + 1 file changed, 1 insertion(+) diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index be396a153e..06a3dc5983 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -12,6 +12,7 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, GNNFloat* output) { if (!cublas_is_init) { InitCuBLAS(); + cublas_is_init = true; } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; From 25a8d1976a0102f42b3ebe63c25260c5085b2dbe Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 16:52:56 -0600 Subject: [PATCH 414/660] GPU allocation helper function on GNNLayer Adds a function that allocates GPU memory and copies over a particular passed in vector to the GPU. At the moment there is no way to free this memory and it will leak. This function is added mostly for unit test purposes and should not be used otherwise. --- libgnn/include/galois/layers/GNNLayer.cuh | 4 ++++ libgnn/include/galois/layers/GNNLayer.h | 4 ++++ libgnn/src/layers/GNNLayer.cu | 13 +++++++++++++ 3 files changed, 21 insertions(+) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index b1e5290761..951b1c2775 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -20,6 +20,10 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Helper function: give a vector which is copied over to the GPU (new + //! memory is allocated as necessary) + GNNFloat* Allocate(const std::vector& v); + GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } GNNFloat* layer_weights() { return layer_weights_; } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e6ac1b1497..143e3a2cb2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -135,6 +135,10 @@ class GNNLayer { void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); #ifdef GALOIS_ENABLE_GPU + //! Utility function for allocating + PointerWithSize AllocateGPU(const std::vector& v) { + return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); + } //! Copies over forward output results to CPU const std::vector& CopyForwardOutputFromGPU(); void PrintForwardOutputGPU() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 64be961e4b..941926f1f6 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -44,6 +44,19 @@ void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( cudaMemcpyDeviceToHost)); } +galois::GNNFloat* +galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { + // TODO keep track of these so that on destruction they can be freed + // accordingly; for now I'll let them leak + galois::GNNFloat* to_return = nullptr; + CUDA_CHECK( + cudaMalloc((void**)(&to_return), v.size() * sizeof(galois::GNNFloat))); + CUDA_CHECK(cudaMemcpy(to_return, v.data(), + v.size() * sizeof(galois::GNNFloat), + cudaMemcpyHostToDevice)); + return to_return; +} + namespace { __global__ void PrintVector(galois::GNNFloat* v, unsigned size) { for (unsigned i = 0; i < size; i++) { From 75719d0eec0e08ca535ee267b9a337ab8f35fc8c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 17:16:03 -0600 Subject: [PATCH 415/660] Prepping GCN Layer backward phase for GPU code Add ifdefs to separate CPU/GPU code in the backward step and also change the structures being used to PointerWithSize (gpus don't like CPU vectors). Added a few TODOs too for better code organization later. 
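As one possible shape for the "put this in a function" TODOs added here, a small private helper could keep the backend ifdef in a single place. This is only a sketch with a hypothetical name (ComputeWeightGradient), not code this patch introduces:

    // Hypothetical helper (not part of this patch): one home for the CPU/GPU
    // ifdef so BackwardPhase does not repeat it at both weight-gradient sites.
    void galois::GraphConvolutionalLayer::ComputeWeightGradient(
        const GNNFloat* prev_layer_input, const GNNFloat* gradients) {
    #ifndef GALOIS_ENABLE_GPU
      galois::CBlasSGEMM(CblasTrans, CblasNoTrans,
                         layer_dimensions_.input_columns,
                         layer_dimensions_.input_rows,
                         layer_dimensions_.output_columns, prev_layer_input,
                         gradients, p_layer_weight_gradients_.data());
    #else
      // XXX GPU path to be filled in once the GPU gradient functions exist
    #endif
    }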
--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 37 +++++++++++++------ 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 61a0abaf4c..1dbfdacb2b 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -103,47 +103,58 @@ galois::GraphConvolutionalLayer::BackwardPhase( // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(in_temp_1_.size() == + assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); - UpdateEmbeddingsDerivative(input_gradient->data(), in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, in_temp_1_.data(), - backward_output_matrix_.data(), + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), &input_column_intermediates_); } // weight gradient calculation + // TODO put this in a function to put the ifdef in there +#ifndef GALOIS_ENABLE_GPU galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, prev_layer_input.data(), input_gradient->data(), - layer_weight_gradients_.data()); + p_layer_weight_gradients_.data()); +#else + // XXX +#endif } else { // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - out_temp_.data(), &output_column_intermediates_); + p_out_temp_.data(), &output_column_intermediates_); if (layer_number_ != 0) { // derivative for update - UpdateEmbeddingsDerivative(out_temp_.data(), - backward_output_matrix_.data()); + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); } + // TODO put this in a function +#ifndef GALOIS_ENABLE_GPU // weight gradient; note the use of the aggregated gradient in out_temp galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, prev_layer_input.data(), out_temp_.data(), - layer_weight_gradients_.data()); + p_layer_weight_gradients_.data()); +#else + // XXX +#endif } // sync weight gradients; note aggregation sync occurs in the function call // already + // TODO figure out how to do this with GPUs WeightGradientSyncAverage(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); } - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } void galois::GraphConvolutionalLayer::AggregateAll( @@ -231,12 +242,16 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { - assert(layer_weights_.size() == + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifndef GALOIS_ENABLE_GPU // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, layer_weights_.data(), 
output); +#else + // XXX +#endif } From 6af03430cf2a8e56b32d748c60c35e3d26c17734 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 18:25:57 -0600 Subject: [PATCH 416/660] GPU GCN weight gradient/layer gradient calc Adds functions for calculating the weight and layer gradients of the GCN layer. Untested: the tests will be added in a commit down the line. --- .../galois/layers/GraphConvolutionalLayer.cuh | 9 +++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 18 ++++++++++++++---- libgnn/src/layers/GraphConvolutionalLayer.cu | 17 +++++++++++++++++ 3 files changed, 40 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index 4b28916db5..fd4d9d76f0 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -24,6 +24,15 @@ public: size_t output_columns, const GNNFloat* node_embeddings, const GNNFloat* layer_weights, GNNFloat* output); + void UpdateEmbeddingsDerivativeGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, + GNNFloat* output); + + void GetWeightGradientsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, const GNNFloat* prev_input, + const GNNFloat* gradients, GNNFloat* output); private: GNNFloat* in_temp_1_{nullptr}; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1dbfdacb2b..ef9d3cbb03 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -120,7 +120,10 @@ galois::GraphConvolutionalLayer::BackwardPhase( prev_layer_input.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #else - // XXX + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { // aggregate occurs regardless of layer being equal to 0 because it is @@ -138,10 +141,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), out_temp_.data(), + prev_layer_input.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); #else - // XXX + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #endif } @@ -252,6 +258,10 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( layer_dimensions_.input_columns, gradients, layer_weights_.data(), output); #else - // XXX + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 7161580ee3..5b6124211d 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -91,3 +91,20 @@ void galois::GCNGPUAllocations::UpdateEmbeddingsGPU( CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, output_columns, 
node_embeddings, layer_weights, output); } + +void galois::GCNGPUAllocations::UpdateEmbeddingsDerivativeGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* gradients, const GNNFloat* layer_weights, + GNNFloat* output) { + // note output clumns/input columns are flipped due to transpose of the + // layer weights + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, num_nodes, output_columns, + input_columns, gradients, layer_weights, output); +} + +void galois::GCNGPUAllocations::GetWeightGradientsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* prev_input, const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, num_nodes, + output_columns, prev_input, gradients, output); +} From d1a7eff9376ce0ab72dd619330375c18355920e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 18:51:50 -0600 Subject: [PATCH 417/660] Functions for backward/weight gradient from GPU Add functions to copy the backward output and weight gradients of a layer from GPU to CPU. Also moved some function definitions to the header since the definitions were quite small. --- libgnn/include/galois/layers/GNNLayer.cuh | 7 +++++++ libgnn/include/galois/layers/GNNLayer.h | 22 +++++++++++++++++++--- libgnn/src/layers/GNNLayer.cpp | 12 ------------ libgnn/src/layers/GNNLayer.cu | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 951b1c2775..387b1673c4 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -17,6 +17,13 @@ public: //! Copy GPU forward output to the provided vector (assumes vector is already //! correct size) void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + //! Copy GPU backward output to the provided vector (assumes vector is already + //! correct size) + void CopyBackwardOutputToCPU(std::vector* cpu_backward_output); + //! Copy GPU weight gradients to the provided vector (assumes vector is + //! already correct size) + void CopyWeightGradientsToCPU(std::vector* cpu_gradients); + //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 143e3a2cb2..f4acec8f25 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -139,8 +139,22 @@ class GNNLayer { PointerWithSize AllocateGPU(const std::vector& v) { return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); } - //! Copies over forward output results to CPU - const std::vector& CopyForwardOutputFromGPU(); + //! Copies over forward output results to CPU from GPU + const std::vector& CopyForwardOutputFromGPU() { + base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); + return forward_output_matrix_; + } + //! Copies over backward output results to CPU from GPU + const std::vector& CopyBackwardOutputFromGPU() { + base_gpu_object_.CopyBackwardOutputToCPU(&backward_output_matrix_); + return backward_output_matrix_; + } + //! Copies over weight gradients to CPU from GPU + const std::vector& CopyWeightGradientsFromGPU() { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + return layer_weight_gradients_; + } + void PrintForwardOutputGPU() { base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); } @@ -241,7 +255,9 @@ class GNNLayer { //! 
Object that holds all GPU allocated pointers to memory related to layers GNNLayerGPUAllocations base_gpu_object_; //! Copies over layer weights to GPU - void CopyLayerWeightsToGPU(); + void CopyLayerWeightsToGPU() { + base_gpu_object_.CopyToWeights(layer_weights_); + } #endif }; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 7bd591e90a..3cfbd990a0 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -206,15 +206,3 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } - -#ifdef GALOIS_ENABLE_GPU -void galois::GNNLayer::CopyLayerWeightsToGPU() { - base_gpu_object_.CopyToWeights(layer_weights_); -} - -const std::vector& -galois::GNNLayer::CopyForwardOutputFromGPU() { - base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); - return forward_output_matrix_; -} -#endif diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 941926f1f6..597fba96bd 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -44,6 +44,20 @@ void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( cudaMemcpyDeviceToHost)); } +void galois::GNNLayerGPUAllocations::CopyBackwardOutputToCPU( + std::vector* cpu_backward_output) { + CUDA_CHECK(cudaMemcpy(cpu_backward_output->data(), backward_output_matrix_, + cpu_backward_output->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +void galois::GNNLayerGPUAllocations::CopyWeightGradientsToCPU( + std::vector* cpu_gradients) { + CUDA_CHECK(cudaMemcpy(cpu_gradients->data(), layer_weight_gradients_, + cpu_gradients->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed From 614651fad02088f967f06424a6b79d5b5e86e748 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 19:09:16 -0600 Subject: [PATCH 418/660] Readded zero'ing of output matrix for aggregation Since the aggregation in the GPU doesn't actually overwrite but adds to, the entire output matrix needs to be zero'd out before anything is done on it else you will have garbage values on it. --- libgnn/src/layers/GraphConvolutionalLayer.cu | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 5b6124211d..7828336b28 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -78,6 +78,8 @@ void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output) { + CUDA_CHECK(cudaMemset(aggregate_output, 0, + num_nodes * column_length * sizeof(GNNFloat))); AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), gpu_graph.edge_destinations(), node_embeddings, aggregate_output); From 068387cbc5e766e853e3ecadeb5ff5a06f19d22f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Nov 2020 19:10:12 -0600 Subject: [PATCH 419/660] GPU GCN layer unit test: simple forward/backward The forward and backward pass of a GCN layer without dropout/activation works fine now. All that is left for fully functioning GPU code is the output layer (softmax). 
Dropout and activation are nice to have but are not critical to "function" (though obviously they will be added). --- libgnn/test/gpu-convlayer-test.cpp | 159 +++++++++++++++-------------- 1 file changed, 81 insertions(+), 78 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 7326c1a911..f4bb4cf4d3 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -41,7 +41,6 @@ int main() { // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification - layer_0->PrintForwardOutputGPU(); const std::vector& layer_0_forward_output = layer_0->CopyForwardOutputFromGPU(); @@ -68,56 +67,58 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // dummy 1 matrix - // std::vector dummy_ones_v(14, 1); - // galois::PointerWithSize dummy_ones(dummy_ones_v); - - // XXX TODO copy this over to the GPU + std::vector dummy_ones_v(14, 1); + // TODO need to free the gpu pointer + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); // backward pass checking // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything // galois::PointerWithSize layer_0_backward_output = - // layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const std::vector& layer_0_backward_output = + layer_0->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // sanity check layer 0 backward output; all 0 because layer 0 ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - // GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - // GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); - - // galois::PointerWithSize layer_0_weight_gradients = - // layer_0->GetLayerWeightGradients(); - //// make sure they are sane - // GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - // 
GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - // GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); - - // layer_0.reset(); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + const std::vector& layer_0_weight_gradients = + layer_0->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + + layer_0.reset(); //////////////////////////////////////////////////////////////////////////////// @@ -131,7 +132,6 @@ int main() { const std::vector& layer_1_forward_output = layer_1->CopyForwardOutputFromGPU(); - // same check as before for sanity purposes GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); @@ -149,49 +149,52 @@ int main() { GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 15); // since layer isn't 0 anymore, backward phase will actually return something - // dummy_ones_v.assign(14, 1); - // galois::PointerWithSize layer_1_backward_output = - // layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const std::vector& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// - // GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); - // 
GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); - // GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); - - // galois::PointerWithSize layer_1_weight_gradients = - // layer_1->GetLayerWeightGradients(); - //// make sure they are sane - // GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - // GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); - - // layer_1.reset(); + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 2); + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + + layer_1.reset(); //////////////////////////////////////////////////////////////////////////////// + // TODO get dropout and activation working + // galois::GNNLayerConfig config; // config.do_dropout = true; // config.do_activation = true; From bbe5fe7de6b2f8e75f2738d5ef82c4b056fa6224 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:17:49 -0600 Subject: [PATCH 420/660] Softmax layer funcs prep split into CPU/GPU Add ifdefs to calls in softmax layer in preparation for GPU calls. 
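The shape of this split is a single public entry point per phase that compiles down to either the CPU helper or, in a later commit, a GPU call. A minimal self-contained sketch of that compile-time dispatch, using placeholder names rather than the real layer types:

// Illustrative sketch only: GALOIS_ENABLE_GPU selects the backend at compile
// time while callers keep using one function name. RunCPU stands in for the
// ForwardPhaseCPU/BackwardPhaseCPU helpers added in this patch.
#include <cstdio>
#include <vector>

static std::vector<float> RunCPU(const std::vector<float>& input) {
  std::vector<float> out(input);
  for (float& v : out) {
    v += 1.0f; // stand-in for the real CPU compute
  }
  return out;
}

std::vector<float> Run(const std::vector<float>& input) {
#ifndef GALOIS_ENABLE_GPU
  return RunCPU(input);
#else
  // GPU build: would hand device buffers to the CUDA-side object instead;
  // falls back to the CPU helper here since that call is not defined yet
  return RunCPU(input);
#endif
}

int main() {
  std::vector<float> result = Run({1.0f, 2.0f});
  std::printf("%f %f\n", result[0], result[1]);
  return 0;
}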
--- libgnn/include/galois/layers/SoftmaxLayer.h | 6 +++++ libgnn/src/layers/SoftmaxLayer.cpp | 26 ++++++++++++++++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 5c412f6bf3..b9f821787b 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -24,9 +24,15 @@ class SoftmaxLayer : public GNNLayer { Init(); } + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); //! Creates probability distribution of each row of input const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhaseCPU(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 07e78d3c1f..c3bfdb00e7 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -6,7 +6,7 @@ void galois::SoftmaxLayer::Init() {} const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( +galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); @@ -42,9 +42,19 @@ galois::SoftmaxLayer::ForwardPhase( return forward_output_matrix_; } +const galois::PointerWithSize +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifndef GALOIS_ENABLE_GPU + return ForwardPhaseCPU(input_embeddings); +#else + // XXX +#endif +} + galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, - PointerWithSize*) { +galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, + PointerWithSize*) { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -86,4 +96,14 @@ galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, return PointerWithSize(backward_output_matrix_); } +galois::PointerWithSize +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize a, + PointerWithSize* b) { +#ifndef GALOIS_ENABLE_GPU + return BackwardPhaseCPU(a, b); +#else + // gpu_object_.BackwardPhaseGPU( +#endif +} + // TODO function for getting loss From e735eb65fdee68269fc2e77c7f1dd69bc3360625 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:24:31 -0600 Subject: [PATCH 421/660] Dataset masks in GNNs now chars For some reason they were Labels which are not needed since the masks are essentially bitsets: they have been changed to chars to save more space. --- libgnn/include/galois/graphs/GNNGraph.h | 8 ++++---- libgnn/src/graphs/GNNGraph.cpp | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 5383b325d3..04debc019f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -160,11 +160,11 @@ class GNNGraph { // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::vector local_training_mask_; + std::vector local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::vector local_validation_mask_; + std::vector local_validation_mask_; //! 
Bitset indicating which nodes are testing nodes - std::vector local_testing_mask_; + std::vector local_testing_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -194,7 +194,7 @@ class GNNGraph { //! given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, GNNLabel* masks); + GNNRange* mask_range, char* masks); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); //! Reads the entire graph topology in (but nothing else) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 918ce3d735..6e616e851b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -253,7 +253,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( //! given a name, mask type, and arrays to save into size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, GNNLabel* masks) { + GNNRange* mask_range, char* masks) { size_t range_begin; size_t range_end; From 85c544e7355c9270d3726ef91f2b4124886237aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 19:42:11 -0600 Subject: [PATCH 422/660] Cleanup to Softmax layer to let GPU build work Returning dead objects + removing unused arguments in Softmax layer files to allow GPU build to compile --- libgnn/include/galois/layers/SoftmaxLayer.h | 3 +-- libgnn/src/layers/SoftmaxLayer.cpp | 17 ++++++++++------- 2 files changed, 11 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index b9f821787b..62a4d9ff75 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -31,8 +31,7 @@ class SoftmaxLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhaseCPU(const PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index c3bfdb00e7..b0446fd3ab 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -43,18 +43,19 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { +galois::SoftmaxLayer::ForwardPhase([ + [maybe_unused]] const galois::PointerWithSize + input_embeddings) { #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else // XXX + return PointerWithSize(); #endif } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, - PointerWithSize*) { +galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -97,12 +98,14 @@ galois::SoftmaxLayer::BackwardPhaseCPU(const PointerWithSize, } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize a, - PointerWithSize* b) { +galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { #ifndef GALOIS_ENABLE_GPU - return BackwardPhaseCPU(a, b); + return BackwardPhaseCPU(); #else + // XXX // gpu_object_.BackwardPhaseGPU( + return PointerWithSize(); #endif } From 6139288827858dceef5ef40761eeb77a594cdecf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Nov 2020 20:00:42 -0600 Subject: [PATCH 423/660] Copy over GNN node masks to GPU Adds code to copy over the masks for the train, val, test sets to the GPU. Removes norm factor variable + adds the free calls for masks to the destructor as well. --- libgnn/include/galois/graphs/GNNGraph.cuh | 13 +++++++------ libgnn/src/graphs/GNNGraph.cpp | 4 ++-- libgnn/src/graphs/GNNGraph.cu | 23 ++++++++++++++++++++++- 3 files changed, 31 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 33093d2ebc..3470056663 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -20,6 +20,9 @@ public: unsigned num_features); //! Copy over ground truth for the graph to GPU void SetLabels(const std::vector& ground_truth); + //! Copy over masks for the 3 sets to GPU + void SetMasks(const std::vector& train, const std::vector& val, + const std::vector& test); GNNFeature* feature_vector() { return feature_vector_; }; const GNNFeature* feature_vector() const { return feature_vector_; }; @@ -48,12 +51,10 @@ private: GNNFeature* feature_vector_{nullptr}; //! (Local) ground truth vector GNNFloat* ground_truth_{nullptr}; - // TODO need this? - //! (Local) norm factors - GNNFloat* norm_factors_{nullptr}; - - // TODO masks? other things I haven't considered yet? 
will determine if they - // are needed + // masks for phases + char* local_training_mask_{nullptr}; + char* local_validation_mask_{nullptr}; + char* local_testing_mask_{nullptr}; }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 6e616e851b..cbdf5e13db 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -374,8 +374,6 @@ void galois::graphs::GNNGraph::InitNormFactor() { #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { - // XXX finish up GPU memory allocation; currently just testing the build - // create int casted CSR uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); @@ -419,5 +417,7 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); gpu_memory_.SetLabels(local_ground_truth_labels_); + gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, + local_testing_mask_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index f13bbf4089..b0d5c1eb43 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -10,7 +10,9 @@ galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { CUDA_FREE(edge_destinations_); CUDA_FREE(feature_vector_); CUDA_FREE(ground_truth_); - CUDA_FREE(norm_factors_); + CUDA_FREE(local_training_mask_); + CUDA_FREE(local_validation_mask_); + CUDA_FREE(local_testing_mask_); } void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( @@ -61,3 +63,22 @@ void galois::graphs::GNNGraphGPUAllocations::SetLabels( ground_truth.size() * sizeof(GNNLabel), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetMasks( + const std::vector& train, const std::vector& val, + const std::vector& test) { + CUDA_CHECK( + cudaMalloc((void**)(&local_training_mask_), train.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_training_mask_, train.data(), + train.size() * sizeof(char), cudaMemcpyHostToDevice)); + + CUDA_CHECK( + cudaMalloc((void**)(&local_validation_mask_), val.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_validation_mask_, val.data(), + val.size() * sizeof(char), cudaMemcpyHostToDevice)); + + CUDA_CHECK( + cudaMalloc((void**)(&local_testing_mask_), test.size() * sizeof(char))); + CUDA_CHECK(cudaMemcpy(local_testing_mask_, test.data(), + test.size() * sizeof(char), cudaMemcpyHostToDevice)); +} From cba333bf432efc5afd49126b4f4182dda12bf91d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 17:28:31 -0600 Subject: [PATCH 424/660] Softmax GPU object + hook to its forward phase Adds the file for the GPU object for the Softmax layer and adds the call to the Forward phase of the GPU code. The call itself is not yet defined. 
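The intended semantics of the forward call being hooked up here, written out as a host-side sketch (a hypothetical helper for checking GPU output on small graphs, not the actual API): only nodes whose mask bit for the current phase is set get a probability distribution, and every other row of the output stays zero, which is what the GPU unit test added later in this series asserts.

// Hypothetical reference implementation of the masked softmax forward pass.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void MaskedSoftmax(size_t num_nodes, size_t num_classes,
                   const std::vector<char>& mask,
                   const std::vector<float>& input,
                   std::vector<float>* output) {
  output->assign(num_nodes * num_classes, 0.0f);
  for (size_t n = 0; n < num_nodes; n++) {
    if (mask[n] != 1) {
      continue; // nodes outside the current phase keep an all-zero row
    }
    const float* row = &input[n * num_classes];
    float* out_row   = &(*output)[n * num_classes];
    float max_val    = *std::max_element(row, row + num_classes);
    float denom      = 0.0f;
    for (size_t c = 0; c < num_classes; c++) {
      out_row[c] = std::exp(row[c] - max_val); // shift by max for stability
      denom += out_row[c];
    }
    for (size_t c = 0; c < num_classes; c++) {
      out_row[c] /= denom;
    }
  }
}

int main() {
  // 2 nodes, 3 classes; only node 0 is in the current phase
  std::vector<char> mask   = {1, 0};
  std::vector<float> input = {1.0f, 2.0f, 3.0f, 5.0f, 5.0f, 5.0f};
  std::vector<float> output;
  MaskedSoftmax(2, 3, mask, input, &output);
  for (float v : output) {
    std::printf("%f ", v); // node 0's row sums to 1, node 1's row is all zeros
  }
  std::printf("\n");
  return 0;
}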
--- libgnn/include/galois/graphs/GNNGraph.cuh | 15 ++++++---- libgnn/include/galois/layers/SoftmaxLayer.cuh | 28 +++++++++++++++++++ libgnn/include/galois/layers/SoftmaxLayer.h | 17 +++++++++-- libgnn/src/layers/SoftmaxLayer.cpp | 11 ++++---- 4 files changed, 57 insertions(+), 14 deletions(-) create mode 100644 libgnn/include/galois/layers/SoftmaxLayer.cuh diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 3470056663..528fe4ceb2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -24,12 +24,15 @@ public: void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); - GNNFeature* feature_vector() { return feature_vector_; }; - const GNNFeature* feature_vector() const { return feature_vector_; }; - int* edge_index() { return edge_index_; } - const int* edge_index() const { return edge_index_; } - int* edge_destinations() { return edge_destinations_; } - const int* edge_destinations() const { return edge_destinations_; } + GNNFeature* feature_vector() const { return feature_vector_; }; + int* edge_index() const { return edge_index_; } + int* edge_destinations() const { return edge_destinations_; } + + GNNFloat* ground_truth() const { return ground_truth_; } + + char* local_training_mask() const { return local_training_mask_; } + char* local_validation_mask() const { return local_validation_mask_; } + char* local_testing_mask() const { return local_testing_mask_; } private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh new file mode 100644 index 0000000000..440bb1f488 --- /dev/null +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -0,0 +1,28 @@ +#ifndef GALOIS_SOFTMAX_GPU +#define GALOIS_SOFTMAX_GPU +#include "galois/graphs/GNNGraph.cuh" +namespace galois { + +//! Contains implementation for the forward/backward pass of the softmax layer +//! on GPUs. +class SoftmaxLayerGPU { +public: + //! 
Initialize by saving pointers to already initialized GPU memory + SoftmaxLayerGPU(const galois::graphs::GNNGraphGPUAllocations& gpu_graph) + : train_mask_(gpu_graph.local_training_mask()), + val_mask_(gpu_graph.local_validation_mask()), + test_mask_(gpu_graph.local_testing_mask()), + local_labels_(gpu_graph.ground_truth()) {} + void ForwardPhaseGPU(size_t num_nodes, size_t feature_length, + const GNNFloat* input_embeddings, GNNFloat* output); + void BackwardPhaseGPU(GNNFloat* output); + +private: + char* train_mask_; + char* val_mask_; + char* test_mask_; + GNNFloat* local_labels_; +}; + +} // namespace galois +#endif diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 62a4d9ff75..76a7ec654f 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -1,5 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/layers/SoftmaxLayer.cuh" +#endif namespace galois { @@ -12,10 +15,15 @@ class SoftmaxLayer : public GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig{.allocate_weights = false}), +#ifdef GALOIS_ENABLE_GPU + gpu_object_(graph.GetGPUGraph()), +#endif input_loss_(dimensions.input_rows), ground_truth_vectors_(dimensions.input_columns), norm_gradient_vectors_(dimensions.input_columns), - softmax_temp_vectors_(dimensions.input_columns) { + softmax_temp_vectors_(dimensions.input_columns) + + { output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); @@ -30,8 +38,7 @@ class SoftmaxLayer : public GNNLayer { const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; - PointerWithSize - BackwardPhaseCPU(); + PointerWithSize BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. PointerWithSize @@ -39,6 +46,10 @@ class SoftmaxLayer : public GNNLayer { PointerWithSize* input_gradient) final; private: +#ifdef GALOIS_ENABLE_GPU + SoftmaxLayerGPU gpu_object_; +#endif + //! Loss for each row of the input std::vector input_loss_; //! Each thread gets storage to allocate the ground truth vector in during diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index b0446fd3ab..00c0c05edd 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -43,14 +43,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase([ - [maybe_unused]] const galois::PointerWithSize - input_embeddings) { +galois::SoftmaxLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - // XXX - return PointerWithSize(); + gpu_object_.ForwardPhaseGPU(graph_.size(), graph_.node_feature_length(), + input_embeddings.data(), + p_forward_output_matrix_.data()); + return p_forward_output_matrix_; #endif } From ed546d4d5334cdd52235ace01f39b4d4058dec29 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 18:58:04 -0600 Subject: [PATCH 425/660] Softmax function for GPUs Adds a softmax function on GPUs that can be called from GPU kernels. 
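The device function uses the standard max-subtraction trick: softmax(x)_i = expf(x_i - max(x)) / sum_j expf(x_j - max(x)), which gives the same result as the unshifted form but keeps expf from overflowing on large logits. A quick self-contained check of that identity (illustrative only, not part of the patch):

// Shifting by the max changes nothing mathematically (the common factor
// cancels in the normalization) but keeps expf in range for large inputs.
#include <cmath>
#include <cstdio>

int main() {
  const float x[3] = {1000.0f, 1001.0f, 1002.0f}; // naive expf(x[i]) overflows
  float max_val    = x[2];
  float shifted[3];
  float denom = 0.0f;
  for (int i = 0; i < 3; i++) {
    shifted[i] = std::exp(x[i] - max_val);
    denom += shifted[i];
  }
  for (int i = 0; i < 3; i++) {
    // prints the same distribution as a softmax of {0, 1, 2}: ~0.09 0.24 0.67
    std::printf("%f ", shifted[i] / denom);
  }
  std::printf("\n");
  return 0;
}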
--- libgnn/include/galois/GNNMath.cuh | 6 ++++++ libgnn/src/GNNMath.cu | 23 +++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 763799f838..212226d00b 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,5 +18,11 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Given a vector, apply a softmax on some specified # of elements and save +//! the result to the specified output. Since this is a device function, +//! all pointers should be to GPU memory. +__device__ void DoSoftmax(size_t vector_length, const GNNFloat* input, + GNNFloat* output); + } // namespace galois #endif diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 06a3dc5983..a04fac6962 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -27,3 +27,26 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, b, lead_dim_b, a, lead_dim_a, &dummy0, output, output_columns)); } + +__device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, + GNNFloat* output) { + // find max value + GNNFloat current_max = input[0]; + for (size_t i = 1; i < vector_length; i++) { + if (input[i] > current_max) { + current_max = input[i]; + } + } + // set output by scaling with the max + GNNFloat denominator = 0.0; + for (size_t i = 0; i < vector_length; i++) { + // NOTE: expf only works for single precision float; may need to change if + // we ever switch to double + output[i] = expf(input[i] - current_max); + denominator += output[i]; + } + // denominator scale + for (size_t i = 0; i < vector_length; i++) { + output[i] /= denominator; + } +} From 8017d6a473efe0ce5fad02183656406569b016df Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 19:38:52 -0600 Subject: [PATCH 426/660] CUDA_KERNEL_LOOP and some helper calcs Added a few things from old codebase's CUDA utils to new one in preparation for using the newly added things to compute the softmax layer. Also added the original source of the old code: Caffe. --- libgnn/include/galois/CUDAUtil.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 6a7e7e9915..51be6cd102 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -2,10 +2,21 @@ #define GALOIS_CUDA_UTIL //! @file CUDAUtil.h //! Contains various utility functions for CUDA. +//! Taken and revised+added to from here +//! https://github.com/BVLC/caffe/blob/master/include/caffe/util/device_alternate.hpp #include #include #include "galois/Logging.h" +// TODO check these too and make sure they make sense +// CUDA: use 256 threads per block +const int CUDA_NUM_THREADS = 256; + +// CUDA: number of blocks for threads. +inline int CUDA_GET_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + // TODO check these #define CHUNK_SIZE 256 #define TB_SIZE 256 @@ -14,6 +25,7 @@ #define MAX_NUM_CLASSES 128 #define WARPS_PER_BLOCK (BLOCK_SIZE / WARP_SIZE) +//! Wrap a CUDA call with this to auto-check if it returns any error #define CUDA_CHECK(condition) \ do { \ cudaError_t error = condition; \ @@ -22,6 +34,7 @@ } \ } while (0) +//! Frees a pointer allocated by cuda malloc #define CUDA_FREE(ptr) \ do { \ if (ptr) { \ @@ -30,6 +43,7 @@ } \ } while (0) +//! 
Call this after a cuda call to make sure it set any error flags #define CUDA_TEST(msg) \ do { \ cudaError_t e; \ @@ -41,6 +55,13 @@ } \ } while (0) +//! Basic kernel loop for CUDA threads +//! Caffe describes it as "grid stride" +#define CUDA_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +//! Wrap a CuBLAS call with this to check if it threw any errors #define CUBLAS_CHECK(condition) \ do { \ cublasStatus_t status = condition; \ From 5eb6b4d085f90bba8adeb8e04f4b0103a7c6388a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Nov 2020 19:41:41 -0600 Subject: [PATCH 427/660] Definition of forward phase gpu softmax This commit adds the softmax/cross entropy function to GNNMath.cu and uses it to define the GPU Softmax forward phase function. An additional argument was added to the forward phase gpu call to deal with the different phases: the phase argument details which mask to use in the softmax. There are a few things left to do that will be done later, namely zero'ing out the output matrix. Note that I have NOT defined cross entropy for the forward phase: it is only used to calculate loss, and I'm not using loss nor referring to it anywhere in my code or analysis at the moment.. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNMath.cuh | 6 +++ libgnn/include/galois/layers/SoftmaxLayer.cuh | 5 ++- libgnn/src/GNNMath.cpp | 4 ++ libgnn/src/GNNMath.cu | 14 +++++++ libgnn/src/layers/SoftmaxLayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cu | 37 +++++++++++++------ 7 files changed, 54 insertions(+), 15 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ff7d47a07d..ca50e171ee 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -34,6 +34,7 @@ if (GALOIS_ENABLE_GPU) src/graphs/GNNGraph.cu src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu + src/layers/SoftmaxLayer.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 212226d00b..01afe64f03 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,6 +18,12 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Runs softmax + cross entropy on masked nodes +__global__ void +SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, + galois::GNNFloat* output); + //! Given a vector, apply a softmax on some specified # of elements and save //! the result to the specified output. Since this is a device function, //! all pointers should be to GPU memory. 
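For completeness, the loss the skipped cross entropy step would produce for a single-class (one-hot) ground truth is just the negative log of the probability assigned to the true class. A hypothetical sketch of that calculation (the function name is made up; the epsilon mirrors the 1e-10 guard the backward kernel uses later in this series):

// Per-node loss for a one-hot ground truth: -log(p[true class]).
// The epsilon guards against log(0) when the predicted probability is zero.
#include <cmath>
#include <cstdio>

float CrossEntropyLoss(const float* prediction, int ground_truth_class) {
  return -std::log(prediction[ground_truth_class] + 1e-10f);
}

int main() {
  const float prediction[3] = {0.1f, 0.7f, 0.2f}; // a node's softmax output
  std::printf("%f\n", CrossEntropyLoss(prediction, 1)); // ~0.3567
  return 0;
}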
diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 440bb1f488..40e9681bb1 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -13,8 +13,9 @@ public: val_mask_(gpu_graph.local_validation_mask()), test_mask_(gpu_graph.local_testing_mask()), local_labels_(gpu_graph.ground_truth()) {} - void ForwardPhaseGPU(size_t num_nodes, size_t feature_length, - const GNNFloat* input_embeddings, GNNFloat* output); + void ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, + size_t feature_length, const GNNFloat* input_embeddings, + GNNFloat* output); void BackwardPhaseGPU(GNNFloat* output); private: diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 5e9fb8d050..0d065d6bcc 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -88,6 +88,10 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, const GNNFloat* input) { GNNFloat loss = 0.0; + // Note that this function works if there are multiple non-zeros in the + // ground truth vector + // If there is only 1 then this function is overkill and it should break + // early for (size_t i = 0; i < vector_length; i++) { if (ground_truth[i] == 0.0) { continue; diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index a04fac6962..d33ea88cc9 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -28,6 +28,20 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns)); } + +__global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, + galois::GNNFloat* output) { + // XXX zero out output + CUDA_KERNEL_LOOP(i, num_nodes) { + if (mask[i] == 1) { + galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); + // ignoring crossentropy loss calculation for now because I'm not using + // loss for anything + didn't bother allocating an array to store loss anyways + } + } +} + __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output) { // find max value diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 00c0c05edd..57a10af41c 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -48,7 +48,7 @@ galois::SoftmaxLayer::ForwardPhase( #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - gpu_object_.ForwardPhaseGPU(graph_.size(), graph_.node_feature_length(), + gpu_object_.ForwardPhaseGPU(layer_phase_, graph_.size(), graph_.node_feature_length(), input_embeddings.data(), p_forward_output_matrix_.data()); return p_forward_output_matrix_; diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index c3f61dcf6f..a562923a98 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -1,14 +1,27 @@ +#include "galois/GNNMath.cuh" #include "galois/Logging.h" -#include "galois/GNNMath.h" // Please add GPU functions -#include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SoftmaxLayer.cuh" -// Allocate memory and initialize -void galois::SoftmaxLayer::Init() {} +void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, + const GNNFloat* input_embeddings, GNNFloat* output) { + char* mask_to_use = nullptr; + switch (phase) { + case GNNPhase::kTrain: + mask_to_use = train_mask_; + break; + case 
GNNPhase::kValidate: + mask_to_use = val_mask_; + break; + case GNNPhase::kTest: + mask_to_use = test_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + } -// Input: in_tensor -// Output: out_tensor -void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, - galois::GNNFloat* out_tensor) {} + SoftmaxCrossEntropyForward<<>>(mask_to_use, num_nodes, + feature_length, input_embeddings, output); +} // Input: in_tensor // Input: out_tensor @@ -18,7 +31,7 @@ void galois::SoftmaxLayer::Forward(const galois::GNNFloat* in_tensor, // it is not const because it can be reused // to hold intermediate data inside this function, // to avoid allocating more memory -void galois::SoftmaxLayer::Backward(const galois::GNNFloat* in_tensor, - const galois::GNNFloat* out_tensor, - galois::GNNFloat* in_gradients, - galois::GNNFloat* out_gradients) {} +//void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, +// const galois::GNNFloat* out_tensor, +// galois::GNNFloat* in_gradients, +// galois::GNNFloat* out_gradients) {} From 77ec2f36fea802b1e5842ccdde71bc205fbd3ff5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 17 Nov 2020 14:07:35 -0600 Subject: [PATCH 428/660] Softmax forward phase fixes; unit test for it too Fixed some bugs exposed by the unit test for softmax forward, namely that the feature length size was incorrect and that the vector was not being 0'd out before softmax occured. The unit test in question has been ported over from the cpu softmax unit test as well. The next step is to finish up the backward pass for the softmax layer and reactivate the unit test calls to the backward phase. I also need to consider actually checking backward phase output to make sure it is sane. --- libgnn/include/galois/GNNMath.cuh | 3 +- libgnn/include/galois/layers/SoftmaxLayer.h | 3 - libgnn/src/GNNMath.cu | 15 ++- libgnn/src/layers/GNNLayer.cpp | 1 - libgnn/src/layers/SoftmaxLayer.cpp | 9 +- libgnn/src/layers/SoftmaxLayer.cu | 15 ++- libgnn/test/CMakeLists.txt | 5 + libgnn/test/gpu-softmaxlayer-test.cpp | 118 ++++++++++++++++++++ 8 files changed, 147 insertions(+), 22 deletions(-) create mode 100644 libgnn/test/gpu-softmaxlayer-test.cpp diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 01afe64f03..e63221f87f 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -18,7 +18,8 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); -//! Runs softmax + cross entropy on masked nodes +//! Runs softmax + cross entropy on masked nodes. Will not overwrite all of +//! the output, so make sure it's been zero'd out beforehand. __global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, const galois::GNNFloat* input_embeddings, diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 76a7ec654f..7bf29272cd 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -29,7 +29,6 @@ class SoftmaxLayer : public GNNLayer { GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); - Init(); } const PointerWithSize @@ -64,8 +63,6 @@ class SoftmaxLayer : public GNNLayer { //! 
derivative calculation; each is the size of a feature vector galois::substrate::PerThreadStorage> softmax_temp_vectors_; - - void Init(); }; } // namespace galois diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index d33ea88cc9..0066e85939 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -28,16 +28,19 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns)); } +__global__ void galois::SoftmaxCrossEntropyForward( + char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* input_embeddings, galois::GNNFloat* output) { -__global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, - const galois::GNNFloat* input_embeddings, - galois::GNNFloat* output) { - // XXX zero out output + // NOTE: assumes that output is already 0'd out as it will not overwrite the + // entire thing CUDA_KERNEL_LOOP(i, num_nodes) { if (mask[i] == 1) { - galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); + galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, + output + feature_length * i); // ignoring crossentropy loss calculation for now because I'm not using - // loss for anything + didn't bother allocating an array to store loss anyways + // loss for anything + didn't bother allocating an array to store loss + // anyways } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 3cfbd990a0..31cf58c6c7 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,7 +9,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { - // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input dropout_mask_.resize( diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 57a10af41c..3a65ba55bc 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -2,9 +2,6 @@ #include "galois/GNNMath.h" #include "galois/layers/SoftmaxLayer.h" -// Allocate memory and initialize -void galois::SoftmaxLayer::Init() {} - const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { @@ -48,9 +45,9 @@ galois::SoftmaxLayer::ForwardPhase( #ifndef GALOIS_ENABLE_GPU return ForwardPhaseCPU(input_embeddings); #else - gpu_object_.ForwardPhaseGPU(layer_phase_, graph_.size(), graph_.node_feature_length(), - input_embeddings.data(), - p_forward_output_matrix_.data()); + gpu_object_.ForwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + input_embeddings.data(), p_forward_output_matrix_.data()); return p_forward_output_matrix_; #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index a562923a98..e385214981 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -2,8 +2,11 @@ #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" -void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, - const GNNFloat* input_embeddings, GNNFloat* output) { +void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, + size_t num_nodes, + size_t feature_length, + const GNNFloat* input_embeddings, + GNNFloat* output) { char* mask_to_use = nullptr; switch (phase) { case GNNPhase::kTrain: @@ 
-19,8 +22,10 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num GALOIS_LOG_FATAL("Invalid phase specified"); } - SoftmaxCrossEntropyForward<<>>(mask_to_use, num_nodes, - feature_length, input_embeddings, output); + CUDA_CHECK( + cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); + SoftmaxCrossEntropyForward<<>>( + mask_to_use, num_nodes, feature_length, input_embeddings, output); } // Input: in_tensor @@ -31,7 +36,7 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num // it is not const because it can be reused // to hold intermediate data inside this function, // to avoid allocating more memory -//void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, +// void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, // const galois::GNNFloat* out_tensor, // galois::GNNFloat* in_gradients, // galois::GNNFloat* out_gradients) {} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 8bec96c4d6..01199c1247 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -42,6 +42,11 @@ else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) + + add_executable(gpu-softmaxlayer-test gpu-softmaxlayer-test.cpp) + target_link_libraries(gpu-softmaxlayer-test galois_gnn) + add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) + endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp new file mode 100644 index 0000000000..2bceb7a6b4 --- /dev/null +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -0,0 +1,118 @@ +//! @file convlayer-test.cpp +//! 
Softmax layer test with a test graph + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SoftmaxLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + galois::PointerWithSize p_softmax_input = + output_layer->AllocateGPU(softmax_input); + + output_layer->ForwardPhase(p_softmax_input); + + const std::vector& prediction_distribution = + output_layer->CopyForwardOutputFromGPU(); + + // assert that predictions are as expected + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == + i); + } + // train mode means last 2 vertices should be empty + for (size_t i = 5; i < 7; i++) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(p_softmax_input, nullptr); + + // validation mode + output_layer->SetLayerPhase(galois::GNNPhase::kValidate); + output_layer->ForwardPhase(p_softmax_input); + std::vector pd2 = output_layer->CopyForwardOutputFromGPU(); + + // validate vertex is index 5 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); + for (size_t i = 0; i < 5; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + for (size_t i = 6; i < 7; i++) { + GALOIS_LOG_ASSERT(pd2[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(p_softmax_input, nullptr); + + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + output_layer->ForwardPhase(p_softmax_input); + std::vector pd3 = output_layer->CopyForwardOutputFromGPU(); + // validate vertex is index 6 + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); + // all but last are empty distributions + for (size_t 
i = 0; i < 6; i++) { + GALOIS_LOG_ASSERT(pd3[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); + } + + // XXX + // output_layer->BackwardPhase(softmax_input, nullptr); + + // TODO in future maybe: add better test for backward phase besides just + // running it +} From bf1b3551672885b165a6500bddb79b0178e7cdad Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 17 Nov 2020 14:25:36 -0600 Subject: [PATCH 429/660] Softmax: mask selection function refactoring Moved code to select the right mask pointer passed on the current layer phase to a function as it will be used in backward phase as well. --- libgnn/include/galois/layers/SoftmaxLayer.cuh | 15 ++++++++++++ libgnn/src/layers/SoftmaxLayer.cu | 23 +++++++------------ 2 files changed, 23 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 40e9681bb1..ee1350f2bd 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -23,6 +23,21 @@ private: char* val_mask_; char* test_mask_; GNNFloat* local_labels_; + + //! Helper function that returns the correct mask based on phase it is passed + char* ChooseMask(galois::GNNPhase phase) { + switch (phase) { + case GNNPhase::kTrain: + return train_mask_; + case GNNPhase::kValidate: + return val_mask_; + case GNNPhase::kTest: + return test_mask_; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + return nullptr; + } + } }; } // namespace galois diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index e385214981..aecdd93c52 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -7,27 +7,20 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output) { - char* mask_to_use = nullptr; - switch (phase) { - case GNNPhase::kTrain: - mask_to_use = train_mask_; - break; - case GNNPhase::kValidate: - mask_to_use = val_mask_; - break; - case GNNPhase::kTest: - mask_to_use = test_mask_; - break; - default: - GALOIS_LOG_FATAL("Invalid phase specified"); - } - + char* mask_to_use = ChooseMask(phase); CUDA_CHECK( cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); SoftmaxCrossEntropyForward<<>>( mask_to_use, num_nodes, feature_length, input_embeddings, output); + CUDA_TEST("Softmax cross entropy forward failed"); } +// void galois::SoftmaxLayerGPU::BackwardPhaseGPU() { +// +// +// +//} + // Input: in_tensor // Input: out_tensor // Input: out_gradients From f71b7c6d2c04e09ec7d92e3647643be2add94612 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 11:57:06 -0600 Subject: [PATCH 430/660] Fixed ground truth type on GPUs Ground truth is represented with GNNLabel, but I was using a GNNFloat. This caused the labels being read to be garbaged when used on the GPU. This commit changes it them to the correct type. It also includes the signature definition of the backward phase: the implementation will be included in the next commit. 
(Split the commits up for modularity's sake) --- libgnn/include/galois/graphs/GNNGraph.cuh | 4 ++-- libgnn/include/galois/layers/SoftmaxLayer.cuh | 6 ++++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 528fe4ceb2..d485808972 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -28,7 +28,7 @@ public: int* edge_index() const { return edge_index_; } int* edge_destinations() const { return edge_destinations_; } - GNNFloat* ground_truth() const { return ground_truth_; } + GNNLabel* ground_truth() const { return ground_truth_; } char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } @@ -53,7 +53,7 @@ private: //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; //! (Local) ground truth vector - GNNFloat* ground_truth_{nullptr}; + GNNLabel* ground_truth_{nullptr}; // masks for phases char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index ee1350f2bd..8e1e5d21d7 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -16,13 +16,15 @@ public: void ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output); - void BackwardPhaseGPU(GNNFloat* output); + void BackwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, + size_t feature_length, const GNNFloat* predictions, + GNNFloat* output_gradient); private: char* train_mask_; char* val_mask_; char* test_mask_; - GNNFloat* local_labels_; + GNNLabel* local_labels_; //! Helper function that returns the correct mask based on phase it is passed char* ChooseMask(galois::GNNPhase phase) { From 4562c8b551da74996055523218a5765e5a7bc05a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 12:27:47 -0600 Subject: [PATCH 431/660] Softmax backward phase on GPU Adds the backward phase for the softmax layer for the GPU. The implementation is taken from the non-refactored old code: it copies a prediction to shared memory (presumably to improve locality) then does cross entropy to softmax derivatives. It remains to be seen if the shared memory copy is actually more efficient; some testing will be done down the line. Also adds print to both cpu and gpu softmax tests in order to verify that both are doing the same compute (which they are in this commit). --- libgnn/include/galois/GNNMath.cuh | 10 ++++ libgnn/src/GNNMath.cu | 83 +++++++++++++++++++++++++++ libgnn/src/layers/SoftmaxLayer.cpp | 10 +++- libgnn/src/layers/SoftmaxLayer.cu | 36 ++++++------ libgnn/test/gpu-softmaxlayer-test.cpp | 27 +++++++-- libgnn/test/softmaxlayer-test.cpp | 22 ++++++- 6 files changed, 158 insertions(+), 30 deletions(-) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index e63221f87f..aca14a573f 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -20,11 +20,21 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, //! Runs softmax + cross entropy on masked nodes. Will not overwrite all of //! the output, so make sure it's been zero'd out beforehand. +//! At this point in time cross entropy is ignored because it only calculates a +//! 
loss value which doesn't really do anything for us at the moment. __global__ void SoftmaxCrossEntropyForward(char* mask, size_t num_nodes, size_t feature_length, const galois::GNNFloat* input_embeddings, galois::GNNFloat* output); +//! Derivative of cross entropy (to get error of prediction) then derivavtive +//! of the softmax. +__global__ void +SoftmaxCrossEntropyBackward(char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* predictions, + const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient); + //! Given a vector, apply a softmax on some specified # of elements and save //! the result to the specified output. Since this is a device function, //! all pointers should be to GPU memory. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 0066e85939..5b429dafb2 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -45,6 +45,89 @@ __global__ void galois::SoftmaxCrossEntropyForward( } } +__global__ void galois::SoftmaxCrossEntropyBackward( + char* mask, size_t num_nodes, size_t feature_length, + const galois::GNNFloat* predictions, const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient) { + const unsigned global_thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned warp_thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = global_thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // TODO: how many classes can there be? it's a set quantity at the moment + // copy of a particular node's prediction; put into shared memory to avoid + // overheads of accessing it otherwise + // TODO benchmark + __shared__ GNNFloat + local_node_prediction[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + __shared__ GNNFloat + intermediate_gradient[BLOCK_SIZE / WARP_SIZE][MAX_NUM_CLASSES]; + + // a warp works on a single node at once + for (unsigned wid = warp_id; wid < num_nodes; wid += num_warps) { + // operate only if masked + if (mask[wid] == 1) { + unsigned base_index = wid * feature_length; + + // copy over a prediction to shared memory (faster access time) + // TODO benchmark this to see if worth + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + local_node_prediction[warp_lane][feat_index] = + predictions[base_index + feat_index]; + } + } + // do not proceed until entire prediction is copied to shared memory + __syncthreads(); + + // TODO can refactor below to device functions + // cross entropy derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + if (feat_index == (unsigned)ground_truth[wid]) { + // this thread is responsible for the truth + intermediate_gradient[warp_lane][feat_index] = + -1.0 / (local_node_prediction[warp_lane][feat_index] + 1e-10); + } else { + // all others are 0 (ground truth label = 0) + intermediate_gradient[warp_lane][feat_index] = 0.0; + } + } + } + __syncthreads(); + + // softmax derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + GNNFloat 
sum = 0.0; + GNNFloat self = local_node_prediction[warp_lane][feat_index]; + + for (unsigned j = 0; j < feature_length; j++) { + GNNFloat df = (j == feat_index) + ? (self * (1.0 - self)) + : -local_node_prediction[warp_lane][j] * self; + sum += df * intermediate_gradient[warp_lane][j]; + } + + // each thread saves final output for the feature + output_gradient[base_index + feat_index] = sum; + } + } + __syncthreads(); + } + } +} + __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output) { // find max value diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 3a65ba55bc..a4d5133caa 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -56,6 +56,9 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; + // zero out output + backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { @@ -101,9 +104,10 @@ galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, #ifndef GALOIS_ENABLE_GPU return BackwardPhaseCPU(); #else - // XXX - // gpu_object_.BackwardPhaseGPU( - return PointerWithSize(); + gpu_object_.BackwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index aecdd93c52..f24a6f1e77 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -1,3 +1,4 @@ +#include #include "galois/GNNMath.cuh" #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" @@ -15,21 +16,20 @@ void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, CUDA_TEST("Softmax cross entropy forward failed"); } -// void galois::SoftmaxLayerGPU::BackwardPhaseGPU() { -// -// -// -//} - -// Input: in_tensor -// Input: out_tensor -// Input: out_gradients -// Output: in_gradients -// Note: although out_gradients is an input data, -// it is not const because it can be reused -// to hold intermediate data inside this function, -// to avoid allocating more memory -// void galois::SoftmaxLayerGPU::Backward(const galois::GNNFloat* in_tensor, -// const galois::GNNFloat* out_tensor, -// galois::GNNFloat* in_gradients, -// galois::GNNFloat* out_gradients) {} +void galois::SoftmaxLayerGPU::BackwardPhaseGPU(galois::GNNPhase phase, + size_t num_nodes, + size_t feature_length, + const GNNFloat* predictions, + GNNFloat* output_gradient) { + assert(feature_length <= MAX_NUM_CLASSES); + char* mask_to_use = ChooseMask(phase); + CUDA_CHECK(cudaMemset(output_gradient, 0, + num_nodes * feature_length * sizeof(GNNFloat))); + // TODO check the launch parameters; this is taken directly from the original + // code + SoftmaxCrossEntropyBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, + BLOCK_SIZE>>>(mask_to_use, num_nodes, + feature_length, predictions, + local_labels_, output_gradient); + CUDA_TEST("Softmax cross entropy backward failed"); +} diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 2bceb7a6b4..453606e311 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -61,8 +61,13 @@ int main() { GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); } - // XXX - // 
output_layer->BackwardPhase(p_softmax_input, nullptr); + output_layer->BackwardPhase(p_softmax_input, nullptr); + const std::vector& backward_output = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 1\n========\n"); + for (galois::GNNFloat a : backward_output) { + printf("%f\n", a); + } // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); @@ -90,8 +95,13 @@ int main() { GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); } - // XXX - // output_layer->BackwardPhase(p_softmax_input, nullptr); + output_layer->BackwardPhase(p_softmax_input, nullptr); + const std::vector& backward_output2 = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 2\n========\n"); + for (galois::GNNFloat a : backward_output2) { + printf("%f\n", a); + } // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); @@ -110,8 +120,13 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } - // XXX - // output_layer->BackwardPhase(softmax_input, nullptr); + output_layer->BackwardPhase(softmax_input, nullptr); + const std::vector& backward_output3 = + output_layer->CopyBackwardOutputFromGPU(); + printf("Output 3\n========\n"); + for (galois::GNNFloat a : backward_output3) { + printf("%f\n", a); + } // TODO in future maybe: add better test for backward phase besides just // running it diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index f7baab24fd..9f15bedfa3 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -40,7 +40,13 @@ int main() { std::make_unique(3, test_graph, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { @@ -62,7 +68,12 @@ int main() { output_layer->SetLayerPhase(galois::GNNPhase::kValidate); galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 2\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); for (size_t i = 0; i < 5; i++) { @@ -88,7 +99,12 @@ int main() { output_layer->SetLayerPhase(galois::GNNPhase::kTest); galois::PointerWithSize pd3 = output_layer->ForwardPhase(softmax_input); - output_layer->BackwardPhase(softmax_input, nullptr); + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 3\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validate vertex is index 6 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions From 4e5266caf94184b4f827b55a1567a8372f83e886 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 14:56:47 -0600 Subject: [PATCH 432/660] Structure added for GPU global accuracy function This commit adds the declarations for the global accuracy getter for GPU GNNs as well as the orchestration of the call to the GPU version. The rest of the implementation will come in a later commit: for now this isn't priority as I can still compute accuracy on the CPU. 
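For reference, the check itself is simple; a minimal single-host sketch of the
logic the GPU kernel will mirror (illustrative only, with a made-up helper
name, not the code in this patch) looks like this:

  // Sketch: fraction of masked nodes whose argmax prediction matches the label.
  float MaskedAccuracy(const char* mask, const galois::GNNLabel* ground_truth,
                       const galois::GNNFloat* predictions, size_t num_nodes,
                       size_t num_classes) {
    size_t correct = 0;
    size_t checked = 0;
    for (size_t node = 0; node < num_nodes; node++) {
      if (!mask[node]) {
        continue;
      }
      checked++;
      size_t predicted_class =
          galois::MaxIndex(num_classes, &predictions[node * num_classes]);
      if (predicted_class == static_cast<size_t>(ground_truth[node])) {
        correct++;
      }
    }
    return static_cast<float>(correct) / static_cast<float>(checked);
  }

The distributed path additionally reduces the correct/checked counts across
hosts (the DGAccumulator members) before taking the ratio; the GPU version
will do the same counting directly on device memory.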
Adds a new GNNGPU object to hold all GPU related things for the GNN class. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GraphNeuralNetwork.cuh | 22 +++++++++++++++++ libgnn/include/galois/GraphNeuralNetwork.h | 10 ++++++++ libgnn/src/GraphNeuralNetwork.cpp | 11 +++++++++ libgnn/src/GraphNeuralNetwork.cu | 26 ++++++++++++++++++++ 5 files changed, 70 insertions(+) create mode 100644 libgnn/include/galois/GraphNeuralNetwork.cuh create mode 100644 libgnn/src/GraphNeuralNetwork.cu diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ca50e171ee..61867f21c8 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -35,6 +35,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu + src/GraphNeuralNetwork.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GraphNeuralNetwork.cuh b/libgnn/include/galois/GraphNeuralNetwork.cuh new file mode 100644 index 0000000000..dd2eeed8b0 --- /dev/null +++ b/libgnn/include/galois/GraphNeuralNetwork.cuh @@ -0,0 +1,22 @@ +#ifndef GALOIS_GNN_GPU_CLASS +#define GALOIS_GNN_GPU_CLASS + +#include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" + +namespace galois { + +//! Helper class for a GNN: holds GPU arguments. In its own class so that the +//! compiler used for it can differ from the main CPU code +class GraphNeuralNetworkGPU { +public: + //! Gets accuracy of a prediction given pointers to the data on the GPU + float + GetGlobalAccuracyGPU(const galois::graphs::GNNGraphGPUAllocations& gpu_graph, + galois::GNNPhase phase, + const galois::PointerWithSize predictions); +}; + +} // namespace galois + +#endif diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9e7e2266d0..652b1cbfad 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -9,6 +9,10 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/GraphNeuralNetwork.cuh" +#endif + namespace galois { //////////////////////////////////////////////////////////////////////////////// @@ -144,6 +148,8 @@ class GraphNeuralNetwork { float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracyCPU(const PointerWithSize predictions); + //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -164,6 +170,10 @@ class GraphNeuralNetwork { DGAccumulator num_correct_; //! Used to count total number of things checked during accuracy calculation DGAccumulator total_checked_; +#ifdef GALOIS_ENABLE_GPU + //! Holds all GPU functions + GraphNeuralNetworkGPU gpu_object_; +#endif }; } // namespace galois diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index eb419ba26c..e669feac50 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -101,6 +101,17 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( const PointerWithSize predictions) { + // TODO mark as a forwarding argument? 
+#ifndef GALOIS_ENABLE_GPU + return GetGlobalAccuracyCPU(predictions); +#else + return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, + predictions); +#endif +} + +float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( + const PointerWithSize predictions) { // check owned nodes' accuracy size_t num_labels = graph_->GetNumLabelClasses(); assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); diff --git a/libgnn/src/GraphNeuralNetwork.cu b/libgnn/src/GraphNeuralNetwork.cu new file mode 100644 index 0000000000..a16c4b2b69 --- /dev/null +++ b/libgnn/src/GraphNeuralNetwork.cu @@ -0,0 +1,26 @@ +#include "galois/GraphNeuralNetwork.cuh" +#include "galois/Logging.h" + +float galois::GraphNeuralNetworkGPU::GetGlobalAccuracyGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, GNNPhase phase, + const PointerWithSize predictions) { + // get correct mask + char* mask_to_use = nullptr; + switch (phase) { + case GNNPhase::kTrain: + mask_to_use = gpu_graph.local_training_mask(); + break; + case GNNPhase::kValidate: + mask_to_use = gpu_graph.local_validation_mask(); + break; + case GNNPhase::kTest: + mask_to_use = gpu_graph.local_testing_mask(); + break; + default: + GALOIS_LOG_FATAL("Invalid phase specified"); + } + + // run accuracy check kernel on GPU + + return 0.0; +} From b9e3b32afdedb6ed91fdf8968e96118298987d4c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 17:28:28 -0600 Subject: [PATCH 433/660] Allocate GPU memory for Adam optimizer Adds a GPU Adam optimizer class that holds the allocations for the moments used in the adam optimizer on the GPU. Adds a gpu version of the adam test as well to make sure build is sane in its current state. The CPU optimizer class is also now split into the CPU/GPU paths depending on which build is being used. Next step is to do the adam optimizer on the GPU proper. --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNOptimizers.cuh | 30 +++++++++++++++ libgnn/include/galois/GNNOptimizers.h | 31 +++++++++++++++- libgnn/src/GNNOptimizers.cpp | 4 +- libgnn/src/GNNOptimizers.cu | 29 +++++++++++++++ libgnn/test/CMakeLists.txt | 3 ++ libgnn/test/gpu-adam-test.cpp | 49 +++++++++++++++++++++++++ 7 files changed, 144 insertions(+), 3 deletions(-) create mode 100644 libgnn/include/galois/GNNOptimizers.cuh create mode 100644 libgnn/src/GNNOptimizers.cu create mode 100644 libgnn/test/gpu-adam-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 61867f21c8..362fc7f773 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -36,6 +36,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu src/GraphNeuralNetwork.cu + src/GNNOptimizers.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) diff --git a/libgnn/include/galois/GNNOptimizers.cuh b/libgnn/include/galois/GNNOptimizers.cuh new file mode 100644 index 0000000000..13fcb97263 --- /dev/null +++ b/libgnn/include/galois/GNNOptimizers.cuh @@ -0,0 +1,30 @@ +#ifndef GALOIS_GPU_GNN_OPT +#define GALOIS_GPU_GNN_OPT + +#include +#include "galois/GNNTypes.h" + +namespace galois { + +//! Holds GPU memory for the adam optimizer as well as function definitions +//! for weight adjustment +class AdamOptimizerGPU { +public: + //! Initializes the moment vectors on the GPU based on provided sizes + AdamOptimizerGPU(const std::vector& trainable_layer_sizes, + size_t num_trainable); + //! 
Frees moment vectors and vector of pointers to moments + ~AdamOptimizerGPU(); + + GNNFloat* first_moment(size_t i) { return first_moments_[i]; }; + GNNFloat* second_moment(size_t i) { return second_moments_[i]; }; + +private: + size_t num_layers_; + std::vector first_moments_; + std::vector second_moments_; +}; + +} // namespace galois + +#endif diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index c0e8dd2582..9528612ef4 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -10,6 +10,10 @@ #include #include +#ifdef GALOIS_ENABLE_GPU +#include "galois/GNNOptimizers.cuh" +#endif + namespace galois { //! Virtual class; optimizers all need the descent function @@ -41,18 +45,35 @@ class AdamOptimizer : public BaseOptimizer { AdamOptimizer(const AdamConfiguration& config, const std::vector& trainable_layer_sizes, size_t num_trainable_layers) - : config_(config), num_trainable_layers_(num_trainable_layers), + : +#ifdef GALOIS_ENABLE_GPU + gpu_object_(trainable_layer_sizes, num_trainable_layers), +#endif + config_(config), num_trainable_layers_(num_trainable_layers), beta1_power_t_(num_trainable_layers_, config.beta1), beta2_power_t_(num_trainable_layers_, config.beta2) { // >= because only prefix will be considered otherwise assert(trainable_layer_sizes.size() >= num_trainable_layers_); +#ifndef GALOIS_ENABLE_GPU // allocate vectors based on # of trainable layers for (size_t i = 0; i < num_trainable_layers_; i++) { first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + // Pointer with size construction + p_first_moments_.emplace_back(first_moments_.back()); + p_second_moments_.emplace_back(second_moments_.back()); } assert(first_moments_.size() == num_trainable_layers_); assert(second_moments_.size() == num_trainable_layers_); +#else + // pointer with size initialization with GPU pointers + for (size_t i = 0; i < num_trainable_layers_; i++) { + p_first_moments_.emplace_back(gpu_object_.first_moment(i), + trainable_layer_sizes[i]); + p_second_moments_.emplace_back(gpu_object_.second_moment(i), + trainable_layer_sizes[i]); + } +#endif } //! Adam based gradient descent void GradientDescent(const std::vector& derivatives, @@ -60,12 +81,20 @@ class AdamOptimizer : public BaseOptimizer { size_t layer_number) final; private: +#ifdef GALOIS_ENABLE_GPU + AdamOptimizerGPU gpu_object_; +#endif + //! Configuration options for this layer AdamConfiguration config_; //! First moment vectors; one for each trainable layer std::vector> first_moments_; //! Second moment vectors; one for each trainable layer std::vector> second_moments_; + // PointerWithSize versions of first/second moments (for use in function + // to support GPU pointers as well + std::vector> p_first_moments_; + std::vector> p_second_moments_; //! 
Number of layers that can be trained (need moment vectors for each) size_t num_trainable_layers_; // power terms used in adam: updated by raising power every time update is diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 53088825fd..94d51310b9 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -9,8 +9,8 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == matrix->size()); // grab based on layer being used - std::vector& first_moment = first_moments_[layer_number]; - std::vector& second_moment = second_moments_[layer_number]; + PointerWithSize& first_moment = p_first_moments_[layer_number]; + PointerWithSize& second_moment = p_second_moments_[layer_number]; assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu new file mode 100644 index 0000000000..ff5b771b59 --- /dev/null +++ b/libgnn/src/GNNOptimizers.cu @@ -0,0 +1,29 @@ +#include "galois/GNNOptimizers.cuh" +#include "galois/CUDAUtil.h" + +galois::AdamOptimizerGPU::AdamOptimizerGPU( + const std::vector& trainable_layer_sizes, size_t num_trainable) { + num_layers_ = num_trainable; + first_moments_.resize(num_layers_); + second_moments_.resize(num_layers_); + + for (size_t layer = 0; layer < num_layers_; layer++) { + // initialize the moment vector memory then zero it all out + CUDA_CHECK(cudaMalloc((void**)(&(first_moments_[layer])), + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMalloc((void**)(&(second_moments_[layer])), + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(first_moments_[layer], 0, + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(second_moments_[layer], 0, + trainable_layer_sizes[layer] * sizeof(GNNFloat))); + } +} + +galois::AdamOptimizerGPU::~AdamOptimizerGPU() { + // loop through and free first/second moments + for (size_t layer = 0; layer < num_layers_; layer++) { + CUDA_FREE(first_moments_[layer]); + CUDA_FREE(second_moments_[layer]); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 01199c1247..9c7547b8d3 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -47,6 +47,9 @@ else() target_link_libraries(gpu-softmaxlayer-test galois_gnn) add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) + add_executable(gpu-adam-test gpu-adam-test.cpp) + target_link_libraries(gpu-adam-test galois_gnn) + #add_test(NAME gpu-adam-test COMMAND gpu-adam-test) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp new file mode 100644 index 0000000000..faee872bfa --- /dev/null +++ b/libgnn/test/gpu-adam-test.cpp @@ -0,0 +1,49 @@ +//! @file adam-test.cpp +//! 
Tests the adam optimizer +#include "galois/DistGalois.h" +#include "galois/GNNOptimizers.h" +#include "galois/Logging.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + // create sample config that is easy to trace + galois::AdamOptimizer::AdamConfiguration config; + config.alpha = 1; + config.beta1 = 0.5; + config.beta2 = 0.5; + config.epsilon = 0; + + std::vector layer_sizes = {2, 1}; + galois::AdamOptimizer adam(config, layer_sizes, 2); + printf("%p\n", &adam); + + // std::vector weights1 = {1, 1}; + // std::vector weights2 = {10}; + // std::vector grad1 = {1, 1}; + // std::vector grad2 = {10}; + + // adam.GradientDescent(grad1, &weights1, 0); + //// check weights + // GALOIS_LOG_ASSERT(weights1[0] == 0.0); + // GALOIS_LOG_ASSERT(weights1[1] == 0.0); + + // adam.GradientDescent(grad2, &weights2, 1); + // GALOIS_LOG_ASSERT(weights2[0] == 9.0); + + //// run again to check if adam keeps moments from before + // adam.GradientDescent(grad1, &weights1, 0); + //// check weights again (turns out derivative one ends up doing same thing) + // GALOIS_LOG_ASSERT(weights1[0] == -1.0); + // GALOIS_LOG_ASSERT(weights1[1] == -1.0); + + //// grad 2 again + // adam.GradientDescent(grad2, &weights2, 1); + // GALOIS_LOG_ASSERT(weights2[0] == 8.0); +} From af9c72a664438dc5003021ec10c3e3849f87110e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 17:57:54 -0600 Subject: [PATCH 434/660] GradientDescent call now uses PointerWithSize The gradient descent call in the optimizers now uses PointerWithSize rather than std::vectors. This is for compatibility with GPU pointers. Calls to the function have been changed throughout the code accordingly. --- libgnn/include/galois/GNNOptimizers.h | 8 ++++---- libgnn/src/GNNOptimizers.cpp | 12 ++++++++---- libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/test/CMakeLists.txt | 2 +- libgnn/test/adam-test.cpp | 8 ++++---- libgnn/test/gpu-adam-test.cpp | 11 +++++------ 6 files changed, 23 insertions(+), 20 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 9528612ef4..e649b73887 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -19,8 +19,8 @@ namespace galois { //! Virtual class; optimizers all need the descent function class BaseOptimizer { public: - virtual void GradientDescent(const std::vector& derivatives, - std::vector* matrix, + virtual void GradientDescent(PointerWithSize derivatives, + PointerWithSize matrix, size_t layer_number) = 0; }; @@ -76,8 +76,8 @@ class AdamOptimizer : public BaseOptimizer { #endif } //! 
Adam based gradient descent - void GradientDescent(const std::vector& derivatives, - std::vector* matrix, + void GradientDescent(PointerWithSize derivatives, + PointerWithSize matrix, size_t layer_number) final; private: diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 94d51310b9..fa1f4dd10c 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -4,9 +4,9 @@ #include void galois::AdamOptimizer::GradientDescent( - const std::vector& derivatives, std::vector* matrix, + PointerWithSize derivatives, PointerWithSize matrix, size_t layer_number) { - assert(derivatives.size() == matrix->size()); + assert(derivatives.size() == matrix.size()); // grab based on layer being used PointerWithSize& first_moment = p_first_moments_[layer_number]; @@ -14,9 +14,10 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); +#ifndef GALOIS_ENABLE_GPU // individual weight updates via gradients galois::do_all( - galois::iterate(static_cast(0), matrix->size()), + galois::iterate(static_cast(0), matrix.size()), [&](size_t i) { // moment estimate updates first_moment[i] = config_.beta1 * first_moment[i] + @@ -30,11 +31,14 @@ void galois::AdamOptimizer::GradientDescent( GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); // weight update using bias corrected moments - (matrix->data())[i] -= + (matrix.data())[i] -= config_.alpha * bias_correct_first / (std::sqrt(bias_correct_second) + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); +#else + // gpu_object_.DoAdamUpdate(first_moment.data(), second_moment.data(), ); +#endif // update the power terms for next update call beta1_power_t_[layer_number] *= config_.beta1; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 31cf58c6c7..5688e13c31 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -179,7 +179,7 @@ void galois::GNNLayer::ActivationDerivative( void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { - optimizer->GradientDescent(layer_weight_gradients_, &layer_weights_, + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, trainable_layer_number); } diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9c7547b8d3..54e9bd43af 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -49,7 +49,7 @@ else() add_executable(gpu-adam-test gpu-adam-test.cpp) target_link_libraries(gpu-adam-test galois_gnn) - #add_test(NAME gpu-adam-test COMMAND gpu-adam-test) + add_test(NAME gpu-adam-test COMMAND gpu-adam-test) endif() # TODO multi host tests? 
diff --git a/libgnn/test/adam-test.cpp b/libgnn/test/adam-test.cpp index dfdfcdad00..159e27c744 100644 --- a/libgnn/test/adam-test.cpp +++ b/libgnn/test/adam-test.cpp @@ -28,21 +28,21 @@ int main() { std::vector grad1 = {1, 1}; std::vector grad2 = {10}; - adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); // check weights GALOIS_LOG_ASSERT(weights1[0] == 0.0); GALOIS_LOG_ASSERT(weights1[1] == 0.0); - adam.GradientDescent(grad2, &weights2, 1); + adam.GradientDescent(grad2, weights2, 1); GALOIS_LOG_ASSERT(weights2[0] == 9.0); // run again to check if adam keeps moments from before - adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); // check weights again (turns out derivative one ends up doing same thing) GALOIS_LOG_ASSERT(weights1[0] == -1.0); GALOIS_LOG_ASSERT(weights1[1] == -1.0); // grad 2 again - adam.GradientDescent(grad2, &weights2, 1); + adam.GradientDescent(grad2, weights2, 1); GALOIS_LOG_ASSERT(weights2[0] == 8.0); } diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index faee872bfa..24a19fb66c 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -22,14 +22,13 @@ int main() { std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); - printf("%p\n", &adam); - // std::vector weights1 = {1, 1}; - // std::vector weights2 = {10}; - // std::vector grad1 = {1, 1}; - // std::vector grad2 = {10}; + std::vector weights1 = {1, 1}; + std::vector weights2 = {10}; + std::vector grad1 = {1, 1}; + std::vector grad2 = {10}; - // adam.GradientDescent(grad1, &weights1, 0); + adam.GradientDescent(grad1, weights1, 0); //// check weights // GALOIS_LOG_ASSERT(weights1[0] == 0.0); // GALOIS_LOG_ASSERT(weights1[1] == 0.0); From 599ee5f4432e2255ef1e0676c3485e62c384a3a9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 18 Nov 2020 18:43:22 -0600 Subject: [PATCH 435/660] Adam optimizer on GPU + test done; CPU fix Implements Adam optimization on the GPU and makes sure it's sane via the gpu unit test. Also fixes an inconsistency with the CPU adam optimizer where a sqrt wasn't being applied to epsilon like it is in the original non-refactored code. --- libgnn/include/galois/GNNOptimizers.cuh | 11 +++++ libgnn/include/galois/GNNOptimizers.h | 6 +++ libgnn/src/GNNOptimizers.cpp | 11 +++-- libgnn/src/GNNOptimizers.cu | 39 ++++++++++++++++++ libgnn/test/gpu-adam-test.cpp | 54 ++++++++++++++++++------- 5 files changed, 103 insertions(+), 18 deletions(-) diff --git a/libgnn/include/galois/GNNOptimizers.cuh b/libgnn/include/galois/GNNOptimizers.cuh index 13fcb97263..42f499b557 100644 --- a/libgnn/include/galois/GNNOptimizers.cuh +++ b/libgnn/include/galois/GNNOptimizers.cuh @@ -19,6 +19,17 @@ public: GNNFloat* first_moment(size_t i) { return first_moments_[i]; }; GNNFloat* second_moment(size_t i) { return second_moments_[i]; }; + //! Calls into a GPU kernel; needs to be done this way as this cuh is included + //! in a GCC build, so the kernel cannot be defined in this header. + void AdamUpdate(const GNNFloat* derivatives, GNNFloat* matrix_to_update, + size_t matrix_size, GNNFloat* first_moment, + GNNFloat* second_moment, GNNFloat alpha, GNNFloat beta1, + GNNFloat beta2, GNNFloat epsilon, GNNFloat beta1t, + GNNFloat beta2t); + + //! 
Helper to copy gpu pointer to cpu vector + void CopyToVector(std::vector& to, PointerWithSize from); + private: size_t num_layers_; std::vector first_moments_; diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index e649b73887..86a656fd30 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -80,6 +80,12 @@ class AdamOptimizer : public BaseOptimizer { PointerWithSize matrix, size_t layer_number) final; +#ifdef GALOIS_ENABLE_GPU + //! helper function for unit testing to do some vector copying + void CopyToVector(std::vector& to, PointerWithSize from) { + gpu_object_.CopyToVector(to, from); + } +#endif private: #ifdef GALOIS_ENABLE_GPU AdamOptimizerGPU gpu_object_; diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index fa1f4dd10c..566b61c14e 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -31,13 +31,16 @@ void galois::AdamOptimizer::GradientDescent( GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2_power_t_[layer_number]); // weight update using bias corrected moments - (matrix.data())[i] -= - config_.alpha * bias_correct_first / - (std::sqrt(bias_correct_second) + config_.epsilon); + (matrix.data())[i] -= config_.alpha * bias_correct_first / + std::sqrt(bias_correct_second + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); #else - // gpu_object_.DoAdamUpdate(first_moment.data(), second_moment.data(), ); + gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), + first_moment.data(), second_moment.data(), + config_.alpha, config_.beta1, config_.beta2, + config_.epsilon, beta1_power_t_[layer_number], + beta2_power_t_[layer_number]); #endif // update the power terms for next update call diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu index ff5b771b59..77f3e74f5f 100644 --- a/libgnn/src/GNNOptimizers.cu +++ b/libgnn/src/GNNOptimizers.cu @@ -27,3 +27,42 @@ galois::AdamOptimizerGPU::~AdamOptimizerGPU() { CUDA_FREE(second_moments_[layer]); } } +void galois::AdamOptimizerGPU::CopyToVector(std::vector& to, + PointerWithSize from) { + CUDA_CHECK(cudaMemcpy(to.data(), from.data(), to.size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} + +namespace { + +__global__ void DoAdamUpdate(const galois::GNNFloat* derivatives, + galois::GNNFloat* matrix_to_update, + size_t matrix_size, galois::GNNFloat* first_moment, + galois::GNNFloat* second_moment, + galois::GNNFloat alpha, galois::GNNFloat beta1, + galois::GNNFloat beta2, galois::GNNFloat epsilon, + galois::GNNFloat beta1t, galois::GNNFloat beta2t) { + CUDA_KERNEL_LOOP(i, matrix_size) { + first_moment[i] = beta1 * first_moment[i] + (1.0 - beta1) * derivatives[i]; + second_moment[i] = beta2 * second_moment[i] + + (1.0 - beta2) * (derivatives[i] * derivatives[i]); + // bias corrected moments using beta power + galois::GNNFloat bias_correct_first = first_moment[i] / (1.0 - beta1t); + galois::GNNFloat bias_correct_second = second_moment[i] / (1.0 - beta2t); + // weight update using bias corrected moments + matrix_to_update[i] -= + alpha * bias_correct_first / sqrtf(bias_correct_second + epsilon); + } +} + +} // namespace + +void galois::AdamOptimizerGPU::AdamUpdate( + const GNNFloat* derivatives, GNNFloat* matrix_to_update, size_t matrix_size, + GNNFloat* first_moment, GNNFloat* second_moment, GNNFloat alpha, + GNNFloat beta1, GNNFloat beta2, GNNFloat epsilon, GNNFloat beta1t, + GNNFloat beta2t) { + DoAdamUpdate<<>>( + derivatives, 
matrix_to_update, matrix_size, first_moment, second_moment, + alpha, beta1, beta2, epsilon, beta1t, beta2t); +} diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index 24a19fb66c..a1d0c1961e 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -3,6 +3,7 @@ #include "galois/DistGalois.h" #include "galois/GNNOptimizers.h" #include "galois/Logging.h" +#include "galois/layers/SoftmaxLayer.h" int main() { galois::DistMemSys G; @@ -23,26 +24,51 @@ int main() { std::vector layer_sizes = {2, 1}; galois::AdamOptimizer adam(config, layer_sizes, 2); + // make this layer to get access to a gpu helper function; TODO + // need a helper alloc function + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + auto alloc_layer = + std::make_unique(3, test_graph, dimension_0); + std::vector weights1 = {1, 1}; std::vector weights2 = {10}; std::vector grad1 = {1, 1}; std::vector grad2 = {10}; - adam.GradientDescent(grad1, weights1, 0); - //// check weights - // GALOIS_LOG_ASSERT(weights1[0] == 0.0); - // GALOIS_LOG_ASSERT(weights1[1] == 0.0); + galois::PointerWithSize p_grad1 = + alloc_layer->AllocateGPU(grad1); + galois::PointerWithSize p_weights1 = + alloc_layer->AllocateGPU(weights1); + galois::PointerWithSize p_grad2 = + alloc_layer->AllocateGPU(grad2); + galois::PointerWithSize p_weights2 = + alloc_layer->AllocateGPU(weights2); + + adam.GradientDescent(p_grad1, p_weights1, 0); + adam.CopyToVector(weights1, p_weights1); + + // check weights + GALOIS_LOG_ASSERT(weights1[0] == 0.0); + GALOIS_LOG_ASSERT(weights1[1] == 0.0); - // adam.GradientDescent(grad2, &weights2, 1); - // GALOIS_LOG_ASSERT(weights2[0] == 9.0); + adam.GradientDescent(p_grad2, p_weights2, 1); + adam.CopyToVector(weights2, p_weights2); + GALOIS_LOG_ASSERT(weights2[0] == 9.0); - //// run again to check if adam keeps moments from before - // adam.GradientDescent(grad1, &weights1, 0); - //// check weights again (turns out derivative one ends up doing same thing) - // GALOIS_LOG_ASSERT(weights1[0] == -1.0); - // GALOIS_LOG_ASSERT(weights1[1] == -1.0); + // run again to check if adam keeps moments from before + adam.GradientDescent(p_grad1, p_weights1, 0); + adam.CopyToVector(weights1, p_weights1); + // check weights again (turns out derivative one ends up doing same thing) + GALOIS_LOG_ASSERT(weights1[0] == -1.0); + GALOIS_LOG_ASSERT(weights1[1] == -1.0); - //// grad 2 again - // adam.GradientDescent(grad2, &weights2, 1); - // GALOIS_LOG_ASSERT(weights2[0] == 8.0); + // grad 2 again + adam.GradientDescent(p_grad2, p_weights2, 1); + adam.CopyToVector(weights2, p_weights2); + GALOIS_LOG_ASSERT(weights2[0] == 8.0); } From 5fd1abfebb755d6c19b0960177e395dbede5c76c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 19 Nov 2020 18:36:08 -0600 Subject: [PATCH 436/660] gpu epoch test; fixes to returned pointers Adds a gpu version of the epoch test and fixes the pointers returned from a GNN layer (it was always returning CPU pointers even in the GPU build). Adds error checking to cuSparse call too. gpu-epoch-test runs a GNN end to end (still missing some features that CPU has), but it has to copy predictions over from GPU (slow, should do this from GPU end) + there seem to be accuracy issues on reddit. Will be resolved in a later commit. 
--- libgnn/include/galois/layers/GNNLayer.h | 6 +-- libgnn/src/GNNMath.cu | 1 + libgnn/src/GraphNeuralNetwork.cpp | 11 ++-- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/gpu-epoch-test.cpp | 69 +++++++++++++++++++++++++ 5 files changed, 82 insertions(+), 9 deletions(-) create mode 100644 libgnn/test/gpu-epoch-test.cpp diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f4acec8f25..3296b17d20 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -88,16 +88,16 @@ class GNNLayer { } const PointerWithSize GetForwardOutput() { - return PointerWithSize(forward_output_matrix_); + return p_forward_output_matrix_; } const PointerWithSize GetBackwardOutput() { - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } //! Returns the weight gradients const PointerWithSize GetLayerWeightGradients() { - return PointerWithSize(layer_weight_gradients_); + return p_layer_weight_gradients_; } //! Returns dimensions of this layer diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 5b429dafb2..8f60f91d84 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -26,6 +26,7 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, output_columns, input_rows, input_columns, &dummy1, b, lead_dim_b, a, lead_dim_a, &dummy0, output, output_columns)); + CUDA_TEST("cublas sgemm failure"); } __global__ void galois::SoftmaxCrossEntropyForward( diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index e669feac50..ebe486b47a 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -102,12 +102,12 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( const PointerWithSize predictions) { // TODO mark as a forwarding argument? -#ifndef GALOIS_ENABLE_GPU + //#ifndef GALOIS_ENABLE_GPU return GetGlobalAccuracyCPU(predictions); -#else - return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, - predictions); -#endif + //#else + // return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, + // predictions); + //#endif } float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( @@ -158,7 +158,6 @@ void galois::GraphNeuralNetwork::GradientPropagation() { std::unique_ptr& output_layer = gnn_layers_.back(); galois::PointerWithSize current_gradients = output_layer->BackwardPhase(dummy, nullptr); - // loops through intermediate layers in a backward fashion // -1 to ignore output layer which was handled above for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 54e9bd43af..c900c7318c 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -50,6 +50,10 @@ else() add_executable(gpu-adam-test gpu-adam-test.cpp) target_link_libraries(gpu-adam-test galois_gnn) add_test(NAME gpu-adam-test COMMAND gpu-adam-test) + + add_executable(gpu-epoch-test gpu-epoch-test.cpp) + target_link_libraries(gpu-epoch-test galois_gnn) + #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp new file mode 100644 index 0000000000..6223fce8e5 --- /dev/null +++ b/libgnn/test/gpu-epoch-test.cpp @@ -0,0 +1,69 @@ +//! @file epoch-test.cpp +//! Run 50 epochs of training to see if results improve. 
+ +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = false; + layer_config.do_activation = false; + layer_config.do_normalization = true; + // XXX Activation kills accuracy compared to old code, esp. for cora + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, + layer_config); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + std::vector cpu_pred; + cpu_pred.resize(test_graph->GetNumLabelClasses() * test_graph->size()); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); + for (size_t epoch = 0; epoch < 50; epoch++) { + galois::PointerWithSize predictions = gnn->DoInference(); + if (cpu_pred.size() != predictions.size()) { + cpu_pred.resize(predictions.size()); + } + gnn->GradientPropagation(); + // copy to cpu + // TODO currently adam has this helper function; it should be handled + // by other class though + adam->CopyToVector(cpu_pred, predictions); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(cpu_pred), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + galois::PointerWithSize predictions = gnn->DoInference(); + adam->CopyToVector(cpu_pred, predictions); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(cpu_pred), "\n"); + main_timer.stop(); +} From e500cb06a4c2313e6d948f8053b2efa32b44888d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 17:35:37 -0600 Subject: [PATCH 437/660] GNN: Copy over norm factors from CPU to GPU Norm factors are required during aggregation in order for the current computation on GPU to match CPU computation (earlier I was under the impression that norm factors were integrated into the data that was already copied, but this is incorrect). This commit adds the norm factor copy from CPU to GPU. --- libgnn/include/galois/graphs/GNNGraph.cuh | 4 ++++ libgnn/src/graphs/GNNGraph.cpp | 1 + libgnn/src/graphs/GNNGraph.cu | 9 +++++++++ 3 files changed, 14 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index d485808972..81bf00971a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,6 +23,8 @@ public: //! Copy over masks for the 3 sets to GPU void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); + //! 
Copy over norm factors + void SetNormFactors(const std::vector norm_factors); GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } @@ -58,6 +60,8 @@ private: char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; char* local_testing_mask_{nullptr}; + //! Norm factors used during aggregation + GNNFloat* norm_factors_; }; } // namespace graphs diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cbdf5e13db..059759a81e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -419,5 +419,6 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetLabels(local_ground_truth_labels_); gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, local_testing_mask_); + gpu_memory_.SetNormFactors(norm_factors_); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index b0d5c1eb43..96ba37db15 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -82,3 +82,12 @@ void galois::graphs::GNNGraphGPUAllocations::SetMasks( CUDA_CHECK(cudaMemcpy(local_testing_mask_, test.data(), test.size() * sizeof(char), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::SetNormFactors( + const std::vector norm_factors) { + CUDA_CHECK(cudaMalloc((void**)(&norm_factors_), + norm_factors.size() * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemcpy(norm_factors_, norm_factors.data(), + norm_factors.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} From 0cdaaf5146f0c0bcc447558248f697aa2bf41af6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 18:08:22 -0600 Subject: [PATCH 438/660] GPU GCN aggregation uses norm factors Aggregation in the GPU for GCN now uses norm factors to normalize the aggregations of neighbors. This change allows it to exactly match computation done on a CPU if dropout is turned off. The next step is to add dropout support to the GPU. 
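Concretely, the kernel below scales every neighbor contribution by the product
of the source and destination norm factors. As a sketch (the factors
themselves are whatever the CPU-side graph setup computed; for GCN they are
typically the inverse square roots of the node degrees):

  agg(v) = \sum_{u \in N(v)} n_v \, n_u \, h_u, \qquad n_v \approx \frac{1}{\sqrt{\deg(v)}}

which is the standard symmetric GCN normalization.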
--- libgnn/include/galois/graphs/GNNGraph.cuh | 3 +- .../galois/layers/GraphConvolutionalLayer.cuh | 2 +- libgnn/src/GraphNeuralNetwork.cu | 1 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/layers/GraphConvolutionalLayer.cu | 35 ++++++++++++++----- libgnn/test/gpu-epoch-test.cpp | 2 +- 6 files changed, 32 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 81bf00971a..2012dcd7c9 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -29,12 +29,11 @@ public: GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } int* edge_destinations() const { return edge_destinations_; } - GNNLabel* ground_truth() const { return ground_truth_; } - char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } char* local_testing_mask() const { return local_testing_mask_; } + GNNFloat* norm_factors() const { return norm_factors_; } private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index fd4d9d76f0..c59617828d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -18,7 +18,7 @@ public: void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output); + GNNFloat* aggregate_output, bool use_norm); void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cu b/libgnn/src/GraphNeuralNetwork.cu index a16c4b2b69..2d04073563 100644 --- a/libgnn/src/GraphNeuralNetwork.cu +++ b/libgnn/src/GraphNeuralNetwork.cu @@ -21,6 +21,7 @@ float galois::GraphNeuralNetworkGPU::GetGlobalAccuracyGPU( } // run accuracy check kernel on GPU + // TODO finish this implementation return 0.0; } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index ef9d3cbb03..04fea5f286 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -172,7 +172,8 @@ void galois::GraphConvolutionalLayer::AggregateAll( AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); #else gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, aggregate_output); + column_length, node_embeddings, aggregate_output, + config_.do_normalization); #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 7828336b28..882cb32391 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -23,6 +23,7 @@ namespace { __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, const int* edge_index, const int* edge_destination, + const galois::GNNFloat* norm_factors, const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output) { const unsigned thread_id = @@ -41,6 +42,13 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, // each warp works on a source: threads in warp split the feature for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + 
galois::GNNFloat src_norm = 0.0; + galois::GNNFloat norm_to_use = 1.0; + + if (norm_factors != nullptr) { + src_norm = norm_factors[src]; + } + if (thread_lane < 2) { edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; } @@ -56,19 +64,20 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, int dst = edge_destination[offset]; unsigned base_dst_index = dst * column_length; + if (norm_factors != nullptr) { + // note that otherwise it's 1.0, so a no-op when it comes to multiply + norm_to_use = src_norm * norm_factors[dst]; + } + // NOTE: this is where warp diverges // the feature aggregation is split among thread in a warp for (int i = 0; i < column_length; i += WARP_SIZE) { if ((thread_lane + i) < column_length) { aggregate_output[base_src_index + thread_lane + i] += - node_embeddings[base_dst_index + thread_lane + i]; + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; } } } - //__syncthreads(); - // if (thread_lane == 0) { - // printf("Agg %d %f\n", src, aggregate_output[base_src_index]); - //} } } @@ -77,12 +86,20 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output) { + GNNFloat* aggregate_output, bool use_norm) { CUDA_CHECK(cudaMemset(aggregate_output, 0, num_nodes * column_length * sizeof(GNNFloat))); - AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( - num_nodes, column_length, gpu_graph.edge_index(), - gpu_graph.edge_destinations(), node_embeddings, aggregate_output); + if (use_norm) { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), gpu_graph.norm_factors(), + node_embeddings, aggregate_output); + } else { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), nullptr, node_embeddings, + aggregate_output); + } CUDA_TEST("GPU aggregate all failure"); } diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 6223fce8e5..7778550875 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -46,7 +46,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 50; epoch++) { + for (size_t epoch = 0; epoch < 20; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); if (cpu_pred.size() != predictions.size()) { cpu_pred.resize(predictions.size()); From 982a0bc783d92cdf3c3b398c794f72b7ff968d64 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 18:59:18 -0600 Subject: [PATCH 439/660] Init function for CuRAND Efficient dropout support requires RNG on the GPU: this commit adds a function to init the CuRAND RNG so that the GPU can generate the random numbers required to choose things to drop for dropout. 
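As a rough sketch of how the generator will typically feed dropout
(illustrative only; the names below are made up and the real dropout kernels
land in later commits): one uniform draw is generated per element, and the
mask keeps an element only if its draw falls under the keep probability,
scaling kept elements by 1 / keep_probability (inverted dropout).

  #include <curand.h>

  // Fill a GPU buffer with uniform (0, 1] draws, one per input element.
  void GenerateDropoutRandoms(curandGenerator_t generator,
                              float* gpu_rng_buffer, size_t num_elements) {
    CURAND_CHECK(curandGenerateUniform(generator, gpu_rng_buffer, num_elements));
  }

  // Threshold the draws into an inverted-dropout mask on the GPU.
  __global__ void BuildDropoutMask(const float* rng, float* mask,
                                   size_t num_elements, float keep_probability) {
    CUDA_KERNEL_LOOP(i, num_elements) {
      mask[i] = (rng[i] <= keep_probability) ? (1.0f / keep_probability) : 0.0f;
    }
  }
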
--- libgnn/CMakeLists.txt | 4 ++-- libgnn/include/galois/CUDAUtil.h | 11 +++++++++++ libgnn/include/galois/GNNMath.cuh | 4 ++++ libgnn/src/GNNMath.cu | 14 ++++++++++++-- 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 362fc7f773..baee47c3fb 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -47,8 +47,8 @@ if (GALOIS_ENABLE_GPU) ${CMAKE_CURRENT_SOURCE_DIR}/include ) - # link to gpu lib (which takes care of moderngpu and cub) - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas) + # link to gpu lib (which takes care of moderngpu and cub) as well as cu libs + target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas -lcurand) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index 51be6cd102..fd51eb1362 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -6,6 +6,7 @@ //! https://github.com/BVLC/caffe/blob/master/include/caffe/util/device_alternate.hpp #include #include +#include #include "galois/Logging.h" // TODO check these too and make sure they make sense @@ -71,4 +72,14 @@ inline int CUDA_GET_BLOCKS(const int N) { } \ } while (0) +//! Wrap a CuRAND call with this to check if it threw any errors +#define CURAND_CHECK(condition) \ + do { \ + curandStatus_t status = condition; \ + if (status != CURAND_STATUS_SUCCESS) { \ + GALOIS_LOG_ERROR("CuRAND error code : {}", status); \ + exit(EXIT_FAILURE); \ + } \ + } while (0) + #endif diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index aca14a573f..40402f325b 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -7,9 +7,13 @@ namespace galois { extern bool cublas_is_init; extern cublasHandle_t global_cublas_handle; +extern bool curand_is_init; +extern curandGenerator_t global_curand_generator; //! Initializes the cublas handle to use cublas on GPUs. void InitCuBLAS(); +//! Initializes the curand RNG +void InitCuRAND(); //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 8f60f91d84..026ca17265 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -2,8 +2,19 @@ bool galois::cublas_is_init = false; cublasHandle_t galois::global_cublas_handle; +bool galois::curand_is_init = false; +curandGenerator_t galois::global_curand_generator; -void galois::InitCuBLAS() { CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); } +void galois::InitCuBLAS() { + CUBLAS_CHECK(cublasCreate(&global_cublas_handle)); + galois::cublas_is_init = true; +} + +void galois::InitCuRAND() { + CURAND_CHECK(curandCreateGenerator(&galois::global_curand_generator, + CURAND_RNG_PSEUDO_DEFAULT)); + galois::curand_is_init = true; +} void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, @@ -12,7 +23,6 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, GNNFloat* output) { if (!cublas_is_init) { InitCuBLAS(); - cublas_is_init = true; } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? 
output_columns : input_columns; From 256c560e74ed35f0b2c90edb0375a06de9a95848 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 24 Nov 2020 19:34:51 -0600 Subject: [PATCH 440/660] GNNLayer GPU: init dropout memory Initializes a dropout mask for every GPU layer. Can be optimized if dropout is disabled (i.e. do not allocate) for both CPU/GPUs. This will be handled later once a base implementation of everything is settled. It is a float because the float will be checked during dropout to see if it crosses some threshold for dropout. --- libgnn/include/galois/layers/GNNLayer.cuh | 3 +++ libgnn/src/layers/GNNLayer.cpp | 2 ++ libgnn/src/layers/GNNLayer.cu | 6 ++++++ 3 files changed, 11 insertions(+) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 387b1673c4..0e00515302 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -12,6 +12,8 @@ public: void InitInOutMemory(size_t forward_size, size_t backward_size); //! Initializes memory for weight and weight gradients on GPU void InitWeightMemory(size_t num_weights); + //! Initializes memory for dropout + void InitDropoutMemory(size_t dropout_size); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); //! Copy GPU forward output to the provided vector (assumes vector is already @@ -42,6 +44,7 @@ private: GNNFloat* backward_output_matrix_{nullptr}; GNNFloat* layer_weights_{nullptr}; GNNFloat* layer_weight_gradients_{nullptr}; + GNNFloat* dropout_mask_{nullptr}; }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 5688e13c31..8044eef6cf 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -20,6 +20,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); #endif GlorotBengioInit(&layer_weights_); diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 597fba96bd..dc70817dcf 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -30,6 +30,12 @@ void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { num_weights * sizeof(GNNFloat))); } +void galois::GNNLayerGPUAllocations::InitDropoutMemory(size_t dropout_size) { + CUDA_CHECK( + cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(GNNFloat))); +} + void galois::GNNLayerGPUAllocations::CopyToWeights( const std::vector& cpu_layer_weights) { CUDA_CHECK(cudaMemcpy(layer_weights_, cpu_layer_weights.data(), From 0fa6ea70543e14e01ac4578edcbfb4b032efc5c0 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 12:33:13 -0600 Subject: [PATCH 441/660] Added CuRAND uniform generate wrapper call Wrapper call to generate random numbers in an array on the GPU. --- libgnn/include/galois/GNNMath.cuh | 3 +++ libgnn/src/GNNMath.cu | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 40402f325b..1b262fa6a3 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -15,6 +15,9 @@ void InitCuBLAS(); //! Initializes the curand RNG void InitCuRAND(); +//! 
Initializes an array with random numbers (0.0, 1.0] +void CuRANDUniformRNG(GNNFloat* array_to_fill, size_t num_elements); + //! Takes 2 *row-major* matrices and does a matrix multiply on the GPU using //! CuBLAS. void CBlasSGEMMGPU(const cublasOperation_t trans_a, diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 026ca17265..8771b75d5b 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -16,6 +16,15 @@ void galois::InitCuRAND() { galois::curand_is_init = true; } +void galois::CuRANDUniformRNG(GNNFloat* array_to_fill, size_t num_elements) { + // TODO how much overhead does this check have? + if (!galois::curand_is_init) { + galois::InitCuRAND(); + } + CURAND_CHECK(curandGenerateUniform(galois::global_curand_generator, + array_to_fill, num_elements)); +} + void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, const cublasOperation_t trans_b, size_t input_rows, size_t input_columns, size_t output_columns, From b52e1cbc6e9e6024002541efabceb294248fa898 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 17:10:40 -0600 Subject: [PATCH 442/660] GNN GPU: forward phase dropout Adds code necessary for forward phase of dropout. Adds a data structure for storing chars (result of dropout rng) and renames vars accordingly (floats are converted to bools depending on threshold). Fixes the variables passed into the dropout call as well as before it was using CPU pointers: this commit changes the pointer to use the PointerWithSize objects allocated for this purpose. --- libgnn/include/galois/PerThreadRNG.h | 2 + libgnn/include/galois/layers/GNNLayer.cuh | 7 +++- libgnn/src/layers/GNNLayer.cpp | 10 ++--- libgnn/src/layers/GNNLayer.cu | 37 ++++++++++++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 +-- 5 files changed, 50 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/PerThreadRNG.h b/libgnn/include/galois/PerThreadRNG.h index fde88386ab..441b1b542c 100644 --- a/libgnn/include/galois/PerThreadRNG.h +++ b/libgnn/include/galois/PerThreadRNG.h @@ -26,6 +26,8 @@ class PerThreadRNG { } //! Return true or false based on some dropout rate bool DoBernoulli(float dropout_rate) { + // TODO can the random number be 0? what is the behavior of 0 > 0? + // same with 1 > 1..... return (GetRandomNumber() > dropout_rate) ? 1 : 0; } diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 0e00515302..fed3f12402 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -29,6 +29,10 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Does dropout on the GPU; saves non-dropped weights to output + void DoDropoutGPU(const PointerWithSize input_to_dropout, + PointerWithSize output, float dropout_rate); + //! Helper function: give a vector which is copied over to the GPU (new //! 
memory is allocated as necessary) GNNFloat* Allocate(const std::vector& v); @@ -44,7 +48,8 @@ private: GNNFloat* backward_output_matrix_{nullptr}; GNNFloat* layer_weights_{nullptr}; GNNFloat* layer_weight_gradients_{nullptr}; - GNNFloat* dropout_mask_{nullptr}; + GNNFloat* rng_results_{nullptr}; + char* dropout_mask_{nullptr}; }; } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8044eef6cf..3541442bed 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -128,12 +128,12 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - //#ifdef GALOIS_ENABLE_GPU - // // XXX - // DoDropoutGPU(); - //#else +#ifdef GALOIS_ENABLE_GPU + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); +#else DoDropoutCPU(input_to_dropout, output_matrix); - //#endif +#endif } void galois::GNNLayer::DoDropoutDerivative() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index dc70817dcf..0e43a478be 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -1,4 +1,5 @@ #include "galois/CUDAUtil.h" +#include "galois/GNNMath.cuh" #include "galois/layers/GNNLayer.cuh" galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { @@ -32,8 +33,11 @@ void galois::GNNLayerGPUAllocations::InitWeightMemory(size_t num_weights) { void galois::GNNLayerGPUAllocations::InitDropoutMemory(size_t dropout_size) { CUDA_CHECK( - cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(GNNFloat))); - CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(GNNFloat))); + cudaMalloc((void**)(&rng_results_), dropout_size * sizeof(GNNFloat))); + CUDA_CHECK(cudaMemset(rng_results_, 0, dropout_size * sizeof(GNNFloat))); + + CUDA_CHECK(cudaMalloc((void**)(&dropout_mask_), dropout_size * sizeof(char))); + CUDA_CHECK(cudaMemset(dropout_mask_, 0, dropout_size * sizeof(char))); } void galois::GNNLayerGPUAllocations::CopyToWeights( @@ -64,6 +68,35 @@ void galois::GNNLayerGPUAllocations::CopyWeightGradientsToCPU( cudaMemcpyDeviceToHost)); } +namespace { + +__global__ void +DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, + galois::GNNFloat* output, const galois::GNNFloat* rng_vector, + char* dropout_mask, float dropout_rate, galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, input_size) { + // convert the rng floats into a mask + dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; + // use mask to keep/drop weights + output[i] = input_to_dropout[i] * dropout_mask[i] * scale; + } +} + +} // namespace + +void galois::GNNLayerGPUAllocations::DoDropoutGPU( + const PointerWithSize input_to_dropout, + PointerWithSize output, float dropout_rate) { + // RNG which weights to dropout + galois::CuRANDUniformRNG(rng_results_, input_to_dropout.size()); + GNNFloat scale = 1. / (1. 
- dropout_rate); + // GPU dropout kernel + DoDropoutImpl<<>>( + input_to_dropout.size(), input_to_dropout.data(), output.data(), + rng_results_, dropout_mask_, dropout_rate, scale); + CUDA_TEST("Dropout on GPU failure"); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 04fea5f286..d070afa1ef 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -53,9 +53,8 @@ galois::GraphConvolutionalLayer::ForwardPhase( const GNNFloat* input_data = input_embeddings.data(); // first, dropout if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { - galois::PointerWithSize drop_output(in_temp_1_); - DoDropout(input_embeddings, &drop_output); - input_data = drop_output.data(); + DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); } // flip aggregate/update if dimensions favor it (do less work) From a186e5836ef35c0fee1964859f95688afde3a463 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 25 Nov 2020 17:48:43 -0600 Subject: [PATCH 443/660] GPU GNN: backward phase dropout derivative Adds the GPU wrapper + kernel call for the derivative of dropout for the backward phase of the GCN layer. Makes the epoch tests for both CPUs and GPUs equivalent as well. With this commit, single CPU/GPU functionality for the GCN is roughly the same. Note that answers will not be the same due to dropout RNG being different on the 2 platforms. --- libgnn/include/galois/layers/GNNLayer.cuh | 2 ++ libgnn/src/layers/GNNLayer.cpp | 11 ++++++++--- libgnn/src/layers/GNNLayer.cu | 18 +++++++++++++++++- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gpu-epoch-test.cpp | 6 +++--- 5 files changed, 31 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index fed3f12402..9dfd09e0da 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -32,6 +32,8 @@ public: //! Does dropout on the GPU; saves non-dropped weights to output void DoDropoutGPU(const PointerWithSize input_to_dropout, PointerWithSize output, float dropout_rate); + //! Does dropout derivative on the backward output matrix of the gpu + void DoDropoutDerivativeGPU(size_t input_size, GNNFloat scale); //! Helper function: give a vector which is copied over to the GPU (new //! memory is allocated as necessary) diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 3541442bed..aff4bc3b11 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -128,11 +128,11 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { -#ifdef GALOIS_ENABLE_GPU +#ifndef GALOIS_ENABLE_GPU + DoDropoutCPU(input_to_dropout, output_matrix); +#else base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, config_.dropout_rate); -#else - DoDropoutCPU(input_to_dropout, output_matrix); #endif } @@ -140,6 +140,7 @@ void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); +#ifndef GALOIS_ENABLE_GPU // use dropout mask to figure out derivative galois::do_all( galois::iterate(static_cast(0), backward_output_matrix_.size()), @@ -149,6 +150,10 @@ void galois::GNNLayer::DoDropoutDerivative() { scale; }, galois::loopname("LayerDropoutDerivative")); +#else + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); +#endif } void galois::GNNLayer::Activation() { diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index 0e43a478be..d6616be5fe 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -78,7 +78,16 @@ DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, // convert the rng floats into a mask dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; // use mask to keep/drop weights - output[i] = input_to_dropout[i] * dropout_mask[i] * scale; + output[i] = input_to_dropout[i] * (float)dropout_mask[i] * scale; + } +} + +__global__ void DoDropoutDerivativeImpl(size_t input_size, + galois::GNNFloat* input, + char* dropout_mask, + galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, input_size) { + input[i] = input[i] * (float)dropout_mask[i] * scale; } } @@ -97,6 +106,13 @@ void galois::GNNLayerGPUAllocations::DoDropoutGPU( CUDA_TEST("Dropout on GPU failure"); } +void galois::GNNLayerGPUAllocations::DoDropoutDerivativeGPU(size_t input_size, + GNNFloat scale) { + DoDropoutDerivativeImpl<<>>( + input_size, backward_output_matrix_, dropout_mask_, scale); + CUDA_TEST("Dropout derivative on GPU failure"); +} + galois::GNNFloat* galois::GNNLayerGPUAllocations::Allocate(const std::vector& v) { // TODO keep track of these so that on destruction they can be freed diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index da2a9e1be2..d8a27cc13b 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -43,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 20; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); gnn->GradientPropagation(); galois::gPrint("Epoch ", epoch, ": Accuracy is ", diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 7778550875..3a481b9d66 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -14,7 +14,7 @@ int main() { // load graph auto test_graph = std::make_unique( - "reddit", galois::graphs::GNNPartitionScheme::kCVC, true); + "cora", galois::graphs::GNNPartitionScheme::kCVC, true); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -22,7 +22,7 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = false; + layer_config.do_dropout = true; layer_config.do_activation = false; layer_config.do_normalization = true; // XXX Activation kills accuracy compared to old code, esp. 
for cora @@ -46,7 +46,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 20; epoch++) { + for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); if (cpu_pred.size() != predictions.size()) { cpu_pred.resize(predictions.size()); From 461d57571f1159d8add0005fe664878a089866e1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Dec 2020 19:57:30 -0600 Subject: [PATCH 444/660] Gradient sync to sum, master only softmax Changes the gradient sync function to use sum instead of average, and make it so the softmax layer only loops over the master nodes on each host. The effects of this are that accuracy in a distributed setting will be exactly the same as accuracy in a single host setting because no redundant computation will occur. In practice, however, RNG on each host (i.e., dropout) will cause distributed execution to differ from single host execution. Turning off all RNG will make it so the exact same computation occurs (tradeoff is that dropout isn't done, so overfitting to the train set may occur). --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +++- libgnn/src/layers/SoftmaxLayer.cpp | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d070afa1ef..9c4379dbcc 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -89,6 +89,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { assert(layer_phase_ == GNNPhase::kTrain); + // derivative of activation if (config_.do_activation) { ActivationDerivative(input_gradient); @@ -153,7 +154,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already // TODO figure out how to do this with GPUs - WeightGradientSyncAverage(); + // WeightGradientSyncAverage(); + WeightGradientSyncSum(); if (config_.do_dropout && layer_number_ != 0) { DoDropoutDerivative(); diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index a4d5133caa..562349780b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -10,7 +10,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax @@ -60,7 +60,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { // create ground truth vector for this LID From 98509ebd21d949e717fd98901cc324ad39ba55aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 18 Dec 2020 19:12:02 -0600 Subject: [PATCH 445/660] Old code fixing: zero out output matrices Old GNN code was problematic as it did not zero out output matrices, meaning garbage was introduced into training step (and ironically improved accuracy). This has been fixed. Also left a comment in adam optimizer noting its incorrect use. 
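The failure mode is the usual one for reused accumulation buffers: if the output matrix still holds values from the previous training step, the accumulating writes pile new contributions on top of stale ones. A small standalone sketch of the pattern being fixed (illustrative only, not the library code):

#include <algorithm>
#include <cstddef>
#include <vector>

// A buffer reused across training steps must be cleared before each
// accumulation pass; otherwise values from the previous step leak
// into the current one.
void AccumulateInto(const std::vector<float>& contributions,
                    std::vector<float>* out) {
  // the added zeroing step
  std::fill(out->begin(), out->end(), 0.0f);
  for (size_t i = 0; i < out->size() && i < contributions.size(); ++i) {
    (*out)[i] += contributions[i];
  }
}
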
--- libdeepgalois/include/deepgalois/optimizer.h | 2 +- libdeepgalois/src/layers/softmax_loss_layer.cpp | 14 ++++++++++++++ libdeepgalois/src/optimizer.cpp | 2 ++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index ceb0f93ba0..f5eb4b54ec 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -49,7 +49,7 @@ struct stateful_optimizer : public optimizer { vec_t& get(const vec_t& key) { static_assert(Index < N, "index out of range"); if (E_[Index][&key].empty()) - E_[Index][&key].resize(key.size(), float_t()); + E_[Index][&key].resize(key.size(), float_t(0)); return E_[Index][&key]; } std::unordered_map E_[N]; diff --git a/libdeepgalois/src/layers/softmax_loss_layer.cpp b/libdeepgalois/src/layers/softmax_loss_layer.cpp index 3581365427..17e7023176 100644 --- a/libdeepgalois/src/layers/softmax_loss_layer.cpp +++ b/libdeepgalois/src/layers/softmax_loss_layer.cpp @@ -29,6 +29,13 @@ void softmax_loss_layer::forward_propagation(const float_t* in_data, float_t* out_data) { // size_t numSamples = input_dims; size_t featLen = input_dims[1]; + // zero out the output vector + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + out_data[i * featLen + j] = 0.0; + } + } + galois::do_all( galois::iterate(begin_, end_), [&](const unsigned gid) { @@ -61,6 +68,13 @@ void softmax_loss_layer::back_propagation(const float_t* in_data, float_t* in_grad) { // note: out_grad is ignored because it shouldn't exist (this is output layer) size_t featLen = layer::input_dims[1]; + + for (unsigned i = 0; i < input_dims[0]; i++) { + for (unsigned j = 0; j < featLen; j++) { + in_grad[i * featLen + j] = 0.0; + } + } + galois::do_all( galois::iterate(layer::begin_, layer::end_), [&](const auto& gid) { diff --git a/libdeepgalois/src/optimizer.cpp b/libdeepgalois/src/optimizer.cpp index e8455e9206..4538d1c956 100644 --- a/libdeepgalois/src/optimizer.cpp +++ b/libdeepgalois/src/optimizer.cpp @@ -46,6 +46,8 @@ void adam::update(const vec_t& dW, vec_t& W) { }, galois::chunk_size<256>(), galois::steal(), galois::loopname("adam_update")); + // TODO/NOTE: this is incorrect: adam parameters should not be shared + // among layers, but this is making it shared b1_t *= b1; b2_t *= b2; } From f20e47368148e3a7bf1572b1039ee7a9d2045ef3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 Jan 2021 18:45:04 -0600 Subject: [PATCH 446/660] Testing splits for ogbn datasets Graph loading in libgnn requires splits for the training set; this commit adds them for the new ogbn datasets. 
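The values pushed into bps below are begin/end node IDs for the training split. A rough sketch of how such a pair could be turned into a per-node training flag; the helper name and the half-open-range assumption are mine, not the CuSP code:

#include <cstdint>
#include <vector>

// Hypothetical helper: mark every node whose global ID falls in the
// assumed half-open range [begin, end) as part of the training split.
std::vector<char> BuildTrainingMask(uint64_t num_nodes, uint64_t begin,
                                    uint64_t end) {
  std::vector<char> mask(num_nodes, 0);
  for (uint64_t id = begin; id < end && id < num_nodes; ++id) {
    mask[id] = 1;
  }
  return mask;
}
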
--- libcusp/include/galois/graphs/NewGeneric.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 048cfa4bc2..9d6fe7b558 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -100,6 +100,12 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("tester") != std::string::npos) { bps.push_back(0); bps.push_back(5); + } else if (filename.find("ogbn-arxiv") != std::string::npos) { + bps.push_back(0); + bps.push_back(169251); + } else if (filename.find("ogbn-products") != std::string::npos) { + bps.push_back(0); + bps.push_back(196614); } else { // XXX only die under certain conditions // GALOIS_DIE("invalid input for gnn partitioning ", filename, From 4e58928c3648aab90692070e57afc01f626cc495 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 6 Jan 2021 19:07:56 -0600 Subject: [PATCH 447/660] Off by one for ogbn training splits --- libcusp/include/galois/graphs/NewGeneric.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 9d6fe7b558..e8f4fb332d 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -102,10 +102,10 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(5); } else if (filename.find("ogbn-arxiv") != std::string::npos) { bps.push_back(0); - bps.push_back(169251); + bps.push_back(169252); } else if (filename.find("ogbn-products") != std::string::npos) { bps.push_back(0); - bps.push_back(196614); + bps.push_back(196615); } else { // XXX only die under certain conditions // GALOIS_DIE("invalid input for gnn partitioning ", filename, From 2ae60a61c6b29e505fde6087a401f1b349e36dae Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 8 Jan 2021 19:05:47 -0600 Subject: [PATCH 448/660] multilabel reading test Adds a multilabel reading test as well as a function to GNNGraph to grab multi-class labels (returns a pointer). Also makes a reading change: for multi-class files the labels files should be "mlabels" rather than just labels. --- libgnn/include/galois/graphs/GNNGraph.h | 8 ++ libgnn/src/graphs/GNNGraph.cpp | 8 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/multilabel-read.cpp | 142 ++++++++++++++++++++++++ 4 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 libgnn/test/multilabel-read.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 04debc019f..c06de18182 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -100,6 +100,14 @@ class GNNGraph { return local_ground_truth_labels_[lid]; } + //! Returns pointer to start of ground truth vector for some local id assuming + //! labels are multi-class. + const GNNLabel* GetMultiClassLabel(const unsigned lid) const { + assert(!using_single_class_labels_); + return static_cast(local_ground_truth_labels_.data() + + (lid * num_label_classes_)); + } + //! 
Return matrix of the local node features const PointerWithSize GetLocalFeatures() { #ifndef GALOIS_ENABLE_GPU diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 059759a81e..df73a1cd61 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -132,7 +132,13 @@ void galois::graphs::GNNGraph::AggregateSync( void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename = input_directory_ + dataset_name + "-labels.txt"; + std::string filename; + if (has_single_class_label) { + filename = input_directory_ + dataset_name + "-labels.txt"; + } else { + filename = input_directory_ + dataset_name + "-mlabels.txt"; + } + // read file header, save num label classes while at it std::ifstream file_stream; file_stream.open(filename, std::ios::in); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index c900c7318c..8385f0b177 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -38,6 +38,9 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) + + add_executable(multilabel-read multilabel-read.cpp) + target_link_libraries(multilabel-read galois_gnn) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/multilabel-read.cpp b/libgnn/test/multilabel-read.cpp new file mode 100644 index 0000000000..83debfa2bc --- /dev/null +++ b/libgnn/test/multilabel-read.cpp @@ -0,0 +1,142 @@ +//! @file multilabel-read +//! Make sure multilabels read are sane + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + // load test graph; false at end = multilabel + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + const galois::GNNLabel* labels = test_graph.GetMultiClassLabel(0); + + unsigned i = 0; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 1; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 2; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 3; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 6]); + + i = 4; + GALOIS_LOG_ASSERT(0 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 
7 + 3]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + i = 5; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + i = 6; + GALOIS_LOG_ASSERT(1 == labels[i * 7]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 1]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 2]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 3]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 4]); + GALOIS_LOG_ASSERT(0 == labels[i * 7 + 5]); + GALOIS_LOG_ASSERT(1 == labels[i * 7 + 6]); + + labels = test_graph.GetMultiClassLabel(0); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(1); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(2); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(1 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(3); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(1 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(0 == labels[6]); + + labels = test_graph.GetMultiClassLabel(4); + GALOIS_LOG_ASSERT(0 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(1 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + labels = test_graph.GetMultiClassLabel(5); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(0 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(1 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + labels = test_graph.GetMultiClassLabel(6); + GALOIS_LOG_ASSERT(1 == labels[0]); + GALOIS_LOG_ASSERT(1 == labels[1]); + GALOIS_LOG_ASSERT(0 == labels[2]); + GALOIS_LOG_ASSERT(0 == labels[3]); + GALOIS_LOG_ASSERT(0 == labels[4]); + GALOIS_LOG_ASSERT(0 == labels[5]); + GALOIS_LOG_ASSERT(1 == labels[6]); + + return 0; +} From 3398c5f4dd57c0048f0370dd6b649c68b9eeb7d9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 11 Jan 2021 15:41:16 -0600 Subject: [PATCH 449/660] F1 scoring for multi-class labels 1) Moved accuracy functions from GNN to GNNGraph which contains the labels necessary to figure out accuracy. 2) Fixed an issue with an older test w.r.t. multi-class reading. 3) Added a new F1 scoring test. 4) New F1 scoring function added (MultiClass accuracy) which returns micro F1 score. 
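Micro F1 pools true/false positives and negatives across all label classes before computing a single precision/recall pair; a prediction counts as positive for a class when its output crosses the 0.5 threshold used in the per-class loop below. A small self-contained sketch of the final calculation from pooled counts (not the GNNGraph member function itself):

#include <cstddef>

// Micro F1: aggregate counts over every class first, then take the
// harmonic mean of the resulting precision and recall.
double MicroF1(size_t true_pos, size_t false_pos, size_t false_neg) {
  double precision =
      (true_pos + false_pos) > 0
          ? static_cast<double>(true_pos) / (true_pos + false_pos)
          : 0.0;
  double recall =
      (true_pos + false_neg) > 0
          ? static_cast<double>(true_pos) / (true_pos + false_neg)
          : 0.0;
  return (precision + recall) > 0
             ? 2.0 * precision * recall / (precision + recall)
             : 0.0;
}
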
--- libgnn/include/galois/GraphNeuralNetwork.h | 9 +- libgnn/include/galois/graphs/GNNGraph.h | 23 ++- libgnn/src/GraphNeuralNetwork.cpp | 56 +------- libgnn/src/graphs/GNNGraph.cpp | 155 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 5 + libgnn/test/f1-test.cpp | 51 +++++++ libgnn/test/gnngraph-test.cpp | 10 +- 7 files changed, 241 insertions(+), 68 deletions(-) create mode 100644 libgnn/test/f1-test.cpp diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 652b1cbfad..d4681746f3 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -146,10 +146,10 @@ class GraphNeuralNetwork { //! @returns Output layer's output const PointerWithSize DoInference(); + //! Returns classification accuracy for single class label or micro F1 score + //! for multi-class predictions; this calls into GNNGraph's accuracy call float GetGlobalAccuracy(const PointerWithSize predictions); - float GetGlobalAccuracyCPU(const PointerWithSize predictions); - //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! most literature @@ -166,10 +166,7 @@ class GraphNeuralNetwork { std::vector> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; - //! Used to track accurate predictions during accuracy calculation - DGAccumulator num_correct_; - //! Used to count total number of things checked during accuracy calculation - DGAccumulator total_checked_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c06de18182..cfed56aade 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -55,7 +55,9 @@ class GNNGraph { size_t node_feature_length() const { return node_feature_length_; } //! Return the number of label classes (i.e. number of possible outputs) - size_t GetNumLabelClasses() const { return num_label_classes_; }; + size_t GetNumLabelClasses() const { return num_label_classes_; } + + bool is_single_class_label() const { return using_single_class_labels_; } ////////////////////////////////////////////////////////////////////////////// // Graph accessors @@ -93,6 +95,9 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + float GetGlobalAccuracy(PointerWithSize predictions, + GNNPhase phase); + //! Returns the ground truth label of some local id assuming labels are single //! class labels. GNNFloat GetSingleClassLabel(const unsigned lid) const { @@ -139,6 +144,13 @@ class GNNGraph { const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif private: + float GetGlobalAccuracyCPU(PointerWithSize predictions, + GNNPhase phase); + float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, + GNNPhase phase); + float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, + GNNPhase phase); + //! Directory for input data const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph @@ -222,6 +234,15 @@ class GNNGraph { //! memory and copies things over void InitGPUMemory(); #endif + //! Used to track accurate predictions during accuracy calculation + DGAccumulator num_correct_; + //! 
Used to count total number of things checked during accuracy calculation + DGAccumulator total_checked_; + // Below are used for multi-class accuracy + DGAccumulator local_true_positive_; + DGAccumulator local_true_negative_; + DGAccumulator local_false_positive_; + DGAccumulator local_false_negative_; }; } // namespace graphs diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ebe486b47a..afd46f1bcb 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -67,8 +67,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); float train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Train accuracy is ", train_accuracy, - "\n"); + galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", + train_accuracy, "\n"); } // TODO validation and test as necessary } @@ -100,56 +100,8 @@ galois::GraphNeuralNetwork::DoInference() { } float galois::GraphNeuralNetwork::GetGlobalAccuracy( - const PointerWithSize predictions) { - // TODO mark as a forwarding argument? - //#ifndef GALOIS_ENABLE_GPU - return GetGlobalAccuracyCPU(predictions); - //#else - // return gpu_object_.GetGlobalAccuracyGPU(graph_->GetGPUGraph(), phase_, - // predictions); - //#endif -} - -float galois::GraphNeuralNetwork::GetGlobalAccuracyCPU( - const PointerWithSize predictions) { - // check owned nodes' accuracy - size_t num_labels = graph_->GetNumLabelClasses(); - assert((graph_->GetNumLabelClasses() * graph_->size()) == predictions.size()); - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - galois::iterate(graph_->begin_owned(), graph_->end_owned()), - [&](const unsigned lid) { - if (graph_->IsValidForPhase(lid, phase_)) { - total_checked_ += 1; - // get prediction by getting max - size_t predicted_label = - galois::MaxIndex(num_labels, &(predictions[lid * num_labels])); - // GALOIS_LOG_VERBOSE("Checking LID {} with label {} against - // prediction {}", - // lid, graph_->GetSingleClassLabel(lid), - // predicted_label); - // check against ground truth and track accordingly - // TODO static cast used here is dangerous - if (predicted_label == - static_cast(graph_->GetSingleClassLabel(lid))) { - num_correct_ += 1; - } - } - }, - // TODO chunk size? 
- // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); - // TODO revise for later when multi-class labels come in - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); + PointerWithSize predictions) { + return graph_->GetGlobalAccuracy(predictions, phase_); } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index df73a1cd61..2753e07f3d 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -2,6 +2,7 @@ #include "galois/Logging.h" #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" +#include "galois/GNNMath.h" #include namespace { @@ -378,6 +379,160 @@ void galois::graphs::GNNGraph::InitNormFactor() { galois::loopname("InitNormFactor")); } +float galois::graphs::GNNGraph::GetGlobalAccuracy( + PointerWithSize predictions, GNNPhase phase) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( + PointerWithSize predictions, GNNPhase phase) { + if (is_single_class_label()) { + return GetGlobalAccuracyCPUSingle(predictions, phase); + } else { + return GetGlobalAccuracyCPUMulti(predictions, phase); + } +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( + PointerWithSize predictions, GNNPhase phase) { + // check owned nodes' accuracy + assert((num_label_classes_ * size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + total_checked_ += 1; + // get prediction by getting max + size_t predicted_label = galois::MaxIndex( + num_label_classes_, &(predictions[lid * num_label_classes_])); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(GetSingleClassLabel(lid))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + + return static_cast(global_correct) / + static_cast(global_checked); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( + PointerWithSize predictions, GNNPhase phase) { + + const GNNLabel* full_ground_truth = GetMultiClassLabel(0); + assert(predictions.size() == (num_label_classes_ * size())); + + size_t global_true_positive = 0; + size_t global_true_negative = 0; + size_t global_false_positive = 0; + size_t global_false_negative = 0; + size_t global_f1_score = 0; + + // per class check + for (size_t label_class = 0; label_class < num_label_classes_; + label_class++) { + local_true_positive_.reset(); + local_true_negative_.reset(); + local_false_positive_.reset(); + local_false_negative_.reset(); + + // loop through all *owned* nodes (do not want to overcount) + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + size_t label_index = lid * num_label_classes_ + label_class; + 
GNNLabel true_label = full_ground_truth[label_index]; + GNNLabel prediction_is_positive = + (predictions[label_index] > 0.5) ? 1 : 0; + + if (true_label && prediction_is_positive) { + local_true_positive_ += 1; + } else if (true_label && !prediction_is_positive) { + local_false_negative_ += 1; + } else if (!true_label && prediction_is_positive) { + local_false_positive_ += 1; + } else if (!true_label && !prediction_is_positive) { + local_true_negative_ += 1; + } else { + // all cases should be covered with clauses above, so it should + // NEVER get here; adding it here just for sanity purposes + GALOIS_LOG_FATAL( + "Logic error with true label and prediction label"); + } + } + total_checked_ += 1; + }, + galois::steal(), galois::loopname("GlobalMultiAccuracy")); + + // reduce from accumulators across all hosts for this particular class + size_t class_true_positives = local_true_positive_.reduce(); + size_t class_false_positives = local_false_positive_.reduce(); + size_t class_true_negatives = local_true_negative_.reduce(); + size_t class_false_negatives = local_false_negative_.reduce(); + + // add to global counts + global_true_positive += class_true_positives; + global_false_positive += class_false_positives; + global_true_negative += class_true_negatives; + global_false_negative += class_false_negatives; + + // calculate precision, recall, and f1 score for this class + // ternery op used to avoid division by 0 + double class_precision = + (class_true_positives + class_true_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_positives) + : 0.0; + double class_recall = + (class_true_positives + class_false_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_negatives) + : 0.0; + double class_f1_score = (class_precision + class_recall) > 0 + ? (2.0 * (class_precision * class_recall)) / + (class_precision + class_recall) + : 0.0; + + global_f1_score += class_f1_score; + } // end label class loop + + // double global_f1_macro_score = global_f1_score / num_label_classes_; + + // micro = considers all classes for precision/recall + double global_micro_precision = + (global_true_positive + global_true_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_positive) + : 0.0; + double global_micro_recall = + (global_true_positive + global_false_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_negative) + : 0.0; + + double global_f1_micro_score = + (global_micro_precision + global_micro_recall) > 0 + ? 
(2.0 * (global_micro_precision * global_micro_recall)) / + (global_micro_precision + global_micro_recall) + : 0.0; + + return global_f1_micro_score; +} + #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // create int casted CSR diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 8385f0b177..820bd03019 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -41,6 +41,11 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) + add_test(NAME multilabel-read COMMAND multilabel-read) + + add_executable(f1-test f1-test.cpp) + target_link_libraries(f1-test galois_gnn) + add_test(NAME f1-test COMMAND f1-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/f1-test.cpp b/libgnn/test/f1-test.cpp new file mode 100644 index 0000000000..64935bc235 --- /dev/null +++ b/libgnn/test/f1-test.cpp @@ -0,0 +1,51 @@ +//! @file f1-test +//! Tests f1 micro accuracy for multiclass labels + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + // load test graph; false at end = multilabel + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + // perfect precision and recall + std::vector prediction = { + 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, + 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1}; + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction, galois::GNNPhase::kTrain)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT( + 1.0 == test_graph.GetGlobalAccuracy(prediction, galois::GNNPhase::kTest)); + + // perfect recall, but training precision is bad + std::vector prediction2 = { + 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1}; + + // just print here and check with eyes: checking float equivalance is a pain + // both prints should be .6666666 + GALOIS_LOG_DEBUG( + "{} {}", + test_graph.GetGlobalAccuracy(prediction2, galois::GNNPhase::kTrain), + (2 * (15.0 / 30.0)) / ((15.0 / 30.0) + 1)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction2, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT(1.0 == test_graph.GetGlobalAccuracy( + prediction2, galois::GNNPhase::kTest)); + + // no predictions made + std::vector prediction3(49, 0); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kTrain)); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kValidate)); + GALOIS_LOG_ASSERT(0.0 == test_graph.GetGlobalAccuracy( + prediction3, galois::GNNPhase::kTest)); + + return 0; +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 7db24081f5..5aa4d72ddf 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -14,21 +14,13 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); - GALOIS_LOG_VERBOSE("reddit with multilabel, oec"); - galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC, - false); + // multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); galois::graphs::GNNGraph("reddit", 
galois::graphs::GNNPartitionScheme::kOEC, true); - GALOIS_LOG_VERBOSE("reddit with multilabel, cvc"); - galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, - false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC, true); - // TODO fix citeseer and goec - // galois::graphs::GNNGraph("citeseer", - // galois::graphs::GNNPartitionScheme::kOEC, false); return 0; } From 4025be2c2e442afb1c09441a3e8369b64078a085 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 12 Jan 2021 13:22:26 -0600 Subject: [PATCH 450/660] GNN sigmoid output layer - Adds the sigmoid layer as a possible output layer for use. - Templatizes cross entropy call because truth vector doens't necessarily need to be a float. - Removed definition of constructors from GNN class and let default constructors do their thing. - Added a sanity test for multi-label epochs. - Added yelp/amazon ranges to CuSP Needs to be added as an option to the GCN app (next commit). --- libcusp/include/galois/graphs/NewGeneric.h | 13 +++- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNMath.h | 11 ++- libgnn/include/galois/GraphNeuralNetwork.h | 7 -- libgnn/include/galois/layers/GNNLayer.h | 3 +- libgnn/include/galois/layers/SigmoidLayer.h | 52 +++++++++++++ libgnn/src/GNNMath.cpp | 9 --- libgnn/src/GraphNeuralNetwork.cpp | 14 ++++ libgnn/src/layers/SigmoidLayer.cpp | 86 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 8 +- libgnn/test/multilabel-epoch-test.cpp | 59 ++++++++++++++ 11 files changed, 239 insertions(+), 24 deletions(-) create mode 100644 libgnn/include/galois/layers/SigmoidLayer.h create mode 100644 libgnn/src/layers/SigmoidLayer.cpp create mode 100644 libgnn/test/multilabel-epoch-test.cpp diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index e8f4fb332d..0c3e4b31d4 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -81,6 +81,7 @@ class NewDistGraphGeneric : public DistGraph { // performance critical std::vector bps; + // TODO(loc) avoid this entirely and load it from file... 
// if through all possible GNN outputs if (filename.find("cora") != std::string::npos) { bps.push_back(0); @@ -106,12 +107,20 @@ class NewDistGraphGeneric : public DistGraph { } else if (filename.find("ogbn-products") != std::string::npos) { bps.push_back(0); bps.push_back(196615); + } else if (filename.find("yelp") != std::string::npos) { + // this is entire graph: yelp's mask isn't contiguous + bps.push_back(0); + bps.push_back(716847); + } else if (filename.find("amazon") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(1569960); } else { - // XXX only die under certain conditions + // TODO(loc) only die under certain conditions; don't die if something + // is missing // GALOIS_DIE("invalid input for gnn partitioning ", filename, // " hardcode needed"); } - // TODO hardcode the rest return bps; } diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index baee47c3fb..320189c44e 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,6 +6,7 @@ set(sources src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp + src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp ) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 488b538d75..f8edd9650f 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -28,10 +28,17 @@ void GNNSoftmaxDerivative(const size_t vector_length, galois::GNNFloat GNNCrossEntropy(const size_t vector_length, const GNNFloat* ground_truth, const GNNFloat* input); + //! Derivative of cross entropy; gradients saved into an output vector. +template void GNNCrossEntropyDerivative(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input, GNNFloat* gradients); + const TruthType* ground_truth, + const GNNFloat* input, GNNFloat* gradients) { + for (size_t i = 0; i < vector_length; i++) { + gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); + } +} + //! Calls into a library BLAS call to do matrix muliply; uses default alpha/beta void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index d4681746f3..51142b9b38 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -22,13 +22,6 @@ namespace galois { //! determine how the network gets constructed. class GraphNeuralNetworkConfig { public: - // default move, no copy - GraphNeuralNetworkConfig() = delete; - GraphNeuralNetworkConfig(const GraphNeuralNetworkConfig&) = delete; - GraphNeuralNetworkConfig& operator=(const GraphNeuralNetworkConfig&) = delete; - GraphNeuralNetworkConfig(GraphNeuralNetworkConfig&&) = default; - GraphNeuralNetworkConfig& operator=(GraphNeuralNetworkConfig&&) = default; - //! Construction without a config for layers specified; uses a default GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3296b17d20..93498a6497 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -20,9 +20,8 @@ enum class GNNLayerType { // TODO SAGE and GAT }; -// TODO Sigmoid //! 
Supported output layer types in the GNN -enum class GNNOutputLayerType { kInvalid, kSoftmax }; +enum class GNNOutputLayerType { kInvalid, kSoftmax, kSigmoid }; //! Struct holding the dimensions of a layer. Assumption is that a layer takes //! a matrix and outputs another matrix with a different # of columns (e.g. diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h new file mode 100644 index 0000000000..44c215909d --- /dev/null +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -0,0 +1,52 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +// TODO(loc) GPU support + +namespace galois { + +//! Sigmoid layer: applies sigmoid function element wise to each element of the +//! input. +//! Meant for use with *multi-class* labels. +class SigmoidLayer : public GNNLayer { +public: + SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : GNNLayer(layer_num, graph, dimensions, + GNNLayerConfig{.allocate_weights = false}), + // input_loss_(dimensions.input_rows), + norm_gradient_vectors_(dimensions.input_columns) { + output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; + // input/output columns must be equivalent + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + // output needs to match number of possible classes + GALOIS_LOG_ASSERT(dimensions.input_columns == graph.GetNumLabelClasses()); + } + + //! Normalizes all elements by applying sigmoid to all of them + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + //! Get gradients to fix distribution such that it leans more towards + //! multiclass ground truth. + PointerWithSize + BackwardPhase(const PointerWithSize, + PointerWithSize*) final; + +private: + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); + + PointerWithSize BackwardPhaseCPU(); + + //! Loss for each row of the input; unused for now because loss doesn't + //! need to be calculated for correctness + // std::vector input_loss_; + + //! Each thread gets storage to allocate the gradients during backward + //! 
prop; each is the size of a feature vector + galois::substrate::PerThreadStorage> + norm_gradient_vectors_; +}; + +} // namespace galois diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 0d065d6bcc..294dc9f7be 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -107,15 +107,6 @@ galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, return loss; } -void galois::GNNCrossEntropyDerivative(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input, - GNNFloat* gradients) { - for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); - } -} - void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index afd46f1bcb..ce89b4b780 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -2,6 +2,7 @@ #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SigmoidLayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -54,9 +55,22 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( config_.num_intermediate_layers(), *graph_, output_dims))); break; + case (GNNOutputLayerType::kSigmoid): + gnn_layers_.push_back(std::move(std::make_unique( + config_.num_intermediate_layers(), *graph_, output_dims))); + break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + + // sanity checking multi-class + output layer + if (!graph_->is_single_class_label() && + (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { + GALOIS_LOG_WARN( + "Using a non-sigmoid output layer with a multi-class label!"); + // if debug mode just kill program + assert(false); + } } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp new file mode 100644 index 0000000000..8db6b8e0cc --- /dev/null +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -0,0 +1,86 @@ +#include "galois/layers/SigmoidLayer.h" +#include "galois/GNNMath.h" +#include + +// TODO(loc) GPU support + +const galois::PointerWithSize +galois::SigmoidLayer::ForwardPhaseCPU( + const galois::PointerWithSize input_embeddings) { + // loss is ignored for now anyways + // input_loss_.assign(input_loss_.size(), 0.0); + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + const size_t feature_length = layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned local_node) { + if (graph_.IsValidForPhase(local_node, layer_phase_)) { + size_t node_offset = feature_length * local_node; + // sigmoid the values for this node + for (unsigned index = 0; index < feature_length; index++) { + forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } + // TODO(loc) calculate loss (it's not even being used/not required + // for correctness so I'm ignoring it for now) + } + }, + galois::steal(), galois::loopname("SigmoidForward")); + + return forward_output_matrix_; +} + +const galois::PointerWithSize +galois::SigmoidLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) 
when GPU needs it + return 0; +#else + return ForwardPhaseCPU(input_embeddings); +#endif +} + +galois::PointerWithSize +galois::SigmoidLayer::BackwardPhaseCPU() { + const size_t feature_length = layer_dimensions_.input_columns; + backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned local_node) { + if (graph_.IsValidForPhase(local_node, layer_phase_)) { + // derivative cross entropy into norm grad + const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); + size_t node_offset = feature_length * local_node; + std::vector* norm_gradient = + norm_gradient_vectors_.getLocal(); + GNNCrossEntropyDerivative(feature_length, ground_truth, + &(forward_output_matrix_[node_offset]), + norm_gradient->data()); + + // sigmoid derivative + for (unsigned index = 0; index < feature_length; index++) { + backward_output_matrix_[node_offset + index] = + (*norm_gradient)[index] * + forward_output_matrix_[node_offset + index] * + (1.0 - forward_output_matrix_[node_offset + index]); + } + } + }, + galois::steal(), galois::loopname("SigmoidBackward")); + + return backward_output_matrix_; +} + +galois::PointerWithSize +galois::SigmoidLayer::BackwardPhase(const PointerWithSize, + PointerWithSize*) { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + return 0; +#else + return BackwardPhaseCPU(); +#endif +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 820bd03019..6bf0ac6bd8 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -30,12 +30,16 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) - + + add_executable(multilabel-epoch-test multilabel-epoch-test.cpp) + target_link_libraries(multilabel-epoch-test galois_gnn) + add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) + # TODO figure out how to make this test run in parallel add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) - + add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp new file mode 100644 index 0000000000..3fb96f8c81 --- /dev/null +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -0,0 +1,59 @@ +//! @file multilabel-epoch-test.cpp +//! 
Run 100 epochs of multilabel dataset + +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + // size_t num_threads = galois::setActiveThreads(1); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load graph + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + std::vector layer_types = { + galois::GNNLayerType::kGraphConvolutional, + galois::GNNLayerType::kGraphConvolutional}; + std::vector layer_output_sizes = { + 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; + galois::GNNLayerConfig layer_config; + layer_config.do_dropout = true; + layer_config.do_activation = false; + layer_config.do_normalization = true; + // XXX Activation kills accuracy compared to old code, esp. for cora + galois::GraphNeuralNetworkConfig gnn_config( + 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSigmoid, + layer_config); + + std::vector adam_sizes = {16 * test_graph->node_feature_length(), + 16 * test_graph->GetNumLabelClasses()}; + auto adam = std::make_unique(adam_sizes, 2); + + auto gnn = std::make_unique( + std::move(test_graph), std::move(adam), std::move(gnn_config)); + + ////////////////////////////////////////////////////////////////////////////// + + // no verification; test should be eyeballed to make sure accuracy is + // increasing + galois::StatTimer main_timer("Timer_0"); + main_timer.start(); + for (size_t epoch = 0; epoch < 100; epoch++) { + galois::PointerWithSize predictions = gnn->DoInference(); + gnn->GradientPropagation(); + galois::gPrint("Epoch ", epoch, ": Accuracy is ", + gnn->GetGlobalAccuracy(predictions), "\n"); + } + + // check test accuracy + gnn->SetLayerPhases(galois::GNNPhase::kTest); + galois::PointerWithSize predictions = gnn->DoInference(); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), + "\n"); + main_timer.stop(); +} From 50537bc4ad641cae6536f3ba7c99220660ff0e1e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 12 Jan 2021 13:52:06 -0600 Subject: [PATCH 451/660] Sigmoid and multiclass option added to gnnbench Users can now specify multiclass and Sigmoid output layer in gnn applications. --- lonestar/libgnnbench/include/GNNBench/Input.h | 5 +++- lonestar/libgnnbench/src/Input.cpp | 23 +++++++++++++++---- 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index fc5059bb0c..737887a756 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -21,7 +21,10 @@ extern llvm::cl::opt do_activation; // TODO activation layer type once more are supported //! Controls weight normalization based on degree extern llvm::cl::opt do_normalization; -// TODO output layer type +//! Output layer type +extern llvm::cl::opt output_layer_type; +//! If true, use multiclass ground truth +extern llvm::cl::opt multiclass_labels; // TODO optimizer type //! Toggles an optimization that flips aggregate/update step if it would be //! 
beneficial diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 97ef7a6fc3..d1dbb5bba3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -59,6 +59,20 @@ llvm::cl::opt "features based on their degree"), cll::init(true)); +llvm::cl::opt output_layer_type( + "outputLayer", cll::desc("Type of output layer"), + cll::values(clEnumValN(galois::GNNOutputLayerType::kSoftmax, "softmax", + "Softmax (default)"), + clEnumValN(galois::GNNOutputLayerType::kSigmoid, "sigmoid", + "Sigmoid")), + cll::init(galois::GNNOutputLayerType::kSoftmax)); + +llvm::cl::opt + multiclass_labels("multiclassLabels", + cll::desc("If true (off by default), use multi-class " + "ground truth; required for some inputs"), + cll::init(false)); + llvm::cl::opt agg_after_update("allowAggregationAfterUpdate", cll::desc("If true (on by default), allows aggregate to " @@ -161,7 +175,7 @@ std::unique_ptr InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // partition/load graph auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, true); + input_directory, input_name, partition_scheme, !multiclass_labels); // create layer types vector std::vector layer_types; @@ -174,10 +188,9 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // layer config object galois::GNNLayerConfig layer_config = CreateLayerConfig(); // GNN config object - // TODO output type should be configurable - galois::GraphNeuralNetworkConfig gnn_config( - num_layers, layer_types, layer_sizes_vector, - galois::GNNOutputLayerType::kSoftmax, layer_config); + galois::GraphNeuralNetworkConfig gnn_config(num_layers, layer_types, + layer_sizes_vector, + output_layer_type, layer_config); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 3ffe9ed26a2d7a91e0e80ea78263bffd271f96a4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 19 Jan 2021 16:51:28 -0600 Subject: [PATCH 452/660] Fixing cross-entropy loss; no ReLU for last layer Cross-entropy loss was not punishing false positives before. This has been fixed. Also, ReLU has been turned off for the last layer whenever activation is used as this seems to completely destroy performance. --- libgnn/include/galois/GNNMath.h | 44 +++++++++++++++++++-- libgnn/include/galois/layers/GNNLayer.h | 2 + libgnn/include/galois/layers/SigmoidLayer.h | 9 ++--- libgnn/src/GNNMath.cpp | 25 ------------ libgnn/src/GraphNeuralNetwork.cpp | 4 ++ libgnn/src/layers/SigmoidLayer.cpp | 17 ++++++-- 6 files changed, 63 insertions(+), 38 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index f8edd9650f..231d437836 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -1,7 +1,9 @@ #pragma once +#include "galois/Logging.h" #include "galois/GNNTypes.h" #include +#include namespace galois { @@ -25,9 +27,35 @@ void GNNSoftmaxDerivative(const size_t vector_length, GNNFloat* temp_vector, GNNFloat* output); //! Performs cross entropy given a ground truth and input and returns the loss //! value. +template galois::GNNFloat GNNCrossEntropy(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input); + const TruthType* ground_truth, + const GNNFloat* input) { + GNNFloat loss = 0.0; + + // Note that this function works if there are multiple non-zeros in the + // ground truth vector + // If there is only 1 then this function is overkill and it should break + // early (i.e. 
single class) + // Multiclass = fine + for (size_t i = 0; i < vector_length; i++) { + if (ground_truth[i] == 0.0) { + if (input[i] == 1.0) { + loss -= std::log(static_cast(1e-10)); + } else { + loss -= std::log(1 - input[i]); + } + } else { + if (input[i] == 0.0) { + loss -= std::log(static_cast(1e-10)); + } else { + loss -= std::log(input[i]); + } + } + } + + return loss; +} //! Derivative of cross entropy; gradients saved into an output vector. template @@ -35,7 +63,17 @@ void GNNCrossEntropyDerivative(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input, GNNFloat* gradients) { for (size_t i = 0; i < vector_length; i++) { - gradients[i] = -(ground_truth[i]) / (input[i] + static_cast(1e-10)); + // TODO(loc) assumption: binary classifier, make explicit in function name + if (ground_truth[i]) { + gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); + } else { + if (input[i] == 1.0) { + // opposite + gradients[i] = 1.0 / static_cast(1e-10); + } else { + gradients[i] = 1.0 / (1.0 - input[i]); + } + } } } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 93498a6497..f8d8cd8d8a 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -75,6 +75,8 @@ class GNNLayer { //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } + void DisableActivation() { config_.do_activation = false; } + //! Initializes all layer weights to 1. This is used as a debug function for //! testing. void InitAllWeightsTo1() { diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 44c215909d..7efe8cd9db 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -14,7 +14,7 @@ class SigmoidLayer : public GNNLayer { const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig{.allocate_weights = false}), - // input_loss_(dimensions.input_rows), + input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; // input/output columns must be equivalent @@ -36,13 +36,10 @@ class SigmoidLayer : public GNNLayer { private: const PointerWithSize ForwardPhaseCPU(const PointerWithSize input_embeddings); - PointerWithSize BackwardPhaseCPU(); - //! Loss for each row of the input; unused for now because loss doesn't - //! need to be calculated for correctness - // std::vector input_loss_; - + //! Loss for each row of the input + std::vector input_loss_; //! Each thread gets storage to allocate the gradients during backward //! 
prop; each is the size of a feature vector galois::substrate::PerThreadStorage> diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 294dc9f7be..dcaaf31a42 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -1,4 +1,3 @@ -#include #include #include #include "galois/GNNMath.h" @@ -83,30 +82,6 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, } } -galois::GNNFloat galois::GNNCrossEntropy(const size_t vector_length, - const GNNFloat* ground_truth, - const GNNFloat* input) { - GNNFloat loss = 0.0; - - // Note that this function works if there are multiple non-zeros in the - // ground truth vector - // If there is only 1 then this function is overkill and it should break - // early - for (size_t i = 0; i < vector_length; i++) { - if (ground_truth[i] == 0.0) { - continue; - } - - if (input[i] == 0.0) { - loss -= ground_truth[i] * std::log(static_cast(1e-10)); - } else { - loss -= ground_truth[i] * std::log(input[i]); - } - } - - return loss; -} - void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ce89b4b780..aae4fbb8a1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -36,6 +36,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, layer_dims, config_.default_layer_config()))); + if (i == config_.num_intermediate_layers() - 1) { + // last layer before output layer should never have activation + gnn_layers_.back()->DisableActivation(); + } break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 8db6b8e0cc..1b6fe9eb05 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -7,10 +7,11 @@ const galois::PointerWithSize galois::SigmoidLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { - // loss is ignored for now anyways - // input_loss_.assign(input_loss_.size(), 0.0); + input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; + galois::GAccumulator total_loss; + total_loss.reset(); galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -21,13 +22,21 @@ galois::SigmoidLayer::ForwardPhaseCPU( for (unsigned index = 0; index < feature_length; index++) { forward_output_matrix_[node_offset + index] = 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + // if (local_node == 0) { + // galois::gPrint(forward_output_matrix_[node_offset + index], + // "\n"); + //} } - // TODO(loc) calculate loss (it's not even being used/not required - // for correctness so I'm ignoring it for now) + + input_loss_[local_node] = GNNCrossEntropy( + feature_length, graph_.GetMultiClassLabel(local_node), + &forward_output_matrix_[node_offset]); + total_loss += input_loss_[local_node]; } }, galois::steal(), galois::loopname("SigmoidForward")); + galois::gPrint("Total loss is ", total_loss.reduce(), "\n"); return forward_output_matrix_; } From 3bf83392a35aa033f1f08099699d4566a4e408f2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 22 Jan 2021 16:04:01 -0600 Subject: [PATCH 453/660] Fixing Sigmoid and GCN derivatives Sigmoid derivative has 
been changed to a numerically stable and simple subtraction calculation. GCN derivative in the case of aggregate -> xform was incorrect: it was multiplying with non-aggregated features. This caused gradient explosion and degrading accuracy as time went on due to the gradients being non-sensical. --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 19 ++++++++++-- libgnn/src/layers/SigmoidLayer.cpp | 30 +++++++++---------- 2 files changed, 30 insertions(+), 19 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 9c4379dbcc..9967a76773 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -112,12 +112,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( &input_column_intermediates_); } // weight gradient calculation - // TODO put this in a function to put the ifdef in there + // TODO(loc) put this in a function to put the ifdef in there #ifndef GALOIS_ENABLE_GPU + // temp 2 holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), input_gradient->data(), + p_in_temp_2_.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #else gpu_object_.GetWeightGradientsGPU( @@ -189,7 +190,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first - // TODO can init to self as well + // TODO(loc) can init to self as well to add to self for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } @@ -225,6 +226,18 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( &aggregate_output[index_to_src_feature]); } } + + // GNNFloat* intermediate = pts->getLocal()->data(); + // GNNFloat norm_scale = source_norm * source_norm; + // for (size_t i = 0; i < column_length; i++) { + // intermediate[i] = + // norm_scale * node_embeddings[index_to_src_feature + i]; + //} + //// add self + // galois::VectorAdd(column_length, + // &aggregate_output[index_to_src_feature], + // intermediate, + // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 1b6fe9eb05..a676383e6f 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -20,17 +20,21 @@ galois::SigmoidLayer::ForwardPhaseCPU( size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { - forward_output_matrix_[node_offset + index] = - 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); - // if (local_node == 0) { - // galois::gPrint(forward_output_matrix_[node_offset + index], - // "\n"); - //} + // splitting in half is done for numerical stability of log + if (input_embeddings[node_offset + index] >= 0) { + forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } else { + forward_output_matrix_[node_offset + index] = + expf(input_embeddings[node_offset + index]) / + (1.0 + expf(input_embeddings[node_offset + index])); + } } input_loss_[local_node] = GNNCrossEntropy( feature_length, graph_.GetMultiClassLabel(local_node), &forward_output_matrix_[node_offset]); + // TODO(loc) normalize the loss 
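          // [Editor's note, not part of the original patch] The sigmoid
          // evaluation above splits on the sign of the input; this is the
          // standard numerically stable form: for x >= 0 it computes
          // 1 / (1 + exp(-x)), and for x < 0 the algebraically equivalent
          // exp(x) / (1 + exp(x)), so expf() is only ever called on a
          // non-positive argument and cannot overflow.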
total_loss += input_loss_[local_node]; } }, @@ -63,18 +67,12 @@ galois::SigmoidLayer::BackwardPhaseCPU() { // derivative cross entropy into norm grad const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); size_t node_offset = feature_length * local_node; - std::vector* norm_gradient = - norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative(feature_length, ground_truth, - &(forward_output_matrix_[node_offset]), - norm_gradient->data()); - - // sigmoid derivative + // sigmoid-cross-entropy derivative: turns out all it is is simple + // subtraction for (unsigned index = 0; index < feature_length; index++) { backward_output_matrix_[node_offset + index] = - (*norm_gradient)[index] * - forward_output_matrix_[node_offset + index] * - (1.0 - forward_output_matrix_[node_offset + index]); + forward_output_matrix_[node_offset + index] - + ground_truth[index]; } } }, From ef58e82d77b8d42fe6b594cbd407a8799d033d32 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 27 Jan 2021 15:24:19 -0600 Subject: [PATCH 454/660] Fixing GPU code/build Fixes backpropagation gradient used in GPU to match fix applied to CPU code. Fixes a CMake var used by the GPU build (no clue how it was building before). Adds dummy uses/returns to SigmoidLayer gpu (which needs to be implemented). --- CMakeLists.txt | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 +- libgnn/src/layers/SigmoidLayer.cpp | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41f318b828..bb72f24c71 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -259,7 +259,7 @@ if (GALOIS_ENABLE_GPU) set(CUDA_PROPAGATE_HOST_FLAGS off) set(CUDA_HOST_COMPILER g++) - string(REPLACE "." "" GENCODES ${CUDA_CAPABILITY}) + string(REPLACE "." "" GENCODES ${GALOIS_CUDA_CAPABILITY}) string(REPLACE "," ";" GENCODES ${GENCODES}) foreach(GENCODE ${GENCODES}) set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS}; --expt-extended-lambda -gencode arch=compute_${GENCODE},code=sm_${GENCODE}) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 9967a76773..ed05ddf4be 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -123,7 +123,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( #else gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, p_in_temp_2_.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index a676383e6f..3ae7492046 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -49,7 +49,8 @@ galois::SigmoidLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it - return 0; + printf("%p\n", input_embeddings.data()); + return p_layer_weights_; #else return ForwardPhaseCPU(input_embeddings); #endif @@ -86,7 +87,7 @@ galois::SigmoidLayer::BackwardPhase(const PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it - return 0; + return p_layer_weights_; #else return BackwardPhaseCPU(); #endif From eca2d0dd75858ed2383e291c9b7fc36b416a9b88 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 28 Jan 2021 16:49:31 -0600 Subject: [PATCH 455/660] Various comments on GCN layer added a few comments to 
better mark exactly what is going on in the GCN forward/backward pass Added a label class print as well --- libgnn/src/graphs/GNNGraph.cpp | 7 ++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 13 +++++++++++++ 2 files changed, 19 insertions(+), 1 deletion(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 2753e07f3d..646f39db8e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -146,6 +146,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, size_t num_nodes; file_stream >> num_nodes >> num_label_classes_ >> std::ws; assert(num_nodes == partitioned_graph_->globalSize()); + galois::gPrint("Number of label classes is ", num_label_classes_, "\n"); // allocate memory for labels if (has_single_class_label) { @@ -454,7 +455,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { - size_t label_index = lid * num_label_classes_ + label_class; + size_t label_index = lid * num_label_classes_ + label_class; + GNNLabel true_label = full_ground_truth[label_index]; GNNLabel prediction_is_positive = (predictions[label_index] > 0.5) ? 1 : 0; @@ -510,6 +512,9 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( global_f1_score += class_f1_score; } // end label class loop + // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, global_true_negative, + // global_false_positive, global_false_negative); + // double global_f1_macro_score = global_f1_score / num_label_classes_; // micro = considers all classes for precision/recall diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index ed05ddf4be..48759a9bfa 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -66,7 +66,9 @@ galois::GraphConvolutionalLayer::ForwardPhase( UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); } else { // update to aggregate + // FW UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); @@ -95,6 +97,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( ActivationDerivative(input_gradient); } + // AFW = O + // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (!config_.allow_aggregate_after_update || @@ -105,11 +109,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( layer_dimensions_.input_rows * layer_dimensions_.output_columns); assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); + // pintemp1 contains (AF)' UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), &input_column_intermediates_); + // TODO if training A, then A' compute here if layer # is 0 + // dot product of edges that exist in A } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there @@ -127,16 +135,21 @@ galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), p_layer_weight_gradients_.data()); #endif } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T // aggregate occurs regardless of layer 
being equal to 0 because it is // required in this case for the weight gradient calculation + // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_); if (layer_number_ != 0) { // derivative for update + // backout = F' UpdateEmbeddingsDerivative(p_out_temp_.data(), p_backward_output_matrix_.data()); } // TODO put this in a function + // W' = F^T (FW)' #ifndef GALOIS_ENABLE_GPU // weight gradient; note the use of the aggregated gradient in out_temp galois::CBlasSGEMM( From 0dc99b2d1ccbac6aa1dd3c7f8368dfbd048b84ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 28 Jan 2021 17:10:59 -0600 Subject: [PATCH 456/660] Add sigmoid test, fix conv test Adds sigmoid test (which doens't do an automated correctness check because floats are pain). Fixes the convolutional layer test after fixing the backward phase a few commits ago. --- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/convlayer-test.cpp | 20 +++++----- libgnn/test/sigmoidlayer-test.cpp | 64 +++++++++++++++++++++++++++++++ libgnn/test/softmaxlayer-test.cpp | 9 +++-- 4 files changed, 83 insertions(+), 14 deletions(-) create mode 100644 libgnn/test/sigmoidlayer-test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 6bf0ac6bd8..18a854ff8f 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -11,6 +11,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) + add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) + target_link_libraries(sigmoidlayer-test galois_gnn) + add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) + add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index ae23fa4f23..58d1d7d581 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -127,11 +127,11 @@ int main() { layer_0->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); layer_0.reset(); @@ -196,11 +196,11 @@ int main() { layer_1->GetLayerWeightGradients(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); layer_1.reset(); diff --git 
a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp new file mode 100644 index 0000000000..333651bdf5 --- /dev/null +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -0,0 +1,64 @@ +//! @file sigmoidlayer-test.cpp +//! Sigmoid layer test with a test graph +//! No automated ground truth checking; when this was written it was compared +//! manually with pytorch +//! TODO add in automated checking eventually; for now this just makes sure it +//! runs + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SigmoidLayer.h" + +int main() { + galois::DistMemSys G; + + galois::setActiveThreads(1); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = test_graph.GetNumLabelClasses(); + dimension_0.output_columns = test_graph.GetNumLabelClasses(); + + GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + + // input to softmax + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[1] = 1; + softmax_input[2] = 100000000000; + softmax_input[3] = 100000000000000000; + softmax_input[4] = -1000; + softmax_input[5] = -10; + softmax_input[6] = 1000000; + softmax_input[8] = 1; + softmax_input[9] = 1; + softmax_input[10] = 1; + softmax_input[16] = 1; + softmax_input[17] = 1; + softmax_input[18] = 1; + softmax_input[24] = 0; + softmax_input[32] = 0; + softmax_input[40] = 0; + softmax_input[48] = 0; + + // train mode + auto output_layer = + std::make_unique(3, test_graph, dimension_0); + output_layer->ForwardPhase(softmax_input); + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + if (i % 7 == 0) { + printf("--------------\n"); + } + printf("%f\n", asdf[i]); + } +} diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 9f15bedfa3..7a6de416dc 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -1,5 +1,9 @@ -//! @file convlayer-test.cpp +//! @file softmaxlayer-test.cpp //! Softmax layer test with a test graph +//! No automated ground truth checking; when this was written it was compared +//! manually with pytorch +//! TODO add in automated checking eventually; for now this just makes sure it +//! runs #include "galois/Logging.h" #include "galois/GNNMath.h" @@ -117,7 +121,4 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } - - // TODO in future maybe: add better test for backward phase besides just - // running it } From 53f6140ba5d6d671ec81d5efccc6cb4156b07368 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 30 Jan 2021 13:08:49 -0600 Subject: [PATCH 457/660] gpuconv layer test fix based on backward phase fix Fixing conv layer test for gpu after backward gcn fix. 
--- libgnn/test/gpu-convlayer-test.cpp | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index f4bb4cf4d3..a79262d706 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -112,11 +112,11 @@ int main() { layer_0->CopyWeightGradientsFromGPU(); // make sure they are sane GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); layer_0.reset(); @@ -183,11 +183,11 @@ int main() { layer_1->CopyWeightGradientsFromGPU(); // make sure they are sane GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 21); - GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); layer_1.reset(); From f0802dfe7fb9d0a2b06551c236f08d4b5b21d5c8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Feb 2021 13:27:13 -0600 Subject: [PATCH 458/660] Configuration hooks for random subgraph sampling Adds configuration hooks for random subgraph sampling via a on/off flag on a node. Renames var to enable/disable agg after update optimization --- libgnn/include/galois/GraphNeuralNetwork.h | 45 +++++++++--- libgnn/include/galois/graphs/GNNGraph.h | 72 +++++++++++++------ libgnn/include/galois/layers/GNNLayer.h | 10 ++- libgnn/src/GraphNeuralNetwork.cpp | 17 +++++ libgnn/src/graphs/GNNGraph.cpp | 10 +++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +- libgnn/test/aggregate-sync-test.cpp | 2 +- libgnn/test/convlayer-test.cpp | 10 +-- libgnn/test/gnnfb-test.cpp | 2 +- libgnn/test/weight-sync-test.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 2 +- lonestar/libgnnbench/src/Input.cpp | 20 +++--- 12 files changed, 143 insertions(+), 53 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 51142b9b38..9aa7d8189e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -23,22 +23,44 @@ namespace galois { class GraphNeuralNetworkConfig { public: //! Construction without a config for layers specified; uses a default + //! also no sampling specified GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type) : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, - output_layer_type, GNNLayerConfig()) {} + output_layer_type, false, GNNLayerConfig()) {} + + //! 
Construction without a config for layers specified + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + bool do_sampling) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, do_sampling, + GNNLayerConfig()) {} + + //! Construction without sampling specified + GraphNeuralNetworkConfig(size_t num_layers, + const std::vector& layer_types, + const std::vector& layer_column_sizes, + GNNOutputLayerType output_layer_type, + const GNNLayerConfig& default_layer_config) + : GraphNeuralNetworkConfig(num_layers, layer_types, layer_column_sizes, + output_layer_type, false, + default_layer_config) {} //! Construction with a specified config for layers GraphNeuralNetworkConfig(size_t num_layers, const std::vector& layer_types, const std::vector& layer_column_sizes, GNNOutputLayerType output_layer_type, + bool do_sampling, const GNNLayerConfig& default_layer_config) : num_intermediate_layers_(num_layers), layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), - output_layer_type_(output_layer_type), + output_layer_type_(output_layer_type), do_sampling_(do_sampling), default_layer_config_(default_layer_config) { // Do sanity checks on inputs // should have a type for each layer @@ -51,25 +73,30 @@ class GraphNeuralNetworkConfig { } //! # layers NOT including output layer - size_t num_intermediate_layers() { return num_intermediate_layers_; } + size_t num_intermediate_layers() const { return num_intermediate_layers_; } //! Get intermediate layer i - GNNLayerType intermediate_layer_type(size_t i) { + GNNLayerType intermediate_layer_type(size_t i) const { assert(i < num_intermediate_layers_); return layer_types_[i]; } //! Get intermediate layer i's size - size_t intermediate_layer_size(size_t i) { + size_t intermediate_layer_size(size_t i) const { assert(i < num_intermediate_layers_); return layer_column_sizes_[i]; } //! Type of output layer - GNNOutputLayerType output_layer_type() { return output_layer_type_; } + GNNOutputLayerType output_layer_type() const { return output_layer_type_; } //! Size of output layer is last element of layer column sizes - size_t output_layer_size() { + size_t output_layer_size() const { return layer_column_sizes_[num_intermediate_layers_]; } + + bool do_sampling() const { return do_sampling_; } + //! Get the default layer config of layers in this GNN - const GNNLayerConfig& default_layer_config() { return default_layer_config_; } + const GNNLayerConfig& default_layer_config() const { + return default_layer_config_; + } private: //! Number of layers to construct in the GNN not including the output @@ -83,6 +110,8 @@ class GraphNeuralNetworkConfig { std::vector layer_column_sizes_; //! Output layer type GNNOutputLayerType output_layer_type_; + //! Graph sampling + bool do_sampling_; //! 
Default config to use for layers GNNLayerConfig default_layer_config_; }; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cfed56aade..4400809940 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -1,6 +1,7 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/PerThreadRNG.h" #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" @@ -140,10 +141,52 @@ class GNNGraph { void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const; + ////////////////////////////////////////////////////////////////////////////// + // Sampling related + ////////////////////////////////////////////////////////////////////////////// + + //! Loops through all master nodes and determines if it is "on" or "off" + //! (the meaning of on and off depends on how it is used; for now, it is used + //! to indicate subgraph presence) + void UniformNodeSample(); + + //! Returns true if a particular node is currently considered "in" a sampled + //! graph + bool IsInSampledGraph(const NodeIterator& ni) const { + // TODO(loc) GPU + return partitioned_graph_->getData(*ni); + } + #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif private: + ////////////////////////////////////////////////////////////////////////////// + // Initialization + ////////////////////////////////////////////////////////////////////////////// + + //! Read labels of local nodes only + void ReadLocalLabels(const std::string& dataset_name, + bool has_single_class_label); + //! Read features of local nodes only + void ReadLocalFeatures(const std::string& dataset_str); + //! Helper function to read masks from file into the appropriate structures + //! given a name, mask type, and arrays to save into + size_t ReadLocalMasksFromFile(const std::string& dataset_name, + const std::string& mask_type, + GNNRange* mask_range, char* masks); + //! Read masks of local nodes only for training, validation, and testing + void ReadLocalMasks(const std::string& dataset_name); + //! Reads the entire graph topology in (but nothing else) + void ReadWholeGraph(const std::string& dataset_name); + //! Initializes the norm factors using the entire graph's topology for global + //! degree access + void InitNormFactor(); + + ////////////////////////////////////////////////////////////////////////////// + // Accuracy + ////////////////////////////////////////////////////////////////////////////// + float GetGlobalAccuracyCPU(PointerWithSize predictions, GNNPhase phase); float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, @@ -151,6 +194,10 @@ class GNNGraph { float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, GNNPhase phase); + ////////////////////////////////////////////////////////////////////////////// + // Vars + ////////////////////////////////////////////////////////////////////////////// + //! Directory for input data const std::string input_directory_; //! In a multi-host setting, this variable stores the host id that the graph @@ -199,29 +246,10 @@ class GNNGraph { //! 
Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; - // TODO vars for subgraphs as necessary - - ////////////////////////////////////////////////////////////////////////////// - // Initialization - ////////////////////////////////////////////////////////////////////////////// + //! RNG for subgraph sampling + galois::PerThreadRNG sample_rng_; - //! Read labels of local nodes only - void ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label); - //! Read features of local nodes only - void ReadLocalFeatures(const std::string& dataset_str); - //! Helper function to read masks from file into the appropriate structures - //! given a name, mask type, and arrays to save into - size_t ReadLocalMasksFromFile(const std::string& dataset_name, - const std::string& mask_type, - GNNRange* mask_range, char* masks); - //! Read masks of local nodes only for training, validation, and testing - void ReadLocalMasks(const std::string& dataset_name); - //! Reads the entire graph topology in (but nothing else) - void ReadWholeGraph(const std::string& dataset_name); - //! Initializes the norm factors using the entire graph's topology for global - //! degree access - void InitNormFactor(); + // TODO vars for subgraphs as necessary ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index f8d8cd8d8a..2924520661 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -48,9 +48,11 @@ struct GNNLayerConfig { bool do_activation{false}; //! True if normalization is to occur during multiplies bool do_normalization{false}; - //! If this is true, aggregate may occur after multiply if # of input columns + //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation - bool allow_aggregate_after_update{true}; + bool disable_aggregate_after_update{false}; + //! Graph sampling flag in use or not + bool do_sampling{false}; // TODO activation type; for now default is softmax }; @@ -135,6 +137,10 @@ class GNNLayer { //! stored in the layer void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + //! Flip sampling switch on + void EnableSampling() { config_.do_sampling = true; } + bool IsSampledLayer() { return config_.do_sampling; } + #ifdef GALOIS_ENABLE_GPU //! 
Utility function for allocating PointerWithSize AllocateGPU(const std::vector& v) { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index aae4fbb8a1..7892bf4f9e 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -75,12 +75,29 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // if debug mode just kill program assert(false); } + + // flip sampling + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + ptr->EnableSampling(); + } + } } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + assert(ptr->IsSampledLayer()); + } + } + // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { + if (config_.do_sampling()) { + // subgraph sample every epoch + graph_->UniformNodeSample(); + } const PointerWithSize predictions = DoInference(); GradientPropagation(); float train_accuracy = GetGlobalAccuracy(predictions); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 646f39db8e..f110228fa3 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,6 +130,16 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } +void galois::graphs::GNNGraph::UniformNodeSample() { + galois::do_all( + galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { + partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(0.5); + }); + // TODO(loc) GPU + // TODO(loc) sync the flags across all machines to have same sample on all of + // them +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 48759a9bfa..2bef20ab1e 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -58,7 +58,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( } // flip aggregate/update if dimensions favor it (do less work) - if (!config_.allow_aggregate_after_update || + if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, @@ -101,7 +101,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregation/update // TODO clean up logic here to reduce nesting - if (!config_.allow_aggregate_after_update || + if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { // transposed sgemm for derivative; in_temp is output diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 600ac42018..d13674f1a2 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -30,7 +30,7 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; - l_config.allow_aggregate_after_update = false; + l_config.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 58d1d7d581..136953378d 100644 --- a/libgnn/test/convlayer-test.cpp +++ 
b/libgnn/test/convlayer-test.cpp @@ -52,7 +52,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = @@ -207,10 +207,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; - config.allow_aggregate_after_update = false; + config.do_dropout = true; + config.do_activation = true; + config.do_normalization = true; + config.disable_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes // (verification requires floating point accuracy or setting a seed which I diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index e7232ca108..224204bceb 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -24,7 +24,7 @@ int main() { // note this includes the output; last 2 must be same because softmax std::vector layer_output_sizes = {4, 7, 7}; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // note GNNLayerConfig is passed in; use a config that does not do anything // extra like dropout or activation and the like so that input is easier to // verify diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp index 3ea524e4a7..4c2c01f844 100644 --- a/libgnn/test/weight-sync-test.cpp +++ b/libgnn/test/weight-sync-test.cpp @@ -21,7 +21,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; // create the layer, no norm factor std::unique_ptr layer_0 = std::make_unique(0, *(test_graph.get()), diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 737887a756..598148af42 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -28,7 +28,7 @@ extern llvm::cl::opt multiclass_labels; // TODO optimizer type //! Toggles an optimization that flips aggregate/update step if it would be //! beneficial -extern llvm::cl::opt agg_after_update; +extern llvm::cl::opt disable_agg_after_update; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d1dbb5bba3..684e4111dd 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -73,11 +73,11 @@ llvm::cl::opt "ground truth; required for some inputs"), cll::init(false)); -llvm::cl::opt - agg_after_update("allowAggregationAfterUpdate", - cll::desc("If true (on by default), allows aggregate to " - "be done after update as an optimization"), - cll::init(true)); +llvm::cl::opt disable_agg_after_update( + "disableAggregationAfterUpdate", + cll::desc("If true (off by default), disables aggregate " + "after update optimization"), + cll::init(false)); const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { @@ -127,11 +127,11 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { //! 
Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; - layer_config.do_dropout = do_dropout; - layer_config.dropout_rate = dropout_rate; - layer_config.do_activation = do_activation; - layer_config.do_normalization = do_normalization; - layer_config.allow_aggregate_after_update = agg_after_update; + layer_config.do_dropout = do_dropout; + layer_config.dropout_rate = dropout_rate; + layer_config.do_activation = do_activation; + layer_config.do_normalization = do_normalization; + layer_config.disable_aggregate_after_update = disable_agg_after_update; return layer_config; } From e72595a75d44ed3d6215fe6101653551d91231a7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Feb 2021 16:35:20 -0600 Subject: [PATCH 459/660] Layers account for sampling: sampling test If sampling is enabled, layers now ignore the appropriate data depending on the state of the sampled flag in the graph. Adds a test to make sure sampling works sanely as well. TODO: for completeness, can test the agg/xform swap in the GCN layer. --- libgnn/include/galois/graphs/GNNGraph.h | 5 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 15 +- libgnn/src/layers/SigmoidLayer.cpp | 10 + libgnn/src/layers/SoftmaxLayer.cpp | 10 + libgnn/test/CMakeLists.txt | 4 + libgnn/test/sample-test.cpp | 211 ++++++++++++++++++ 6 files changed, 254 insertions(+), 1 deletion(-) create mode 100644 libgnn/test/sample-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 4400809940..242b63d4c3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -150,6 +150,11 @@ class GNNGraph { //! to indicate subgraph presence) void UniformNodeSample(); + //! Makes a node "sampled"; used for debugging/testing + void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } + //! Makes a node "not sampled"; used for debugging/testing + void UnsetSampledNode(size_t node) { partitioned_graph_->getData(node) = 0; } + //! Returns true if a particular node is currently considered "in" a sampled //! 
graph bool IsInSampledGraph(const NodeIterator& ni) const { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 2bef20ab1e..46b997b087 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -208,6 +208,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } + GNNFloat source_norm = 0.0; if (config_.do_normalization) { source_norm = graph_.NormFactor(src); @@ -215,7 +222,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { - size_t dst = graph_.EdgeDestination(e); + size_t dst = graph_.EdgeDestination(e); + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (!graph_.IsInSampledGraph(dst)) + continue; + } + size_t index_to_dst_feature = dst * column_length; if (config_.do_normalization) { diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 3ae7492046..a7b373373c 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -17,6 +17,11 @@ galois::SigmoidLayer::ForwardPhaseCPU( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(local_node)) + return; + } + size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { @@ -65,6 +70,11 @@ galois::SigmoidLayer::BackwardPhaseCPU() { galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(local_node)) + return; + } + // derivative cross entropy into norm grad const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); size_t node_offset = feature_length * local_node; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 562349780b..62dcabe622 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -12,6 +12,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(i)) + return; + } + if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], @@ -63,6 +68,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { + if (IsSampledLayer()) { + if (!graph_.IsInSampledGraph(i)) + return; + } + // create ground truth vector for this LID // TODO maybe make this part of the graph class instead of recreating // every time diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 18a854ff8f..a6b711397b 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -54,6 +54,10 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(f1-test f1-test.cpp) target_link_libraries(f1-test galois_gnn) add_test(NAME 
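Note on the aggregation change above: the pattern is simply to zero the output row, bail out early for sources that are not in the sampled graph, and skip unsampled destinations inside the edge loop. A minimal stand-alone sketch of that pattern, using plain CSR arrays and a per-node flag in place of the Galois graph and layer classes (those stand-ins are assumptions, not the real API):

    #include <cstddef>
    #include <vector>

    // Simplified stand-in for the sampled aggregation loop: CSR graph given by
    // row_start / edge_dst, one feature row of length `cols` per node, and a
    // per-node `sampled` flag. Unsampled sources produce an all-zero row;
    // unsampled destinations contribute nothing.
    void AggregateSampled(const std::vector<size_t>& row_start,
                          const std::vector<size_t>& edge_dst,
                          const std::vector<float>& features,
                          const std::vector<char>& sampled, size_t cols,
                          std::vector<float>* out) {
      size_t num_nodes = row_start.size() - 1;
      out->assign(num_nodes * cols, 0.0f);
      for (size_t src = 0; src < num_nodes; ++src) {
        if (!sampled[src])
          continue;                      // source not in the sampled subgraph
        for (size_t e = row_start[src]; e < row_start[src + 1]; ++e) {
          size_t dst = edge_dst[e];
          if (!sampled[dst])
            continue;                    // destination dropped this epoch
          for (size_t c = 0; c < cols; ++c)
            (*out)[src * cols + c] += features[dst * cols + c];
        }
      }
    }

The real layer additionally applies the degree-based norm factors and handles the train/validate/test phases; this only shows the sampling skip itself.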
f1-test COMMAND f1-test) + + add_executable(sample-test sample-test.cpp) + target_link_libraries(sample-test galois_gnn) + add_test(NAME sample-test COMMAND sample-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp new file mode 100644 index 0000000000..ead938e5aa --- /dev/null +++ b/libgnn/test/sample-test.cpp @@ -0,0 +1,211 @@ +//! @file sample-test.cpp +//! Sampling tester + +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SoftmaxLayer.h" +#include "galois/layers/SigmoidLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + + // choose a few sample nodes + test_graph.SetSampledNode(0); + test_graph.SetSampledNode(2); + test_graph.SetSampledNode(4); + test_graph.SetSampledNode(5); + test_graph.UnsetSampledNode(1); + test_graph.UnsetSampledNode(3); + test_graph.UnsetSampledNode(6); + + ////////////////////////////////////////////////////////////////////////////// + + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + layer_1->EnableSampling(); + + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 15); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 12); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 0); + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + // 0 out unsampled nodes + dummy_ones_v[2] = 0; + dummy_ones_v[3] = 0; + dummy_ones_v[6] = 0; + dummy_ones_v[7] = 0; + dummy_ones_v[12] = 0; + dummy_ones_v[13] = 0; + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 
21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 2); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 0); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 9); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 9); + + layer_1.reset(); + + ////////////////////////////////////////////////////////////////////////////// + // softmax + ////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerDimensions dimension_out; + dimension_out.input_rows = 7; + dimension_out.input_columns = test_graph.GetNumLabelClasses(); + dimension_out.output_columns = test_graph.GetNumLabelClasses(); + std::vector softmax_input(49, 0.0); + // create input with perfect accuracy + softmax_input[0] = 1; + softmax_input[8] = 1; + softmax_input[16] = 1; + softmax_input[24] = 1; + softmax_input[32] = 1; + softmax_input[40] = 1; + softmax_input[48] = 1; + + auto output_layer = + std::make_unique(3, test_graph, dimension_out); + output_layer->EnableSampling(); + galois::PointerWithSize prediction_distribution = + output_layer->ForwardPhase(softmax_input); + + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[0])) == 0); + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[2 * 7])) == + 2); + GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[4 * 7])) == + 4); + + std::vector sampled_out = {1, 3, 6}; + // assert sampled out are all 0s + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + // softmax back: check sampled out is all 0s (others are floats, too painful) + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(asdf[i * 7 + 
0] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 6] == 0.0); + } + + output_layer.reset(); + + ////////////////////////////////////////////////////////////////////////////// + // sigmoid + ////////////////////////////////////////////////////////////////////////////// + galois::graphs::GNNGraph multi_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + + auto sigmoid_layer = + std::make_unique(3, multi_graph, dimension_out); + sigmoid_layer->EnableSampling(); + // reuse softmax input; only thing interested in is checking for 0s + prediction_distribution = sigmoid_layer->ForwardPhase(softmax_input); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); + } + asdf = sigmoid_layer->BackwardPhase(softmax_input, nullptr); + for (size_t i : sampled_out) { + GALOIS_LOG_ASSERT(asdf[i * 7 + 0] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 1] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 2] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 3] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 4] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 5] == 0.0); + GALOIS_LOG_ASSERT(asdf[i * 7 + 6] == 0.0); + } + + return 0; +} From f573b1d857d9c85c0a2d5761b21ed8ff0b845422 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Feb 2021 16:22:34 -0600 Subject: [PATCH 460/660] gcn app sampling option: loss/accuracy report fix Adds CLI option for sampling for GCN app. Also fixes the loss/accuracy reporting by the library by averaging loss to # of nodes checked and taking into account sampling when calculating both. --- libgnn/include/galois/GraphNeuralNetwork.h | 2 ++ libgnn/include/galois/graphs/GNNGraph.h | 13 ++++++-- libgnn/include/galois/layers/GNNLayer.h | 4 +++ libgnn/src/GraphNeuralNetwork.cpp | 12 ++++---- libgnn/src/graphs/GNNGraph.cpp | 30 +++++++++++++++---- libgnn/src/layers/SigmoidLayer.cpp | 11 ++++--- libgnn/src/layers/SoftmaxLayer.cpp | 1 + lonestar/libgnnbench/include/GNNBench/Input.h | 2 ++ lonestar/libgnnbench/src/Input.cpp | 12 ++++++-- 9 files changed, 65 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9aa7d8189e..ed4cc19b8c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -171,6 +171,8 @@ class GraphNeuralNetwork { //! Returns classification accuracy for single class label or micro F1 score //! for multi-class predictions; this calls into GNNGraph's accuracy call float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracy(const PointerWithSize predictions, + bool sampling); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. 
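The loss/accuracy fix in the next patch reduces to: when sampling is on, only nodes that are valid for the phase and present in the sampled graph are counted, and accuracy is correct / checked. A rough single-host sketch with plain vectors standing in for DGAccumulator and the GNNGraph masks (the helper names and mask layout here are assumptions):

    #include <cstddef>
    #include <vector>

    // Index of the largest of `classes` entries starting at `row`.
    static size_t ArgMax(const float* row, size_t classes) {
      size_t best = 0;
      for (size_t c = 1; c < classes; ++c)
        if (row[c] > row[best])
          best = c;
      return best;
    }

    // Accuracy over nodes valid for the current phase; when `sampling` is true,
    // nodes outside the sampled graph are skipped entirely so they neither
    // help nor hurt the score.
    float SampledAccuracy(const std::vector<float>& predictions,
                          const std::vector<int>& labels,
                          const std::vector<char>& in_phase,
                          const std::vector<char>& sampled, size_t classes,
                          bool sampling) {
      size_t checked = 0, correct = 0;
      for (size_t n = 0; n < labels.size(); ++n) {
        if (!in_phase[n])
          continue;
        if (sampling && !sampled[n])
          continue;                     // not part of this epoch's subgraph
        ++checked;
        if (ArgMax(&predictions[n * classes], classes) ==
            static_cast<size_t>(labels[n]))
          ++correct;
      }
      return checked ? static_cast<float>(correct) / checked : 0.0f;
    }

The distributed version accumulates `correct` and `checked` across hosts before dividing; that reduction is omitted here.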
Also known as a backward phase in diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 242b63d4c3..a4ef90ea4a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -96,8 +96,11 @@ class GNNGraph { }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase); + float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, + bool sampling); //! Returns the ground truth label of some local id assuming labels are single //! class labels. @@ -161,6 +164,10 @@ class GNNGraph { // TODO(loc) GPU return partitioned_graph_->getData(*ni); } + bool IsInSampledGraph(size_t node_id) const { + // TODO(loc) GPU + return partitioned_graph_->getData(node_id); + } #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } @@ -193,11 +200,11 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// float GetGlobalAccuracyCPU(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase, bool sampling); ////////////////////////////////////////////////////////////////////////////// // Vars diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 2924520661..06bce9660f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -221,6 +221,10 @@ class GNNLayer { galois::GNNOutputLayerType output_layer_type_{ galois::GNNOutputLayerType::kInvalid}; + // Used mainly for accuracy tracking + galois::DGAccumulator node_count_; + galois::DGAccumulator float_accumulator_; + ////////////////////////////////////////////////////////////////////////////// //! 
Init based from following paper diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7892bf4f9e..d9c0110a9f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -86,11 +86,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - if (config_.do_sampling()) { - for (std::unique_ptr& ptr : gnn_layers_) { - assert(ptr->IsSampledLayer()); - } - } + // if (config_.do_sampling()) { + // for (std::unique_ptr& ptr : gnn_layers_) { + // assert(ptr->IsSampledLayer()); + // } + // } // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { @@ -136,7 +136,7 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( PointerWithSize predictions) { - return graph_->GetGlobalAccuracy(predictions, phase_); + return graph_->GetGlobalAccuracy(predictions, phase_, config_.do_sampling()); } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index f110228fa3..4a1b3a2f99 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -393,20 +393,26 @@ void galois::graphs::GNNGraph::InitNormFactor() { float galois::graphs::GNNGraph::GetGlobalAccuracy( PointerWithSize predictions, GNNPhase phase) { // No GPU version yet, but this is where it would be - return GetGlobalAccuracyCPU(predictions, phase); + return GetGlobalAccuracy(predictions, phase, false); +} + +float galois::graphs::GNNGraph::GetGlobalAccuracy( + PointerWithSize predictions, GNNPhase phase, bool sampling) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase, sampling); } float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { if (is_single_class_label()) { - return GetGlobalAccuracyCPUSingle(predictions, phase); + return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); } else { - return GetGlobalAccuracyCPUMulti(predictions, phase); + return GetGlobalAccuracyCPUMulti(predictions, phase, sampling); } } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { // check owned nodes' accuracy assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); @@ -416,6 +422,12 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { + if (sampling) { + if (!IsInSampledGraph(lid)) { + return; + } + } + total_checked_ += 1; // get prediction by getting max size_t predicted_label = galois::MaxIndex( @@ -441,7 +453,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( - PointerWithSize predictions, GNNPhase phase) { + PointerWithSize predictions, GNNPhase phase, bool sampling) { const GNNLabel* full_ground_truth = GetMultiClassLabel(0); assert(predictions.size() == (num_label_classes_ * size())); @@ -465,6 +477,12 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( galois::iterate(begin_owned(), end_owned()), [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { + if (sampling) { + if 
(!IsInSampledGraph(lid)) { + return; + } + } + size_t label_index = lid * num_label_classes_ + label_class; GNNLabel true_label = full_ground_truth[label_index]; diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index a7b373373c..983ab9af87 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -10,8 +10,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - galois::GAccumulator total_loss; - total_loss.reset(); + node_count_.reset(); + float_accumulator_.reset(); galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -22,6 +22,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( return; } + node_count_ += 1; + size_t node_offset = feature_length * local_node; // sigmoid the values for this node for (unsigned index = 0; index < feature_length; index++) { @@ -40,12 +42,13 @@ galois::SigmoidLayer::ForwardPhaseCPU( feature_length, graph_.GetMultiClassLabel(local_node), &forward_output_matrix_[node_offset]); // TODO(loc) normalize the loss - total_loss += input_loss_[local_node]; + float_accumulator_ += input_loss_[local_node]; } }, galois::steal(), galois::loopname("SigmoidForward")); - galois::gPrint("Total loss is ", total_loss.reduce(), "\n"); + galois::gPrint("Average loss is ", + float_accumulator_.reduce() / node_count_.reduce(), "\n"); return forward_output_matrix_; } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 62dcabe622..9b9fc0e3a6 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,6 +8,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; + // TODO(loc) once needed for accuracy debugging, print out loss galois::do_all( galois::iterate(graph_.begin_owned(), graph_.end_owned()), diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 598148af42..18db419793 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -29,6 +29,8 @@ extern llvm::cl::opt multiclass_labels; //! Toggles an optimization that flips aggregate/update step if it would be //! beneficial extern llvm::cl::opt disable_agg_after_update; +//! 
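The loss-reporting change in the sigmoid layer follows the same idea: accumulate loss only over nodes that were actually evaluated and divide by that count rather than printing the raw total. A minimal sketch of that averaging, without the distributed accumulators (the mask vector is an assumption):

    #include <cstddef>
    #include <vector>

    // Average a per-node loss over only the nodes that were evaluated
    // (valid for the phase and, if sampling, inside the sampled graph).
    // Mirrors the float_accumulator_ / node_count_ division, minus the
    // cross-host reduction.
    float AverageLoss(const std::vector<float>& per_node_loss,
                      const std::vector<char>& evaluated) {
      double total = 0.0;
      size_t count = 0;
      for (size_t n = 0; n < per_node_loss.size(); ++n) {
        if (!evaluated[n])
          continue;
        total += per_node_loss[n];
        ++count;
      }
      return count ? static_cast<float>(total / count) : 0.0f;
    }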
Random sampling of nodes every epoch +extern llvm::cl::opt do_graph_sampling; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 684e4111dd..b4cd7fb67e 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -79,6 +79,12 @@ llvm::cl::opt disable_agg_after_update( "after update optimization"), cll::init(false)); +llvm::cl::opt + do_graph_sampling("doGraphSampling", + cll::desc("If true (off by default), sample nodes for " + "use every epoch at a 50\% drop rate"), + cll::init(false)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -188,9 +194,9 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { // layer config object galois::GNNLayerConfig layer_config = CreateLayerConfig(); // GNN config object - galois::GraphNeuralNetworkConfig gnn_config(num_layers, layer_types, - layer_sizes_vector, - output_layer_type, layer_config); + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, output_layer_type, + do_graph_sampling, layer_config); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From c2582e93fc746b6ae488d433947eb0998d2d4ac7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Feb 2021 17:10:15 -0600 Subject: [PATCH 461/660] Option for diff. sample rates; test sample fix Separated sampling function into 2 functions: one allows you to adjust rate of sampling. Fixed sampling such that anything not the training phase will use the entire graph as it should. --- libgnn/include/galois/graphs/GNNGraph.h | 4 +++- libgnn/src/graphs/GNNGraph.cpp | 10 ++++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 5 +++-- libgnn/src/layers/SigmoidLayer.cpp | 6 ++++-- libgnn/src/layers/SoftmaxLayer.cpp | 4 ++-- 5 files changed, 18 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index a4ef90ea4a..3c0419c28f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -150,8 +150,10 @@ class GNNGraph { //! Loops through all master nodes and determines if it is "on" or "off" //! (the meaning of on and off depends on how it is used; for now, it is used - //! to indicate subgraph presence) + //! to indicate subgraph presence); droprate controls chance of being dropped + //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) void UniformNodeSample(); + void UniformNodeSample(float droprate); //! 
Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4a1b3a2f99..ee243f5c4d 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,10 +130,12 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } + +void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { - partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(0.5); + partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(droprate); }); // TODO(loc) GPU // TODO(loc) sync the flags across all machines to have same sample on all of @@ -423,7 +425,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { if (sampling) { - if (!IsInSampledGraph(lid)) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { return; } } @@ -478,7 +480,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( [&](const unsigned lid) { if (IsValidForPhase(lid, phase)) { if (sampling) { - if (!IsInSampledGraph(lid)) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { return; } } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 46b997b087..c416a0272a 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -211,7 +211,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (IsSampledLayer()) { // check if node is part of sampled graph; ignore after 0'ing if not // sampled - if (!graph_.IsInSampledGraph(src)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(src)) return; } @@ -225,7 +225,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t dst = graph_.EdgeDestination(e); if (IsSampledLayer()) { // ignore non-sampled nodes - if (!graph_.IsInSampledGraph(dst)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) continue; } diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 983ab9af87..35f95b64a6 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -18,7 +18,8 @@ galois::SigmoidLayer::ForwardPhaseCPU( [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(local_node)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(local_node)) return; } @@ -74,7 +75,8 @@ galois::SigmoidLayer::BackwardPhaseCPU() { [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(local_node)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(local_node)) return; } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 9b9fc0e3a6..d98251091c 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -14,7 +14,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin_owned(), graph_.end_owned()), [&](const unsigned i) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(i)) + if (layer_phase_ == 
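For reference, the per-epoch node sampling above is just an independent coin flip per master node. A stand-alone sketch using the standard library RNG; per the header comment, `droprate` is the chance a node is excluded, and since the library's DoBernoulli() convention is not shown in this diff, std::bernoulli_distribution is used directly here as an assumption:

    #include <cstddef>
    #include <random>
    #include <vector>

    // Mark each node as sampled (1) or dropped (0) for this epoch. A droprate
    // of 0.5 keeps roughly half of the nodes.
    void SampleNodesUniformly(std::vector<char>* sampled_flags, float droprate,
                              std::mt19937* rng) {
      std::bernoulli_distribution keep(1.0 - droprate);
      for (char& flag : *sampled_flags)
        flag = keep(*rng) ? 1 : 0;
    }

In the training loop this runs once per epoch before the forward pass, after which the norm factors are recomputed for the new subgraph.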
GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) return; } @@ -70,7 +70,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { if (IsSampledLayer()) { - if (!graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) return; } From e393c03b80e2106ff1954e48cd50c0ffd8946ace Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 12:16:56 -0600 Subject: [PATCH 462/660] ogbn-proteins split --- libcusp/include/galois/graphs/NewGeneric.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 0c3e4b31d4..771c5b5143 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -115,6 +115,10 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(1569960); + } else if (filename.find("ogbn-proteins") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(0); + bps.push_back(86618); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing From 78f1e86a792091cff94d56e47d02b75fbcb1434d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 12:40:28 -0600 Subject: [PATCH 463/660] Inductive gnn training Inductive gnn training option enabled by this commit. Users can specify inductive training, and the result is that during training validation and test nodes are completely ignored. --- libgnn/include/galois/layers/GNNLayer.h | 5 ++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 37 ++++++++++++++----- lonestar/libgnnbench/include/GNNBench/Input.h | 21 ----------- lonestar/libgnnbench/include/GNNBench/Start.h | 3 -- lonestar/libgnnbench/src/Input.cpp | 7 ++++ 5 files changed, 38 insertions(+), 35 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 06bce9660f..6ec6a78671 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -53,6 +53,8 @@ struct GNNLayerConfig { bool disable_aggregate_after_update{false}; //! Graph sampling flag in use or not bool do_sampling{false}; + //! Inductive layer means for aggregation all non-training nodes are ignored + bool inductive_training_{false}; // TODO activation type; for now default is softmax }; @@ -139,7 +141,8 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } - bool IsSampledLayer() { return config_.do_sampling; } + bool IsSampledLayer() const { return config_.do_sampling; } + bool IsInductiveLayer() const { return config_.inductive_training_; } #ifdef GALOIS_ENABLE_GPU //! 
Utility function for allocating diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index c416a0272a..208229d6f1 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -208,11 +208,19 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } - if (IsSampledLayer()) { - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(src)) - return; + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } } GNNFloat source_norm = 0.0; @@ -223,11 +231,20 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } } size_t index_to_dst_feature = dst * column_length; diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 18db419793..784b1fd431 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -10,27 +10,6 @@ extern llvm::cl::opt input_directory; extern llvm::cl::opt input_name; //! Scheme used to partition the graph extern llvm::cl::opt partition_scheme; -// Control layer count and size -extern llvm::cl::opt num_layers; -extern llvm::cl::list layer_sizes; -// Control dropout -extern llvm::cl::opt do_dropout; -extern llvm::cl::opt dropout_rate; -// Control activation -extern llvm::cl::opt do_activation; -// TODO activation layer type once more are supported -//! Controls weight normalization based on degree -extern llvm::cl::opt do_normalization; -//! Output layer type -extern llvm::cl::opt output_layer_type; -//! If true, use multiclass ground truth -extern llvm::cl::opt multiclass_labels; -// TODO optimizer type -//! Toggles an optimization that flips aggregate/update step if it would be -//! beneficial -extern llvm::cl::opt disable_agg_after_update; -//! 
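The inductive-training guards added to the convolutional layer compose with the sampling guards in a fixed order: outside the training phase everything participates; during training, inductive mode hides all non-train nodes, and sampling additionally hides nodes outside this epoch's subgraph. A small sketch of that predicate, with simplified names that are not the real layer API:

    enum class Phase { kTrain, kValidate, kTest };

    // Decide whether a node participates in aggregation for the current phase.
    // Validation/test always see the full graph; the filters only apply while
    // training.
    bool ParticipatesInAggregation(Phase layer_phase, bool inductive,
                                   bool sampling, bool is_train_node,
                                   bool is_sampled_node) {
      if (layer_phase != Phase::kTrain)
        return true;
      if (inductive && !is_train_node)
        return false;                    // non-train nodes do not exist
      if (sampling && !is_sampled_node)
        return false;                    // dropped for this epoch
      return true;
    }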
Random sampling of nodes every epoch -extern llvm::cl::opt do_graph_sampling; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index c17ddecadc..c03970c868 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -8,10 +8,7 @@ // CLI //////////////////////////////////////////////////////////////////////////////// -extern llvm::cl::opt num_threads; -extern llvm::cl::opt num_runs; extern llvm::cl::opt num_epochs; -extern llvm::cl::opt stat_file; //////////////////////////////////////////////////////////////////////////////// // Init functions diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index b4cd7fb67e..d9b92607b1 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -85,6 +85,12 @@ llvm::cl::opt "use every epoch at a 50\% drop rate"), cll::init(false)); +llvm::cl::opt + do_inductive_training("doInductiveTraining", + cll::desc("If true (off by default), during training " + "all non-train nodes are ignored"), + cll::init(false)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -138,6 +144,7 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.do_activation = do_activation; layer_config.do_normalization = do_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; + layer_config.inductive_training_ = do_inductive_training; return layer_config; } From 5b95e08224d8337b7cff5d71f2dc60967c7795aa Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 5 Feb 2021 21:09:31 -0600 Subject: [PATCH 464/660] Sample and inductive norm factor correction Adds inductive training (i.e. ignore non-train nodes during training). Sampling/inductive training changes up input graph: norm factor must chnage as well. --- libgnn/include/galois/GraphNeuralNetwork.h | 14 +++-- libgnn/include/galois/graphs/GNNGraph.h | 7 +++ libgnn/src/GraphNeuralNetwork.cpp | 7 ++- libgnn/src/graphs/GNNGraph.cpp | 72 +++++++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 1 + 5 files changed, 93 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ed4cc19b8c..ae860d0d32 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -58,9 +58,9 @@ class GraphNeuralNetworkConfig { GNNOutputLayerType output_layer_type, bool do_sampling, const GNNLayerConfig& default_layer_config) - : num_intermediate_layers_(num_layers), layer_types_(layer_types), - layer_column_sizes_(layer_column_sizes), - output_layer_type_(output_layer_type), do_sampling_(do_sampling), + : do_sampling_(do_sampling), num_intermediate_layers_(num_layers), + layer_types_(layer_types), layer_column_sizes_(layer_column_sizes), + output_layer_type_(output_layer_type), default_layer_config_(default_layer_config) { // Do sanity checks on inputs // should have a type for each layer @@ -98,6 +98,12 @@ class GraphNeuralNetworkConfig { return default_layer_config_; } + // public because they are independent of other settings + //! Graph sampling + bool do_sampling_{false}; + //! Inductive = training ignores test/val set + bool inductive_training_{false}; + private: //! Number of layers to construct in the GNN not including the output //! 
layer @@ -110,8 +116,6 @@ class GraphNeuralNetworkConfig { std::vector layer_column_sizes_; //! Output layer type GNNOutputLayerType output_layer_type_; - //! Graph sampling - bool do_sampling_; //! Default config to use for layers GNNLayerConfig default_layer_config_; }; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3c0419c28f..2ed6647b7c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -171,9 +171,16 @@ class GNNGraph { return partitioned_graph_->getData(node_id); } + //! Calculate norm factor considering the entire graph + void CalculateFullNormFactor(); + //! Calculate norm factor considering sampled nodes and/or training nodes + //! only (inductive) + void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); + #ifdef GALOIS_ENABLE_GPU const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } #endif + private: ////////////////////////////////////////////////////////////////////////////// // Initialization diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index d9c0110a9f..be188ff843 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -92,11 +92,16 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // } // } + if (config_.inductive_training_) { + graph_->CalculateSpecialNormFactor(false, true); + } + // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { if (config_.do_sampling()) { // subgraph sample every epoch graph_->UniformNodeSample(); + graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); GradientPropagation(); @@ -107,7 +112,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // TODO validation and test as necessary } - + graph_->CalculateFullNormFactor(); // check test accuracy galois::StatTimer acc_timer("FinalAccuracyTest"); acc_timer.start(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index ee243f5c4d..3e5d468da2 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -130,7 +130,7 @@ void galois::graphs::GNNGraph::AggregateSync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.5); } void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( @@ -374,6 +374,11 @@ void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); norm_factors_.resize(partitioned_graph_->size(), 0.0); + CalculateFullNormFactor(); +} + +void galois::graphs::GNNGraph::CalculateFullNormFactor() { + norm_factors_.assign(partitioned_graph_->size(), 0.0); // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( @@ -389,7 +394,70 @@ void galois::graphs::GNNGraph::InitNormFactor() { 1.0 / std::sqrt(static_cast(global_degree)); } }, - galois::loopname("InitNormFactor")); + galois::loopname("CalculateFullNormFactor")); +} + +void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, + bool is_inductive) { + if (galois::runtime::getSystemNetworkInterface().Num > 1) { + GALOIS_LOG_FATAL("cannot run special norm factor in dist setting yet"); + } + + 
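The body of CalculateSpecialNormFactor that follows boils down to recomputing 1/sqrt(degree) per node, where the degree only counts neighbors that pass the same sampled/inductive filter as the node itself. A condensed single-host sketch with plain CSR arrays and a combined `keep` mask standing in for the partitioned graph and phase checks (those stand-ins are assumptions):

    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Recompute per-node norm factors over the filtered graph. Nodes that do
    // not pass the filter, or end up with degree zero, keep a factor of 0.
    void SpecialNormFactor(const std::vector<size_t>& row_start,
                           const std::vector<size_t>& edge_dst,
                           const std::vector<char>& keep,
                           std::vector<float>* norm_factors) {
      size_t num_nodes = row_start.size() - 1;
      norm_factors->assign(num_nodes, 0.0f);
      for (size_t n = 0; n < num_nodes; ++n) {
        if (!keep[n])
          continue;
        size_t degree = 0;
        for (size_t e = row_start[n]; e < row_start[n + 1]; ++e)
          if (keep[edge_dst[e]])
            ++degree;
        if (degree != 0)
          (*norm_factors)[n] = 1.0f / std::sqrt(static_cast<float>(degree));
      }
    }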
norm_factors_.assign(partitioned_graph_->size(), 0.0); + + // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t local_id) { + // ignore node if not valid + if (is_sampled && is_inductive) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) || + !IsInSampledGraph(local_id)) { + return; + } + } else if (is_sampled) { + if (!IsInSampledGraph(local_id)) { + return; + } + } else if (is_inductive) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain)) { + return; + } + } + + size_t degree = 0; + + // TODO(loc) make this work in a distributed setting; assuming + // whole graph is present on single host at the moment + for (EdgeIterator e = EdgeBegin(local_id); e != EdgeEnd(local_id); + e++) { + size_t dest = EdgeDestination(e); + if (is_sampled && is_inductive) { + if (!IsValidForPhase(dest, GNNPhase::kTrain) || + !IsInSampledGraph(dest)) { + continue; + } + } else if (is_sampled) { + if (!IsInSampledGraph(dest)) { + continue; + } + } else if (is_inductive) { + if (!IsValidForPhase(dest, GNNPhase::kTrain)) { + continue; + } + } else { + GALOIS_LOG_WARN( + "Why is special norm factor called if not sampled/inductive?"); + } + degree += 1; + } + + // only set if non-zero + if (degree != 0) { + norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); + } + }, + galois::loopname("CalculateSpecialNormFactor")); } float galois::graphs::GNNGraph::GetGlobalAccuracy( diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d9b92607b1..dea458a6b3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -204,6 +204,7 @@ InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); + gnn_config.inductive_training_ = do_inductive_training; // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From c704c5a001a6de5a8fe88c904173840a7b1745af Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Tue, 9 Feb 2021 17:45:41 -0600 Subject: [PATCH 465/660] Implement a distributed multi-gpu GCN. 
(#1) Multi-GPU GCN --- libgluon/include/galois/cuda/Context.h | 118 +++++++ libgluon/include/galois/cuda/HostDecls.h | 46 ++- .../include/galois/graphs/GluonSubstrate.h | 294 ++++++++++++------ .../include/galois/runtime/SyncStructures.h | 99 ++++++ .../include/galois/runtime/cuda/DeviceSync.h | 217 ++++++++----- libgnn/CMakeLists.txt | 5 +- libgnn/include/galois/CUDAUtilHostDecls.h | 3 + .../include/galois/GNNCudaContextHostDecls.h | 82 +++++ libgnn/include/galois/GNNOptimizers.h | 40 +-- libgnn/include/galois/GNNTypes.h | 5 + libgnn/include/galois/graphs/GNNGraph.h | 30 +- .../graphs/GraphAggregationSyncStructures.h | 26 +- libgnn/include/galois/layers/GNNLayer.h | 4 +- libgnn/src/CUDAUtil.cu | 9 + libgnn/src/GNNCudaContext.cu | 228 ++++++++++++++ libgnn/src/GNNOptimizers.cpp | 59 ++-- libgnn/src/GraphNeuralNetwork.cpp | 58 +++- libgnn/src/graphs/GNNGraph.cpp | 68 +++- libgnn/src/layers/GNNLayer.cpp | 112 ++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 158 ++++++---- libgnn/src/layers/SoftmaxLayer.cpp | 30 +- libgnn/test/CMakeLists.txt | 3 + libgnn/test/gpu-adam-test.cpp | 2 +- libgnn/test/gpu-aggregate-sync-test.cpp | 212 +++++++++++++ libgnn/test/gpu-convlayer-test.cpp | 11 +- libgnn/test/gpu-epoch-test.cpp | 1 + libgnn/test/gpu-softmaxlayer-test.cpp | 1 + libgpu/include/sharedptr.h | 11 + lonestar/libgnnbench/CMakeLists.txt | 1 + lonestar/libgnnbench/include/GNNBench/Input.h | 4 + lonestar/libgnnbench/include/GNNBench/Start.h | 14 + lonestar/libgnnbench/src/Start.cpp | 73 ++++- 32 files changed, 1640 insertions(+), 384 deletions(-) create mode 100644 libgnn/include/galois/CUDAUtilHostDecls.h create mode 100644 libgnn/include/galois/GNNCudaContextHostDecls.h create mode 100644 libgnn/src/CUDAUtil.cu create mode 100644 libgnn/src/GNNCudaContext.cu create mode 100644 libgnn/test/gpu-aggregate-sync-test.cpp diff --git a/libgluon/include/galois/cuda/Context.h b/libgluon/include/galois/cuda/Context.h index 0ecf9eba82..57492bfdf6 100644 --- a/libgluon/include/galois/cuda/Context.h +++ b/libgluon/include/galois/cuda/Context.h @@ -32,6 +32,7 @@ #include #include "gg.h" #include "galois/cuda/HostDecls.h" +#include "galois/cuda/DynamicBitset.h" struct CUDA_Context_Shared { unsigned int* num_nodes; // per host @@ -170,6 +171,34 @@ size_t mem_usage_CUDA_common(MarshalGraph& g, unsigned num_hosts) { return mem_usage; } +size_t mem_usage_CUDA_common(PartitionedGraphInfo& g_info, unsigned num_hosts) { + size_t mem_usage = 0; + size_t max_shared_size = 0; // for union across master/mirror of all hosts + mem_usage += num_hosts * sizeof(unsigned int); + mem_usage += num_hosts * sizeof(Shared); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (g_info.num_master_nodes[h] > 0) { + mem_usage += g_info.num_master_nodes[h] * sizeof(unsigned int); + } + if (g_info.num_master_nodes[h] > max_shared_size) { + max_shared_size = g_info.num_master_nodes[h]; + } + } + mem_usage += num_hosts * sizeof(unsigned int); + mem_usage += num_hosts * sizeof(Shared); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (g_info.num_mirror_nodes[h] > 0) { + mem_usage += g_info.num_mirror_nodes[h] * sizeof(unsigned int); + } + if (g_info.num_mirror_nodes[h] > max_shared_size) { + max_shared_size = g_info.num_mirror_nodes[h]; + } + } + mem_usage += max_shared_size * sizeof(unsigned int); + mem_usage += ((max_shared_size + 63) / 64) * sizeof(unsigned long long int); + return mem_usage; +} + template void load_graph_CUDA_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, @@ -191,6 +220,44 @@ void 
load_graph_CUDA_field(struct CUDA_Context_Common* ctx, field->is_updated.cpu_wr_ptr()->alloc(ctx->gg.nnodes); } +//! Set up cuda context for vector communication. +//! A vector of the vector is represented as a flattened 1D vector. +//! Users can either allocate data on this function or not. +//! The data could be a pointer which had been allocated at outside. +template +void load_graph_CUDA_field_inflating(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned num_hosts, unsigned nnodes, + size_t infl_size) { + load_graph_CUDA_field_inflating(ctx, field, num_hosts, nnodes, + infl_size, true); +} + +template +void load_graph_CUDA_field_inflating(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned num_hosts, unsigned nnodes, + size_t infl_size, bool data_alloc) { + size_t max_shared_size = 0; // for union across master/mirror of all hosts + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->master.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->master.num_nodes[h]; + } + } + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->mirror.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->mirror.num_nodes[h]; + } + } + field->is_updated.alloc(1); + field->is_updated.cpu_wr_ptr()->alloc(nnodes); + + if (data_alloc) { + field->data.alloc(nnodes * infl_size); + } + field->shared_data.alloc(max_shared_size * infl_size); +} + template size_t mem_usage_CUDA_field(struct CUDA_Context_Field* field, MarshalGraph& g, unsigned num_hosts) { @@ -211,3 +278,54 @@ size_t mem_usage_CUDA_field(struct CUDA_Context_Field* field, mem_usage += ((g.nnodes + 63) / 64) * sizeof(unsigned long long int); return mem_usage; } + +void load_graph_CUDA_common(struct CUDA_Context_Common* ctx, + PartitionedGraphInfo& g_info, unsigned num_hosts) { + ctx->numOwned = g_info.numOwned; + ctx->beginMaster = g_info.beginMaster; + ctx->numNodesWithEdges = g_info.numNodesWithEdges; + assert(ctx->id == g_info.id); + + size_t mem_usage = + ((g_info.nnodes + 1) + g_info.nedges) * sizeof(index_type) + + (g_info.nnodes) * sizeof(node_data_type); + + size_t max_shared_size = 0; // for union across master/mirror of all hosts + ctx->master.num_nodes = + (unsigned int*)calloc(num_hosts, sizeof(unsigned int)); + memcpy(ctx->master.num_nodes, g_info.num_master_nodes, + sizeof(unsigned int) * num_hosts); + ctx->master.nodes = (DeviceOnly*)calloc( + num_hosts, sizeof(Shared)); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->master.num_nodes[h] > 0) { + ctx->master.nodes[h].alloc(ctx->master.num_nodes[h]); + ctx->master.nodes[h].copy_to_gpu(g_info.master_nodes[h], + ctx->master.num_nodes[h]); + } + if (ctx->master.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->master.num_nodes[h]; + } + } + ctx->mirror.num_nodes = + (unsigned int*)calloc(num_hosts, sizeof(unsigned int)); + memcpy(ctx->mirror.num_nodes, g_info.num_mirror_nodes, + sizeof(unsigned int) * num_hosts); + ctx->mirror.nodes = (DeviceOnly*)calloc( + num_hosts, sizeof(Shared)); + for (uint32_t h = 0; h < num_hosts; ++h) { + if (ctx->mirror.num_nodes[h] > 0) { + ctx->mirror.nodes[h].alloc(ctx->mirror.num_nodes[h]); + ctx->mirror.nodes[h].copy_to_gpu(g_info.mirror_nodes[h], + ctx->mirror.num_nodes[h]); + } + if (ctx->mirror.num_nodes[h] > max_shared_size) { + max_shared_size = ctx->mirror.num_nodes[h]; + } + } + ctx->offsets.alloc(max_shared_size); + ctx->is_updated.alloc(1); + ctx->is_updated.cpu_wr_ptr()->alloc(max_shared_size); + // printf("[%u] load_graph_GPU: %u owned nodes of total %u resident, 
%lu + // edges\n", ctx->id, ctx->nowned, graph.nnodes, graph.nedges); +} diff --git a/libgluon/include/galois/cuda/HostDecls.h b/libgluon/include/galois/cuda/HostDecls.h index a085b26967..d4852df70d 100644 --- a/libgluon/include/galois/cuda/HostDecls.h +++ b/libgluon/include/galois/cuda/HostDecls.h @@ -35,6 +35,45 @@ typedef unsigned int node_data_type; typedef unsigned int edge_data_type; #endif +struct PartitionedGraphInfo { + size_t nnodes; + size_t nedges; + unsigned int numOwned; // Number of nodes owned (masters) by this host + unsigned int beginMaster; // local id of the beginning of master nodes + unsigned int numNodesWithEdges; // Number of nodes (masters + mirrors) that + // have outgoing edges + int id; + unsigned numHosts; + unsigned int* num_master_nodes; + unsigned int** master_nodes; + unsigned int* num_mirror_nodes; + unsigned int** mirror_nodes; + + PartitionedGraphInfo() + : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0), + id(-1), numHosts(0), num_master_nodes(nullptr), master_nodes(nullptr), + num_mirror_nodes(nullptr), mirror_nodes(nullptr) {} + + ~PartitionedGraphInfo() { + if (!num_master_nodes) + free(num_master_nodes); + if (!master_nodes) { + for (unsigned i = 0; i < numHosts; ++i) { + free(master_nodes[i]); + } + free(master_nodes); + } + if (!num_mirror_nodes) + free(num_mirror_nodes); + if (!mirror_nodes) { + for (unsigned i = 0; i < numHosts; ++i) { + free(mirror_nodes[i]); + } + free(mirror_nodes); + } + } +}; + struct MarshalGraph { size_t nnodes; size_t nedges; @@ -55,9 +94,10 @@ struct MarshalGraph { MarshalGraph() : nnodes(0), nedges(0), numOwned(0), beginMaster(0), numNodesWithEdges(0), - id(-1), numHosts(0), row_start(NULL), edge_dst(NULL), node_data(NULL), - edge_data(NULL), num_master_nodes(NULL), master_nodes(NULL), - num_mirror_nodes(NULL), mirror_nodes(NULL) {} + id(-1), numHosts(0), row_start(nullptr), edge_dst(nullptr), + node_data(nullptr), edge_data(nullptr), num_master_nodes(nullptr), + master_nodes(nullptr), num_mirror_nodes(nullptr), + mirror_nodes(nullptr) {} ~MarshalGraph() { if (!row_start) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index f79427af89..8b68216794 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -719,21 +719,22 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName Name to give timer * @param x Host to send to * @param b OUTPUT: Buffer that will hold data to send + * @param elem_size The inner-vector dimesnion of a vector of the vector */ template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, typename std::enable_if::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { syncExtract( - loopName, x, sharedNodes[x], b); + loopName, x, sharedNodes[x], b, elem_size); } else { syncExtract(loopName, x, sharedNodes[x], - b); + b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; @@ -747,11 +748,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; syncExtract( - loopName, x, sharedNodes[x], b); + loopName, x, sharedNodes[x], b, elem_size); std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string statSendBytes_str(syncTypeStr + "SendBytesVector_" + @@ -1644,9 +1645,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { + uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -1725,8 +1726,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string extract_timer_str(syncTypeStr + "Extract_" + get_run_identifier(loopName)); @@ -1739,7 +1740,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { DataCommMode data_mode; - uint32_t num = indices.size(); + uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful static galois::PODResizeableArray dummyVector; @@ -1768,7 +1769,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { b.resize(sizeof(DataCommMode) + sizeof(size_t) + (num * sizeof(typename SyncFnTy::ValTy))); } - } else { b.resize(0); if (!async) { @@ -1808,9 +1808,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + std::vector& indices, galois::runtime::SendBuffer& b, + size_t elem_size) { + uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -1947,8 +1947,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned, std::vector& indices, - galois::runtime::SendBuffer& b) { - uint32_t num = indices.size(); + galois::runtime::SendBuffer& b, size_t elem_size) { + uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -1958,7 +1958,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { get_run_identifier(loopName)); galois::CondStatTimer Textract(extract_timer_str.c_str(), RNAME); - Textract.start(); if (num > 0) { @@ -2123,7 +2122,7 @@ class GluonSubstrate : public 
galois::runtime::GlobalObject { template - void syncNetSend(std::string loopName) { + void syncNetSend(std::string loopName, size_t elem_size) { static galois::runtime::SendBuffer b; // although a static variable, allocation not reused // due to std::move in net.sendTagged() @@ -2141,7 +2140,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; getSendBuffer(loopName, x, - b); + b, elem_size); if ((!async) || (b.size() > 0)) { size_t syncTypePhase = 0; @@ -2178,14 +2177,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - void syncSend(std::string loopName) { + void syncSend(std::string loopName, size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; galois::CondStatTimer TSendTime( (syncTypeStr + "Send_" + get_run_identifier(loopName)).c_str(), RNAME); TSendTime.start(); syncNetSend(loopName); + VecTy, async>(loopName, elem_size); TSendTime.stop(); } @@ -2717,7 +2716,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template - inline void reduce(std::string loopName) { + inline void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); @@ -2735,7 +2734,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { case noBareMPI: #endif syncSend(loopName); + VecTy, async>(loopName, elem_size); syncRecv(loopName); #ifdef GALOIS_USE_BARE_MPI @@ -2768,7 +2767,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template - inline void broadcast(std::string loopName) { + inline void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), RNAME); @@ -2810,10 +2809,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { #endif if (use_bitset) { syncSend(loopName); + BitsetFnTy, VecTy, async>(loopName, elem_size); } else { syncSend(loopName); + galois::InvalidBitsetFnTy, VecTy, async>(loopName, elem_size); } syncRecv(loopName); @@ -2845,12 +2844,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_src(std::string loopName) { + inline void sync_src_to_src(std::string loopName, size_t elem_size) { // do nothing for OEC // reduce and broadcast for IEC, CVC, UVC if (transposed || isVertexCut) { - reduce(loopName); - broadcast(loopName); + reduce(loopName, + elem_size); + broadcast( + loopName, elem_size); } } @@ -2863,24 +2864,24 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_dst(std::string loopName) { + inline void sync_src_to_dst(std::string loopName, size_t elem_size) { // only broadcast for OEC // only reduce for IEC // reduce and broadcast for CVC, UVC if (transposed) { reduce( - loopName); + loopName, elem_size); if (isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } else { if (isVertexCut) { reduce( - loopName); + loopName, elem_size); } broadcast( - loopName); + loopName, elem_size); } } @@ -2893,13 +2894,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_src_to_any(std::string loopName) { + inline void sync_src_to_any(std::string loopName, size_t elem_size) { // only broadcast for OEC // 
reduce and broadcast for IEC, CVC, UVC if (transposed || isVertexCut) { - reduce(loopName); + reduce(loopName, + elem_size); } - broadcast(loopName); + broadcast(loopName, + elem_size); } /** @@ -2911,23 +2914,23 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_src(std::string loopName) { + inline void sync_dst_to_src(std::string loopName, size_t elem_size) { // only reduce for OEC // only broadcast for IEC // reduce and broadcast for CVC, UVC if (transposed) { if (isVertexCut) { reduce( - loopName); + loopName, elem_size); } broadcast( - loopName); + loopName, elem_size); } else { reduce( - loopName); + loopName, elem_size); if (isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } } @@ -2941,14 +2944,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_dst(std::string loopName) { + inline void sync_dst_to_dst(std::string loopName, size_t elem_size) { // do nothing for IEC // reduce and broadcast for OEC, CVC, UVC if (!transposed || isVertexCut) { reduce( - loopName); + loopName, elem_size); broadcast( - loopName); + loopName, elem_size); } } @@ -2961,13 +2964,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_dst_to_any(std::string loopName) { + inline void sync_dst_to_any(std::string loopName, size_t elem_size) { // only broadcast for IEC // reduce and broadcast for OEC, CVC, UVC if (!transposed || isVertexCut) { - reduce(loopName); + reduce(loopName, + elem_size); } - broadcast(loopName); + broadcast( + loopName, elem_size); } /** @@ -2979,12 +2984,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_src(std::string loopName) { + inline void sync_any_to_src(std::string loopName, size_t elem_size) { // only reduce for OEC // reduce and broadcast for IEC, CVC, UVC - reduce(loopName); + reduce(loopName, + elem_size); if (transposed || isVertexCut) { - broadcast(loopName); + broadcast(loopName, + elem_size); } } @@ -2997,14 +3004,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_dst(std::string loopName) { + inline void sync_any_to_dst(std::string loopName, size_t elem_size) { // only reduce for IEC // reduce and broadcast for OEC, CVC, UVC - reduce(loopName); + reduce(loopName, + elem_size); if (!transposed || isVertexCut) { broadcast( - loopName); + loopName, elem_size); } } @@ -3017,10 +3025,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void sync_any_to_any(std::string loopName) { + inline void sync_any_to_any(std::string loopName, size_t elem_size) { // reduce and broadcast for OEC, IEC, CVC, UVC - reduce(loopName); - broadcast(loopName); + reduce(loopName, elem_size); + broadcast(loopName, + elem_size); } //////////////////////////////////////////////////////////////////////////////// @@ -3028,6 +3037,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////////// public: + template + inline void sync(std::string loopName) { + sync(loopName, 1); + } + /** * Main sync call 
exposed to the user that calls the correct sync function * based on provided template arguments. Must provide information through @@ -3043,38 +3059,38 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - inline void sync(std::string loopName) { + inline void sync(std::string loopName, size_t elem_size) { std::string timer_str("Sync_" + loopName + "_" + get_run_identifier()); galois::StatTimer Tsync(timer_str.c_str(), RNAME); Tsync.start(); if (partitionAgnostic) { - sync_any_to_any(loopName); + sync_any_to_any(loopName, elem_size); } else { if (writeLocation == writeSource) { if (readLocation == readSource) { - sync_src_to_src(loopName); + sync_src_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_src_to_dst(loopName); + sync_src_to_dst(loopName, elem_size); } else { // readAny - sync_src_to_any(loopName); + sync_src_to_any(loopName, elem_size); } } else if (writeLocation == writeDestination) { if (readLocation == readSource) { - sync_dst_to_src(loopName); + sync_dst_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_dst_to_dst(loopName); + sync_dst_to_dst(loopName, elem_size); } else { // readAny - sync_dst_to_any(loopName); + sync_dst_to_any(loopName, elem_size); } } else { // writeAny if (readLocation == readSource) { - sync_any_to_src(loopName); + sync_any_to_src(loopName, elem_size); } else if (readLocation == readDestination) { - sync_any_to_dst(loopName); + sync_any_to_dst(loopName, elem_size); } else { // readAny - sync_any_to_any(loopName); + sync_any_to_any(loopName, elem_size); } } } @@ -3153,13 +3169,20 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ static inline void call(GluonSubstrate* substrate, galois::runtime::FieldFlags& fieldFlags, - std::string loopName, const BITVECTOR_STATUS&) { + std::string loopName, const BITVECTOR_STATUS& b) { + call(substrate, fieldFlags, loopName, b, 1); + } + + static inline void call(GluonSubstrate* substrate, + galois::runtime::FieldFlags& fieldFlags, + std::string loopName, const BITVECTOR_STATUS&, + size_t elem_size) { if (fieldFlags.src_to_dst() && fieldFlags.dst_to_dst()) { - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_dst(loopName, elem_size); } else if (fieldFlags.src_to_dst()) { - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_dst(loopName, elem_size); } else if (fieldFlags.dst_to_dst()) { - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_dst(loopName, elem_size); } fieldFlags.clear_read_dst(); @@ -3189,6 +3212,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::FieldFlags& fieldFlags, std::string loopName, const BITVECTOR_STATUS& bvFlag) { + call(substrate, fieldFlags, loopName, bvFlag, 1); + } + + static inline void call(GluonSubstrate* substrate, + galois::runtime::FieldFlags& fieldFlags, + std::string loopName, + const BITVECTOR_STATUS& bvFlag, size_t elem_size) { bool src_write = fieldFlags.src_to_src() || fieldFlags.src_to_dst(); bool dst_write = fieldFlags.dst_to_src() || fieldFlags.dst_to_dst(); @@ -3201,42 +3231,56 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (src_write) { if (fieldFlags.src_to_src() && fieldFlags.src_to_dst()) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_src_to_any(loopName); + substrate->sync_src_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { // src invalid bitset; sync individually so it can be called // without bitset - 
substrate->sync_src_to_dst(loopName); - substrate->sync_src_to_src(loopName); + substrate->sync_src_to_dst(loopName, + elem_size); + substrate->sync_src_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { // dst invalid bitset; sync individually so it can be called // without bitset - substrate->sync_src_to_src(loopName); - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_src(loopName, + elem_size); + substrate->sync_src_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (fieldFlags.src_to_src()) { - substrate->sync_src_to_src(loopName); + substrate->sync_src_to_src(loopName, + elem_size); } else { // src to dst is set - substrate->sync_src_to_dst(loopName); + substrate->sync_src_to_dst(loopName, + elem_size); } } else if (dst_write) { if (fieldFlags.dst_to_src() && fieldFlags.dst_to_dst()) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_dst_to_any(loopName); + substrate->sync_dst_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { - substrate->sync_dst_to_dst(loopName); - substrate->sync_dst_to_src(loopName); + substrate->sync_dst_to_dst(loopName, + elem_size); + substrate->sync_dst_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { - substrate->sync_dst_to_src(loopName); - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_src(loopName, + elem_size); + substrate->sync_dst_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (fieldFlags.dst_to_src()) { - substrate->sync_dst_to_src(loopName); + substrate->sync_dst_to_src(loopName, + elem_size); } else { // dst to dst is set - substrate->sync_dst_to_dst(loopName); + substrate->sync_dst_to_dst(loopName, + elem_size); } } @@ -3252,20 +3296,25 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (src_read && dst_read) { if (bvFlag == BITVECTOR_STATUS::NONE_INVALID) { - substrate->sync_any_to_any(loopName); + substrate->sync_any_to_any(loopName, + elem_size); } else if (galois::runtime::src_invalid(bvFlag)) { - substrate->sync_any_to_dst(loopName); - substrate->sync_any_to_src(loopName); + substrate->sync_any_to_dst(loopName, + elem_size); + substrate->sync_any_to_src(loopName, + elem_size); } else if (galois::runtime::dst_invalid(bvFlag)) { - substrate->sync_any_to_src(loopName); - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_src(loopName, + elem_size); + substrate->sync_any_to_dst(loopName, + elem_size); } else { GALOIS_DIE("invalid bitvector flag setting in syncOnDemand"); } } else if (src_read) { - substrate->sync_any_to_src(loopName); + substrate->sync_any_to_src(loopName, elem_size); } else { // dst_read - substrate->sync_any_to_dst(loopName); + substrate->sync_any_to_dst(loopName, elem_size); } } @@ -3395,6 +3444,63 @@ class GluonSubstrate : public galois::runtime::GlobalObject { userGraph.deallocate(); } } + + void getPartitionedGraphInfo(PartitionedGraphInfo& g_info) { + getPartitionedGraphInfo(g_info, true); + } + + void getPartitionedGraphInfo(PartitionedGraphInfo& g_info, + bool deallocate_graph) { + g_info.numOwned = userGraph.numMasters(); + // Assumption: master occurs at beginning in contiguous range + g_info.beginMaster = 0; + g_info.numNodesWithEdges = userGraph.getNumNodesWithEdges(); + g_info.id = id; + g_info.numHosts = numHosts; + + // copy memoization meta-data + g_info.num_master_nodes = + (unsigned int*)calloc(masterNodes.size(), 
sizeof(unsigned int)); + g_info.master_nodes = + (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + + for (uint32_t h = 0; h < masterNodes.size(); ++h) { + g_info.num_master_nodes[h] = masterNodes[h].size(); + + if (masterNodes[h].size() > 0) { + g_info.master_nodes[h] = + (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + ; + std::copy(masterNodes[h].begin(), masterNodes[h].end(), + g_info.master_nodes[h]); + } else { + g_info.master_nodes[h] = NULL; + } + } + + g_info.num_mirror_nodes = + (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + g_info.mirror_nodes = + (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); + for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { + g_info.num_mirror_nodes[h] = mirrorNodes[h].size(); + + if (mirrorNodes[h].size() > 0) { + g_info.mirror_nodes[h] = + (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); + std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + g_info.mirror_nodes[h]); + } else { + g_info.mirror_nodes[h] = NULL; + } + } + + // user needs to provide method of freeing up graph (it can do nothing + // if they wish) + if (deallocate_graph) { + userGraph.deallocate(); + } + } #endif // het galois def //////////////////////////////////////////////////////////////////////////////// diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index b5a2b65d5c..44264461cd 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1981,4 +1981,103 @@ class FieldFlags { } \ } +#ifdef GALOIS_ENABLE_GPU +#define GALOIS_SYNC_STRUCTURE_GNN_LAYER(fieldname, cuda_ctx_for_sync, \ + gnn_matrix_to_sync_column_length_, \ + layer_number_to_sync) \ + struct GNNSumAggregate_##fieldname { \ + using ValTy = GNNFloat; \ + \ + static ValTy extract(uint32_t, char&) { return 0.f; } \ + \ + static bool reduce(uint32_t, char&, ValTy) { return false; } \ + \ + static void reset(uint32_t, char&) {} \ + \ + static void setVal(uint32_t, char&, ValTy) {} \ + \ + static bool extract_batch(unsigned from_id, uint8_t* buf, \ + size_t* buf_size, DataCommMode* mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, buf_size, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_batch(unsigned from_id, uint8_t* buf) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool reduce_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_aggregate_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool reduce_mirror_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_aggregate_mirror_node_##fieldname##_matrix_cuda( \ + 
cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool setVal_batch(unsigned from_id, uint8_t* buf, \ + DataCommMode mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_set_mirror_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_reset_batch(unsigned from_id, uint8_t* buf, \ + size_t* buf_size, DataCommMode* mode) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_reset_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, buf_size, mode, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + \ + static bool extract_reset_batch(unsigned from_id, uint8_t* buf) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ + batch_get_reset_node_##fieldname##_matrix_cuda( \ + cuda_ctx_for_sync, from_id, buf, \ + gnn_matrix_to_sync_column_length_, layer_number_to_sync); \ + return true; \ + } \ + assert(device_personality == DevicePersonality::CPU); \ + return false; \ + } \ + }; +#endif + #endif // header guard diff --git a/libgluon/include/galois/runtime/cuda/DeviceSync.h b/libgluon/include/galois/runtime/cuda/DeviceSync.h index db23350c4a..a9512b1cc1 100644 --- a/libgluon/include/galois/runtime/cuda/DeviceSync.h +++ b/libgluon/include/galois/runtime/cuda/DeviceSync.h @@ -52,7 +52,8 @@ void kernel_sizing(dim3& blocks, dim3& threads) { } template -__global__ void batch_get_subset(index_type subset_size, +__global__ void batch_get_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, DataType* __restrict__ subset, const DataType* __restrict__ array) { @@ -61,12 +62,15 @@ __global__ void batch_get_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - subset[src] = array[index]; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + } } } template -__global__ void batch_get_subset(index_type subset_size, +__global__ void batch_get_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, DataType* __restrict__ subset, @@ -76,45 +80,52 @@ __global__ void batch_get_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - subset[src] = array[index]; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + } } } template -__global__ void batch_get_reset_subset(index_type subset_size, - const unsigned int* __restrict__ indices, - DataType* __restrict__ subset, - DataType* __restrict__ array, - DataType reset_value) { +__global__ void +batch_get_reset_subset(const index_type subset_size, const index_type elem_size, + const unsigned int* __restrict__ indices, + DataType* __restrict__ subset, + DataType* __restrict__ array, DataType reset_value) { 
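  // Each node entry now spans elem_size consecutive values: entry eid of node
  // `index` lives at array[index * elem_size + eid] (e.g. with elem_size == 4,
  // node 7 occupies array[28..31]). The loop below copies that whole row into
  // the outgoing subset buffer and then overwrites it with reset_value.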
unsigned tid = TID_1D; unsigned nthreads = TOTAL_THREADS_1D; index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - subset[src] = array[index]; - array[index] = reset_value; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + array[index * elem_size + eid] = reset_value; + } } } template -__global__ void batch_get_reset_subset(index_type subset_size, - const unsigned int* __restrict__ indices, - const OffsetIteratorType offsets, - DataType* __restrict__ subset, - DataType* __restrict__ array, - DataType reset_value) { +__global__ void +batch_get_reset_subset(const index_type subset_size, const index_type elem_size, + const unsigned int* __restrict__ indices, + const OffsetIteratorType offsets, + DataType* __restrict__ subset, + DataType* __restrict__ array, DataType reset_value) { unsigned tid = TID_1D; unsigned nthreads = TOTAL_THREADS_1D; index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - subset[src] = array[index]; - array[index] = reset_value; + for (index_type eid = 0; eid < elem_size; eid++) { + subset[src * elem_size + eid] = array[index * elem_size + eid]; + array[index * elem_size + eid] = reset_value; + } } } template -__global__ void batch_set_subset(index_type subset_size, +__global__ void batch_set_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -124,7 +135,10 @@ __global__ void batch_set_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - array[index] = subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -132,7 +146,8 @@ __global__ void batch_set_subset(index_type subset_size, } template -__global__ void batch_set_subset(index_type subset_size, +__global__ void batch_set_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -143,7 +158,10 @@ __global__ void batch_set_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - array[index] = subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -151,7 +169,8 @@ __global__ void batch_set_subset(index_type subset_size, } template -__global__ void batch_add_subset(index_type subset_size, +__global__ void batch_add_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -161,7 +180,10 @@ __global__ void batch_add_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - array[index] += subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] += subset[src * 
elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -169,7 +191,8 @@ __global__ void batch_add_subset(index_type subset_size, } template -__global__ void batch_add_subset(index_type subset_size, +__global__ void batch_add_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -180,7 +203,10 @@ __global__ void batch_add_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - array[index] += subset[src]; + for (index_type eid = 0; eid < elem_size; eid++) { + array[index * elem_size + eid] += subset[src * elem_size + eid]; + } + if (sharedType != sharedMirror) { is_array_updated->set(index); } @@ -188,7 +214,8 @@ __global__ void batch_add_subset(index_type subset_size, } template -__global__ void batch_min_subset(index_type subset_size, +__global__ void batch_min_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const DataType* __restrict__ subset, DataType* __restrict__ array, @@ -198,17 +225,20 @@ __global__ void batch_min_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[src]; - if (array[index] > subset[src]) { - array[index] = subset[src]; - if (sharedType != sharedMirror) { - is_array_updated->set(index); + for (index_type eid = 0; eid < elem_size; eid++) { + if (array[index * elem_size + eid] > subset[src * elem_size + eid]) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + if (sharedType != sharedMirror) { + is_array_updated->set(index); + } } } } } template -__global__ void batch_min_subset(index_type subset_size, +__global__ void batch_min_subset(const index_type subset_size, + const index_type elem_size, const unsigned int* __restrict__ indices, const OffsetIteratorType offsets, const DataType* __restrict__ subset, @@ -219,10 +249,12 @@ __global__ void batch_min_subset(index_type subset_size, index_type src_end = subset_size; for (index_type src = 0 + tid; src < src_end; src += nthreads) { unsigned index = indices[offsets[src]]; - if (array[index] > subset[src]) { - array[index] = subset[src]; - if (sharedType != sharedMirror) { - is_array_updated->set(index); + for (index_type eid = 0; eid < elem_size; eid++) { + if (array[index * elem_size + eid] > subset[src * elem_size + eid]) { + array[index * elem_size + eid] = subset[src * elem_size + eid]; + if (sharedType != sharedMirror) { + is_array_updated->set(index); + } } } } @@ -437,6 +469,15 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, unsigned from_id, uint8_t* send_buffer, DataType i = 0) { + batch_get_shared_field(ctx, field, from_id, + send_buffer, 1, i); +} + +template +void batch_get_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* send_buffer, + size_t elem_size, DataType i = 0) { struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { shared = &ctx->master; @@ -454,12 +495,12 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, size_t v_size = shared->num_nodes[from_id]; if (reset) { batch_get_reset_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), i); + v_size, 
elem_size, shared->nodes[from_id].device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), - field->data.gpu_rd_ptr()); + v_size, elem_size, shared->nodes[from_id].device_ptr(), + shared_data->device_ptr(), field->data.gpu_rd_ptr()); } check_cuda_kernel; // timer1.stop(); @@ -468,7 +509,9 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, memcpy(send_buffer, &data_mode, sizeof(data_mode)); memcpy(send_buffer + sizeof(data_mode), &v_size, sizeof(v_size)); shared_data->copy_to_cpu( - (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), v_size); + (DataType*)(send_buffer + sizeof(data_mode) + sizeof(v_size)), + v_size * elem_size); + // timer2.stop(); // timer.stop(); // fprintf(stderr, "Get %u->%u: Time (ms): %llu + %llu = %llu\n", @@ -480,7 +523,8 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, template void serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, size_t bit_set_count, size_t num_shared, - DeviceOnly* shared_data, uint8_t* send_buffer) { + DeviceOnly* shared_data, uint8_t* send_buffer, + size_t elem_size) { if (data_mode == noData) { // do nothing return; @@ -520,7 +564,8 @@ void serializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, // serialize data vector memcpy(send_buffer + offset, &bit_set_count, sizeof(bit_set_count)); offset += sizeof(bit_set_count); - shared_data->copy_to_cpu((DataType*)(send_buffer + offset), bit_set_count); + shared_data->copy_to_cpu((DataType*)(send_buffer + offset), + (elem_size * bit_set_count)); // offset += bit_set_count * sizeof(DataType); } @@ -530,6 +575,16 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, unsigned from_id, uint8_t* send_buffer, size_t* v_size, DataCommMode* data_mode, DataType i = 0) { + batch_get_shared_field( + ctx, field, from_id, send_buffer, v_size, data_mode, 1, i); +} + +template +void batch_get_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* send_buffer, + size_t* v_size, DataCommMode* data_mode, + size_t elem_size, DataType i = 0) { struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { shared = &ctx->master; @@ -541,69 +596,53 @@ void batch_get_shared_field(struct CUDA_Context_Common* ctx, dim3 threads; kernel_sizing(blocks, threads); - // ggc::Timer timer("timer"), timer1("timer1"), timer2("timer2"), - // timer3("timer3"), timer4("timer 4"); timer.start(); if (enforcedDataMode != onlyData) { - // timer1.start(); ctx->is_updated.cpu_rd_ptr()->resize(shared->num_nodes[from_id]); ctx->is_updated.cpu_rd_ptr()->reset(); + //! 
check updated entries and update bitset batch_get_subset_bitset<<>>( shared->num_nodes[from_id], shared->nodes[from_id].device_ptr(), ctx->is_updated.gpu_rd_ptr(), field->is_updated.gpu_rd_ptr()); check_cuda_kernel; - // timer1.stop(); - // timer2.start(); get_offsets_from_bitset(shared->num_nodes[from_id], ctx->offsets.device_ptr(), ctx->is_updated.gpu_rd_ptr(), v_size); - // timer2.stop(); } *data_mode = get_data_mode(*v_size, shared->num_nodes[from_id]); - // timer3.start(); if ((*data_mode) == onlyData) { *v_size = shared->num_nodes[from_id]; if (reset) { batch_get_reset_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_rd_ptr()); } } else { // bitsetData || offsetsData if (reset) { batch_get_reset_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), i); } else { batch_get_subset<<>>( - *v_size, shared->nodes[from_id].device_ptr(), + *v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_rd_ptr()); } } check_cuda_kernel; - // timer3.stop(); - // timer4.start(); serializeMessage(ctx, *data_mode, *v_size, shared->num_nodes[from_id], - shared_data, send_buffer); - // timer4.stop(); - // timer.stop(); - // fprintf(stderr, "Get %u->%u: %d mode %u bitset %u indices. Time (ms): %llu - // + %llu + %llu + %llu = %llu\n", - // ctx->id, from_id, *data_mode, - // ctx->is_updated.cpu_rd_ptr()->alloc_size(), sizeof(unsigned int) * - // (*v_size), timer1.duration_ms(), timer2.duration_ms(), - // timer3.duration_ms(), timer4.duration_ms(), timer.duration_ms()); + shared_data, send_buffer, elem_size); } template void deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, size_t& bit_set_count, size_t num_shared, - DeviceOnly* shared_data, - uint8_t* recv_buffer) { + DeviceOnly* shared_data, uint8_t* recv_buffer, + size_t elem_size) { size_t offset = 0; // data_mode is already deserialized if (data_mode != onlyData) { @@ -640,8 +679,8 @@ void deserializeMessage(struct CUDA_Context_Common* ctx, DataCommMode data_mode, // deserialize data vector offset += sizeof(bit_set_count); - shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), bit_set_count); - // offset += bit_set_count * sizeof(DataType); + shared_data->copy_to_gpu((DataType*)(recv_buffer + offset), + bit_set_count * elem_size); } template @@ -649,6 +688,15 @@ void batch_set_shared_field(struct CUDA_Context_Common* ctx, struct CUDA_Context_Field* field, unsigned from_id, uint8_t* recv_buffer, DataCommMode data_mode) { + batch_set_shared_field(ctx, field, from_id, + recv_buffer, data_mode, 1); +} + +template +void batch_set_shared_field(struct CUDA_Context_Common* ctx, + struct CUDA_Context_Field* field, + unsigned from_id, uint8_t* recv_buffer, + DataCommMode data_mode, size_t elem_size) { assert(data_mode != noData); struct CUDA_Context_Shared* shared; if (sharedType == sharedMaster) { @@ -666,54 +714,57 @@ void batch_set_shared_field(struct CUDA_Context_Common* ctx, // timer.start(); // timer1.start(); deserializeMessage(ctx, data_mode, v_size, shared->num_nodes[from_id], - shared_data, recv_buffer); + 
shared_data, recv_buffer, elem_size); // timer1.stop(); // timer2.start(); if (data_mode == onlyData) { if (op == setOp) { batch_set_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } } else if (data_mode == gidsData) { if (op == setOp) { batch_set_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, ctx->offsets.device_ptr(), shared_data->device_ptr(), - field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); + v_size, elem_size, ctx->offsets.device_ptr(), + shared_data->device_ptr(), field->data.gpu_wr_ptr(), + field->is_updated.gpu_wr_ptr()); } } else { // bitsetData || offsetsData if (op == setOp) { batch_set_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == addOp) { batch_add_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } else if (op == minOp) { batch_min_subset<<>>( - v_size, shared->nodes[from_id].device_ptr(), + v_size, elem_size, shared->nodes[from_id].device_ptr(), ctx->offsets.device_ptr(), shared_data->device_ptr(), field->data.gpu_wr_ptr(), field->is_updated.gpu_wr_ptr()); } diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 320189c44e..e0d90216e2 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -14,6 +14,7 @@ set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) + target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn ${INTEL_LIBS}) @@ -38,6 +39,8 @@ if (GALOIS_ENABLE_GPU) src/layers/SoftmaxLayer.cu src/GraphNeuralNetwork.cu src/GNNOptimizers.cu + src/GNNCudaContext.cu + src/CUDAUtil.cu ) add_library(galois_gnn_gpu STATIC ${gpusources}) target_compile_definitions(galois_gnn_gpu PRIVATE _FORCE_INLINES) @@ -49,7 +52,7 @@ if (GALOIS_ENABLE_GPU) ) # link to gpu lib (which takes care of moderngpu and cub) as well as cu libs - target_link_libraries(galois_gnn_gpu Galois::gpu galois_support -lcublas 
-lcurand) + target_link_libraries(galois_gnn_gpu galois_gluon Galois::gpu galois_support -lcublas -lcurand) # gpu -> cpu lib target_link_libraries(galois_gnn galois_gnn_gpu) diff --git a/libgnn/include/galois/CUDAUtilHostDecls.h b/libgnn/include/galois/CUDAUtilHostDecls.h new file mode 100644 index 0000000000..d9fe5230a5 --- /dev/null +++ b/libgnn/include/galois/CUDAUtilHostDecls.h @@ -0,0 +1,3 @@ +#pragma once + +void SetCUDADeviceId(int gpu_id); diff --git a/libgnn/include/galois/GNNCudaContextHostDecls.h b/libgnn/include/galois/GNNCudaContextHostDecls.h new file mode 100644 index 0000000000..fea68d5fec --- /dev/null +++ b/libgnn/include/galois/GNNCudaContextHostDecls.h @@ -0,0 +1,82 @@ +#pragma once + +#include "galois/cuda/HostDecls.h" + +extern int gpudevice; + +void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g, + unsigned num_hosts); +void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers); +void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, + unsigned layer_number, unsigned num_hosts, + unsigned nnodes, size_t infl_in_size, + size_t infl_out_size); + +namespace galois { +void batch_get_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_aggregate_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_aggregate_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_set_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number); +void batch_set_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_get_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number); +void batch_aggregate_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_aggregate_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_set_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number); +void batch_set_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* 
buf, DataCommMode mode, + size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number); +void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number); + +void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, + size_t column_size, size_t num_nodes, + unsigned layer_number); +size_t getLayerInputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number); +size_t getLayerOutputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number); +} // namespace galois diff --git a/libgnn/include/galois/GNNOptimizers.h b/libgnn/include/galois/GNNOptimizers.h index 86a656fd30..8a171f96da 100644 --- a/libgnn/include/galois/GNNOptimizers.h +++ b/libgnn/include/galois/GNNOptimizers.h @@ -54,24 +54,28 @@ class AdamOptimizer : public BaseOptimizer { beta2_power_t_(num_trainable_layers_, config.beta2) { // >= because only prefix will be considered otherwise assert(trainable_layer_sizes.size() >= num_trainable_layers_); -#ifndef GALOIS_ENABLE_GPU - // allocate vectors based on # of trainable layers - for (size_t i = 0; i < num_trainable_layers_; i++) { - first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); - second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); - // Pointer with size construction - p_first_moments_.emplace_back(first_moments_.back()); - p_second_moments_.emplace_back(second_moments_.back()); - } - assert(first_moments_.size() == num_trainable_layers_); - assert(second_moments_.size() == num_trainable_layers_); -#else - // pointer with size initialization with GPU pointers - for (size_t i = 0; i < num_trainable_layers_; i++) { - p_first_moments_.emplace_back(gpu_object_.first_moment(i), - trainable_layer_sizes[i]); - p_second_moments_.emplace_back(gpu_object_.second_moment(i), - trainable_layer_sizes[i]); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // pointer with size initialization with GPU pointers + for (size_t i = 0; i < num_trainable_layers_; i++) { + p_first_moments_.emplace_back(gpu_object_.first_moment(i), + trainable_layer_sizes[i]); + p_second_moments_.emplace_back(gpu_object_.second_moment(i), + trainable_layer_sizes[i]); + } + } else { +#endif + // allocate vectors based on # of trainable layers + for (size_t i = 0; i < num_trainable_layers_; i++) { + first_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + second_moments_.emplace_back(trainable_layer_sizes[i], 0.0); + // Pointer with size construction + p_first_moments_.emplace_back(first_moments_.back()); + p_second_moments_.emplace_back(second_moments_.back()); + } + assert(first_moments_.size() == num_trainable_layers_); + assert(second_moments_.size() == num_trainable_layers_); +#ifdef GALOIS_ENABLE_GPU } #endif } diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 40f19da7b0..3603cb68d7 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -6,6 +6,11 @@ #include #include +#ifdef GALOIS_ENABLE_GPU +enum class DevicePersonality { CPU, GPU_CUDA }; +extern DevicePersonality device_personality; +#endif + namespace galois { //! Floating point type to use throughout GNN compute; typedef'd so it's easier //! 
to flip later diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2ed6647b7c..7b55b84162 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -119,13 +119,14 @@ class GNNGraph { //! Return matrix of the local node features const PointerWithSize GetLocalFeatures() { -#ifndef GALOIS_ENABLE_GPU - return PointerWithSize(local_node_features_); -#else - // TODO remove reliance on local_node_features - return PointerWithSize(gpu_memory_.feature_vector(), - local_node_features_.size()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // TODO remove reliance on local_node_features + return PointerWithSize(gpu_memory_.feature_vector(), + local_node_features_.size()); + } #endif + return PointerWithSize(local_node_features_); } //! Given an LID and the current phase of GNN computation, determine if the @@ -178,7 +179,23 @@ class GNNGraph { void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); #ifdef GALOIS_ENABLE_GPU + void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, + const unsigned layer_number) const; + + void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, + size_t infl_in_size, size_t infl_out_size); + + void ResizeLayerVector(size_t num_layers); + const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } + + void GetMarshalGraph(MarshalGraph& m) const { + sync_substrate_->getMarshalGraph(m, false); + } + + void GetPartitionedGraphInfo(PartitionedGraphInfo& g_info) const { + sync_substrate_->getPartitionedGraphInfo(g_info); + } #endif private: @@ -277,6 +294,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU + struct CUDA_Context* cuda_ctx_; //! Object that holds all GPU allocated pointers to memory related to graphs. GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 75a18fd830..62a5ab14cb 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -2,18 +2,28 @@ // gets synchronized #include "galois/GNNTypes.h" #include "galois/BufferWrapper.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/GNNCudaContextHostDecls.h" +#endif namespace galois { namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; +#ifdef GALOIS_ENABLE_GPU +extern struct CUDA_Context* cuda_ctx_for_sync; +extern unsigned layer_number_to_sync; +#endif struct GNNSumAggregate { using ValTy = galois::BufferWrapper; //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. 
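  // extract() hands Gluon a zero-copy BufferWrapper view of one row of the
  // matrix registered for synchronization: node_id's row starts at
  // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_] and holds
  // gnn_matrix_to_sync_column_length_ floats (e.g. with 16 columns, node 3's
  // row is elements 48..63).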
+ // assert(device_personality == DevicePersonality::CPU); ValTy extracted_vec( &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], gnn_matrix_to_sync_column_length_); @@ -51,16 +61,24 @@ struct GNNSumAggregate { return false; } static bool extract_batch(unsigned, uint8_t*) { return false; } - static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { - return false; - } - static bool extract_reset_batch(unsigned, uint8_t*) { return false; } static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { return false; } static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +#ifdef GALOIS_ENABLE_GPU +GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, + gnn_matrix_to_sync_column_length_, + layer_number_to_sync); +GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, + gnn_matrix_to_sync_column_length_, + layer_number_to_sync); +#endif } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 6ec6a78671..c4cc29290f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -88,7 +88,9 @@ class GNNLayer { layer_weights_.assign(layer_weights_.size(), 1); } #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } diff --git a/libgnn/src/CUDAUtil.cu b/libgnn/src/CUDAUtil.cu new file mode 100644 index 0000000000..7d39a81ff2 --- /dev/null +++ b/libgnn/src/CUDAUtil.cu @@ -0,0 +1,9 @@ +#include + +#include "galois/CUDAUtilHostDecls.h" +#include "galois/GNNTypes.h" + +DevicePersonality device_personality; +int gpudevice; + +void SetCUDADeviceId(int gpu_id) { cudaSetDevice(gpu_id); } diff --git a/libgnn/src/GNNCudaContext.cu b/libgnn/src/GNNCudaContext.cu new file mode 100644 index 0000000000..d0512f8e72 --- /dev/null +++ b/libgnn/src/GNNCudaContext.cu @@ -0,0 +1,228 @@ +#include +#include "gg.h" +#include "ggcuda.h" +#include "galois/cuda/Context.h" +#include "galois/GNNTypes.h" +#include "galois/runtime/cuda/DeviceSync.h" +#include "galois/GNNCudaContextHostDecls.h" + +// The forward declaration is in the original Context.h file; as long as +// pointers to it are used it shouldn't be an issue (since space usage is +// unknown at that point) +struct CUDA_Context : public CUDA_Context_Common { + // TODO to arrays: each context handles all layers of the graph + // Possible to add a "layer" argument to the below functions? + std::vector> layer_input_matrix; + std::vector> layer_output_matrix; + std::vector layer_input_matrix_column_size; + std::vector layer_output_matrix_column_size; +}; + +//! Allocates a new CUDA context +//! 
Note: caller is responsible for freeing it +struct CUDA_Context* get_CUDA_context(int id) { + struct CUDA_Context* ctx = + (struct CUDA_Context*)calloc(1, sizeof(struct CUDA_Context)); + ctx->id = id; + return ctx; +} + +bool init_CUDA_context(struct CUDA_Context* ctx, int device) { + return init_CUDA_context_common(ctx, device); +} + +void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers) { + ctx->layer_output_matrix.resize(num_layers); + ctx->layer_output_matrix_column_size.resize(num_layers); + ctx->layer_input_matrix.resize(num_layers); + ctx->layer_input_matrix_column_size.resize(num_layers); +} + +void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g_info, + unsigned num_hosts) { + size_t mem_usage = mem_usage_CUDA_common(g_info, num_hosts); + printf("[%d] Host memory for communication context: (%3u B) %3u MB\n", + ctx->id, mem_usage, mem_usage / 1048756); + + // TODO This is expensive; is it required? Can we get away with less? + // should only need one copy of mirror/masters for entire execution, + // not per layer + // graph does not need to be copied either since that's handled elsewhere + // (gpu object on GNNGraph) + load_graph_CUDA_common(ctx, g_info, num_hosts); +} + +void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, + unsigned layer_number, unsigned num_hosts, + unsigned nnodes, size_t infl_in_size, + size_t infl_out_size) { + ctx->layer_input_matrix_column_size[layer_number] = infl_in_size; + load_graph_CUDA_field_inflating(ctx, &ctx->layer_input_matrix[layer_number], + num_hosts, nnodes, infl_in_size, false); + ctx->layer_output_matrix_column_size[layer_number] = infl_out_size; + load_graph_CUDA_field_inflating(ctx, &ctx->layer_output_matrix[layer_number], + num_hosts, nnodes, infl_out_size, false); +} + +////////// layer_input_matrix (forward) synchronization function /////////////// + +namespace galois { +void batch_get_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, + column_size); +} + +void batch_get_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); +} + +void batch_aggregate_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_aggregate_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_mirror_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + 
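  // Same call shape as batch_set_node_layer_input_matrix_cuda above; the
  // difference lives in the (elided) template arguments of
  // batch_set_shared_field, which choose whether master or mirror copies of
  // this layer's input matrix receive the column_size values per node.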
batch_set_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_get_reset_node_layer_input_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, + column_size); +} + +void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); +} + +////////// layer_output_matrix (backward) synchronization function ///////////// + +void batch_get_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, + mode, column_size); +} + +void batch_get_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); +} + +void batch_aggregate_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_aggregate_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, uint8_t* buf, + DataCommMode mode, + size_t column_size, + unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_set_mirror_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, DataCommMode mode, + size_t column_size, unsigned layer_number) { + batch_set_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, mode, + column_size); +} + +void batch_get_reset_node_layer_output_matrix_cuda( + struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, + DataCommMode* mode, size_t column_size, unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, + mode, column_size); +} + +void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, + unsigned from_id, + uint8_t* buf, + size_t column_size, + unsigned layer_number) { + batch_get_shared_field( + ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); +} + +void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, + size_t column_size, size_t num_nodes, + unsigned layer_number) { + if (ctx->layer_input_matrix_column_size[layer_number] == column_size) { + ctx->layer_input_matrix[layer_number].data.set_data( + layer_matrix, column_size * num_nodes); + } else if (ctx->layer_output_matrix_column_size[layer_number] == + column_size) { + 
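    // column_size matched this layer's output width instead, so point the
    // output-matrix view at the caller's buffer (num_nodes rows of column_size
    // values). Note the dispatch assumes a layer's input and output widths
    // differ; when they are equal the input branch above always wins.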
ctx->layer_output_matrix[layer_number].data.set_data( + layer_matrix, column_size * num_nodes); + } +} + +size_t getLayerInputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number) { + return ctx->layer_input_matrix_column_size[layer_number]; +} + +size_t getLayerOutputMatrixColumnSize(struct CUDA_Context* ctx, + unsigned layer_number) { + return ctx->layer_output_matrix_column_size[layer_number]; +} +} // namespace galois diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 566b61c14e..664de35e01 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -14,33 +14,38 @@ void galois::AdamOptimizer::GradientDescent( assert(derivatives.size() == first_moment.size()); assert(derivatives.size() == second_moment.size()); -#ifndef GALOIS_ENABLE_GPU - // individual weight updates via gradients - galois::do_all( - galois::iterate(static_cast(0), matrix.size()), - [&](size_t i) { - // moment estimate updates - first_moment[i] = config_.beta1 * first_moment[i] + - (1.0 - config_.beta1) * derivatives[i]; - second_moment[i] = - config_.beta2 * second_moment[i] + - (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); - // bias corrected moments using beta power - GNNFloat bias_correct_first = - first_moment[i] / (1.0 - beta1_power_t_[layer_number]); - GNNFloat bias_correct_second = - second_moment[i] / (1.0 - beta2_power_t_[layer_number]); - // weight update using bias corrected moments - (matrix.data())[i] -= config_.alpha * bias_correct_first / - std::sqrt(bias_correct_second + config_.epsilon); - }, - galois::loopname("AdamOptimizerGradientDescent")); -#else - gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), - first_moment.data(), second_moment.data(), - config_.alpha, config_.beta1, config_.beta2, - config_.epsilon, beta1_power_t_[layer_number], - beta2_power_t_[layer_number]); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AdamUpdate(derivatives.data(), matrix.data(), matrix.size(), + first_moment.data(), second_moment.data(), + config_.alpha, config_.beta1, config_.beta2, + config_.epsilon, beta1_power_t_[layer_number], + beta2_power_t_[layer_number]); + } else { +#endif + // individual weight updates via gradients + galois::do_all( + galois::iterate(static_cast(0), matrix.size()), + [&](size_t i) { + // moment estimate updates + first_moment[i] = config_.beta1 * first_moment[i] + + (1.0 - config_.beta1) * derivatives[i]; + second_moment[i] = + config_.beta2 * second_moment[i] + + (1.0 - config_.beta2) * (derivatives[i] * derivatives[i]); + // bias corrected moments using beta power + GNNFloat bias_correct_first = + first_moment[i] / (1.0 - beta1_power_t_[layer_number]); + GNNFloat bias_correct_second = + second_moment[i] / (1.0 - beta2_power_t_[layer_number]); + // weight update using bias corrected moments + (matrix.data())[i] -= + config_.alpha * bias_correct_first / + std::sqrt(bias_correct_second + config_.epsilon); + }, + galois::loopname("AdamOptimizerGradientDescent")); +#ifdef GALOIS_ENABLE_GPU + } #endif // update the power terms for next update call diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index be188ff843..5eac909e18 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -14,6 +14,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // this will be the # of rows for each layer size_t max_rows = graph_->size(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == 
DevicePersonality::GPU_CUDA) { + graph_->ResizeLayerVector(config_.num_intermediate_layers()); + } +#endif // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { GNNLayerType layer_type = config_.intermediate_layer_type(i); @@ -36,6 +41,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif if (i == config_.num_intermediate_layers() - 1) { // last layer before output layer should never have activation gnn_layers_.back()->DisableActivation(); @@ -86,11 +98,16 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - // if (config_.do_sampling()) { - // for (std::unique_ptr& ptr : gnn_layers_) { - // assert(ptr->IsSampledLayer()); - // } - // } + std::vector cpu_pred; + float train_accuracy{0.f}; + + /* + if (config_.do_sampling()) { + for (std::unique_ptr& ptr : gnn_layers_) { + assert(ptr->IsSampledLayer()); + } + } + */ if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); @@ -105,7 +122,22 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } const PointerWithSize predictions = DoInference(); GradientPropagation(); - float train_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred.size() != predictions.size()) { + cpu_pred.resize(predictions.size()); + } + + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred, predictions); + train_accuracy = GetGlobalAccuracy(cpu_pred); + } else { +#endif + train_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + } +#endif + if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); @@ -118,7 +150,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { acc_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(predictions); + float global_accuracy{0.0}; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred, predictions); + global_accuracy = GetGlobalAccuracy(cpu_pred); + } else { +#endif + global_accuracy = GetGlobalAccuracy(predictions); +#ifdef GALOIS_ENABLE_GPU + } +#endif acc_timer.stop(); if (this_host == 0) { @@ -136,6 +179,7 @@ galois::GraphNeuralNetwork::DoInference() { for (std::unique_ptr& ptr : gnn_layers_) { layer_input = ptr->ForwardPhase(layer_input); } + return layer_input; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3e5d468da2..c102fc8283 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -35,6 +35,10 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +#ifdef GALOIS_ENABLE_GPU +struct CUDA_Context* cuda_ctx_for_sync; +unsigned layer_number_to_sync; +#endif } // namespace graphs } // namespace galois @@ -78,9 
+82,21 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, InitNormFactor(); #ifdef GALOIS_ENABLE_GPU - // allocate/copy data structures over to GPU - GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); - InitGPUMemory(); + if (device_personality == DevicePersonality::GPU_CUDA) { + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); + + // initialize CUDA context + cuda_ctx_ = get_CUDA_context(host_id_); + if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { + GALOIS_DIE("Failed to initialize CUDA context"); + } + PartitionedGraphInfo g_info; + GetPartitionedGraphInfo(g_info); + load_graph_CUDA_GNN(cuda_ctx_, g_info, + galois::runtime::getSystemNetworkInterface().Num); + } #endif } @@ -124,13 +140,44 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; // XXX bitset setting - // call sync sync_substrate_->sync( "GraphAggregateSync"); } -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.5); } +#ifdef GALOIS_ENABLE_GPU +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size, + const unsigned layer_number) const { + size_t layer_input_mtx_column_size = + getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); + size_t layer_output_mtx_column_size = + getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + cuda_ctx_for_sync = cuda_ctx_; + layer_number_to_sync = layer_number; + // XXX bitset setting + // call sync + cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), + layer_number); + + if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else if (gnn_matrix_to_sync_column_length_ == + layer_output_mtx_column_size) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" + " match to the column size of the CUDA context"); + } +} +#endif + +void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( @@ -685,4 +732,15 @@ void galois::graphs::GNNGraph::InitGPUMemory() { local_testing_mask_); gpu_memory_.SetNormFactors(norm_factors_); } + +void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( + size_t layer_number, unsigned num_hosts, size_t infl_in_size, + size_t infl_out_size) { + init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), + infl_in_size, infl_out_size); +} + +void galois::graphs::GNNGraph::ResizeLayerVector(size_t num_layers) { + resize_CUDA_layer_vector(cuda_ctx_, num_layers); +} #endif diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index aff4bc3b11..9da77a004f 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -19,9 +19,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weights_.resize(num_weight_elements); layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitWeightMemory(num_weight_elements); - base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * - layer_dimensions_.input_columns); + if (device_personality == 
DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + } #endif GlorotBengioInit(&layer_weights_); @@ -42,31 +44,35 @@ galois::GNNLayer::GNNLayer(size_t layer_num, backward_output_matrix_.resize( layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); #ifdef GALOIS_ENABLE_GPU - base_gpu_object_.InitInOutMemory(num_output_elements, - layer_dimensions_.input_rows * - layer_dimensions_.input_columns); -#endif + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); - // initialize the PointerWithSize wrappers -#ifndef GALOIS_ENABLE_GPU - p_layer_weights_ = PointerWithSize(layer_weights_); - p_layer_weight_gradients_ = - PointerWithSize(layer_weight_gradients_); - p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); - p_backward_output_matrix_ = - PointerWithSize(backward_output_matrix_); -#else - p_layer_weights_ = PointerWithSize(base_gpu_object_.layer_weights(), - layer_weights_.size()); - p_layer_weight_gradients_ = - PointerWithSize(base_gpu_object_.layer_weight_gradients(), - layer_weight_gradients_.size()); - p_forward_output_matrix_ = PointerWithSize( - base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = PointerWithSize( - base_gpu_object_.backward_output(), backward_output_matrix_.size()); - // TODO can clear the cpu side vectors/don't use .size() since optimally they - // aren't initialized + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize( + base_gpu_object_.layer_weights(), layer_weights_.size()); + p_layer_weight_gradients_ = + PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + base_gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix_.size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally + // they aren't initialized + } else { +#endif + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = + PointerWithSize(backward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -81,7 +87,9 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { (*vector_to_init)[i] = dist(rng); } #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } @@ -94,7 +102,9 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { }, galois::loopname("RandomInitVector")); #ifdef GALOIS_ENABLE_GPU - CopyLayerWeightsToGPU(); + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } #endif } @@ -128,11 +138,15 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { -#ifndef GALOIS_ENABLE_GPU - DoDropoutCPU(input_to_dropout, output_matrix); -#else - base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, - config_.dropout_rate); +#ifdef 
GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); + } else { +#endif + DoDropoutCPU(input_to_dropout, output_matrix); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -140,19 +154,23 @@ void galois::GNNLayer::DoDropoutDerivative() { assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); -#ifndef GALOIS_ENABLE_GPU - // use dropout mask to figure out derivative - galois::do_all( - galois::iterate(static_cast(0), backward_output_matrix_.size()), - [&](size_t i) { - backward_output_matrix_[i] = backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * - scale; - }, - galois::loopname("LayerDropoutDerivative")); -#else - base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), - scale); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); + } else { +#endif + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), backward_output_matrix_.size()), + [&](size_t i) { + backward_output_matrix_[i] = backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("LayerDropoutDerivative")); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 208229d6f1..07f69cee6e 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -20,22 +20,24 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU - gpu_object_.Allocate(num_input_elements, num_output_elements); + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.Allocate(num_input_elements, num_output_elements); + // init pointers with size + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + } else { #endif - - // init pointers with size -#ifndef GALOIS_ENABLE_GPU - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#else - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } #endif + GALOIS_LOG_VERBOSE("Conv layer initialized"); } @@ -121,18 +123,22 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there -#ifndef GALOIS_ENABLE_GPU - // temp 2 holds aggregated feature vectors from forward phase - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); -#else - gpu_object_.GetWeightGradientsGPU( - 
layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, p_in_temp_2_.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + } else { +#endif + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + p_in_temp_2_.data(), input_gradient->data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } #endif } else { // TODO at this point, out_temp contains memoized FW @@ -150,18 +156,21 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // TODO put this in a function // W' = F^T (FW)' -#ifndef GALOIS_ENABLE_GPU - // weight gradient; note the use of the aggregated gradient in out_temp - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); -#else - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); + } else { +#endif + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), p_out_temp_.data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -183,12 +192,17 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { -#ifndef GALOIS_ENABLE_GPU - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); -#else - gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, aggregate_output, - config_.do_normalization); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), + column_length, node_embeddings, + aggregate_output, config_.do_normalization); + graph_.AggregateSync(aggregate_output, column_length, layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -284,24 +298,27 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); - // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { -#ifndef GALOIS_ENABLE_GPU - // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, 
layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); -#else - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } #endif } @@ -309,18 +326,21 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifndef GALOIS_ENABLE_GPU - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); -#else - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index d98251091c..f541b43a18 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -48,14 +48,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { -#ifndef GALOIS_ENABLE_GPU - return ForwardPhaseCPU(input_embeddings); -#else - gpu_object_.ForwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_forward_output_matrix_.data()); - return p_forward_output_matrix_; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.ForwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + input_embeddings.data(), p_forward_output_matrix_.data()); + return p_forward_output_matrix_; + } #endif + return ForwardPhaseCPU(input_embeddings); } galois::PointerWithSize @@ -112,14 +113,15 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::PointerWithSize galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, PointerWithSize*) { -#ifndef 
GALOIS_ENABLE_GPU - return BackwardPhaseCPU(); -#else - gpu_object_.BackwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.BackwardPhaseGPU( + layer_phase_, graph_.size(), layer_dimensions_.input_columns, + p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; + } #endif + return BackwardPhaseCPU(); } // TODO function for getting loss diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index a6b711397b..9e10da1246 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -74,6 +74,9 @@ else() add_executable(gpu-epoch-test gpu-epoch-test.cpp) target_link_libraries(gpu-epoch-test galois_gnn) #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) + + add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) + target_link_libraries(gpu-aggregate-sync-test galois_gnn) endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index a1d0c1961e..ed99982a78 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -13,7 +13,7 @@ int main() { GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, num_threads); - + device_personality = DevicePersonality::GPU_CUDA; // create sample config that is easy to trace galois::AdamOptimizer::AdamConfiguration config; config.alpha = 1; diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp new file mode 100644 index 0000000000..a3f645c5ee --- /dev/null +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -0,0 +1,212 @@ +//! @file gpu-aggregate-sync-test.cpp +//! 
GPU sync test to make sure it's sane +#include "galois/Logging.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/CUDAUtilHostDecls.h" + +int main() { + galois::DistMemSys G; + + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); + exit(1); + } + device_personality = DevicePersonality::GPU_CUDA; + gpudevice = galois::runtime::getSystemNetworkInterface().ID; + SetCUDADeviceId(gpudevice); + + auto test_graph = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + galois::GNNLayerConfig l_config; + l_config.disable_aggregate_after_update = true; + + unsigned num_layers = 2; + test_graph->ResizeLayerVector(num_layers); + test_graph->InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph->InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + // create the layer, no norm factor + std::unique_ptr layer_0 = + std::make_unique(0, *(test_graph.get()), + dimension_0, l_config); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + layer_0->ForwardPhase(test_graph->GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + const std::vector& layer_0_forward_output = + layer_0->CopyForwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check output + ////////////////////////////////////////////////////////////////////////////// + + // check each row on each host: convert row into GID, and based on GID we + // know what the ground truth is + // row 0 = 3 + // row 1 = 6 + // row 2 = 12 + // row 3 = 18 + // row 4 = 24 + // row 5 = 30 + // row 6 = 15 + + // row should correspond to LID + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + } + } + + ////////////////////////////////////////////////////////////////////////////// + + std::vector dummy_ones_v(test_graph->size() * 2, 1); + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + const std::vector& layer_0_backward_output = + layer_0->CopyBackwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 
0 backward output; all 0 because layer 0 + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); + for (size_t i = 0; i < layer_0_backward_output.size(); i++) { + GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); + } + + ////////////////////////////////////////////////////////////////////////////// + // layer 1 to check backward output + ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr layer_1 = + std::make_unique(1, *(test_graph.get()), + dimension_0, l_config); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph->GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check for forward as before + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(test_graph->size() * 2, 1); + layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); + const std::vector& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph->size(); row++) { + // row -> GID + size_t global_row = test_graph->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); + } + } + + // TODO CVC +} diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index a79262d706..947a0b8703 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -13,6 +13,7 @@ int main() { GALOIS_LOG_VERBOSE("[{}] Using {} threads", galois::runtime::getSystemNetworkInterface().ID, num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load test graph galois::graphs::GNNGraph test_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); @@ -31,8 +32,16 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig dcon; - dcon.allow_aggregate_after_update = false; + dcon.disable_aggregate_after_update = false; + unsigned num_layers = 2; + test_graph.ResizeLayerVector(num_layers); + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); // create the layer, no norm factor 
std::unique_ptr layer_0 = std::make_unique(0, test_graph, diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 3a481b9d66..3ac2c2b2ed 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -11,6 +11,7 @@ int main() { 56 / galois::runtime::getSystemNetworkInterface().Num); // size_t num_threads = galois::setActiveThreads(1); GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load graph auto test_graph = std::make_unique( diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 453606e311..5d52e80e35 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -11,6 +11,7 @@ int main() { size_t num_threads = galois::setActiveThreads( 56 / galois::runtime::getSystemNetworkInterface().Num); GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + device_personality = DevicePersonality::GPU_CUDA; // load test graph galois::graphs::GNNGraph test_graph( diff --git a/libgpu/include/sharedptr.h b/libgpu/include/sharedptr.h index 9ce66de597..191812ff57 100644 --- a/libgpu/include/sharedptr.h +++ b/libgpu/include/sharedptr.h @@ -202,6 +202,17 @@ class Shared { return ptrs[0]; #endif } + + void set_data(T* src, size_t src_nmemb) { set_data(src, src_nmemb, 1); } + + void set_data(T* src, size_t src_nmemb, int device) { + if (this->nmemb == 0) { + alloc(src_nmemb); + nmemb = src_nmemb; + } + assert(this->nmemb == src_nmemb); + ptrs[device] = src; + } }; template diff --git a/lonestar/libgnnbench/CMakeLists.txt b/lonestar/libgnnbench/CMakeLists.txt index 14d152c8e7..0818a3310c 100644 --- a/lonestar/libgnnbench/CMakeLists.txt +++ b/lonestar/libgnnbench/CMakeLists.txt @@ -2,4 +2,5 @@ add_library(gnnbench STATIC src/Input.cpp src/Start.cpp) target_include_directories(gnnbench PUBLIC "${CMAKE_CURRENT_SOURCE_DIR}/include" ) + target_link_libraries(gnnbench galois_gnn LLVMSupport) diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index 784b1fd431..dc62b19d50 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -4,6 +4,10 @@ #include "galois/graphs/GNNGraph.h" #include +#ifdef GALOIS_ENABLE_GPU +extern int gpudevice; +#endif + //! Directory where all files used for GNN training are found extern llvm::cl::opt input_directory; //! Base graph name (used to find the csgr, features, masks, etc.) 
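
Note on the pattern in the hunks above: the old code selected the CPU or GPU implementation at compile time with #ifndef GALOIS_ENABLE_GPU / #else, so a GPU build dropped the CPU path entirely. After this change a GALOIS_ENABLE_GPU build keeps both paths and only takes the CUDA branch when the host's device_personality is GPU_CUDA, which is why the GPU tests above now set device_personality by hand. A standalone mock of that dispatch shape, assuming a GPU-enabled build; the stub functions and the hard-coded define are illustrative, not the library's API:

#include <cstdio>

// Mock of the runtime CPU/GPU dispatch used throughout this patch.
enum class DevicePersonality { CPU, GPU_CUDA };
static DevicePersonality device_personality = DevicePersonality::CPU;

#define GALOIS_ENABLE_GPU 1 // assumption: built with GPU support

static void ForwardCPU() { std::puts("CPU path"); } // stand-in for do_all/CBLAS code
static void ForwardGPU() { std::puts("GPU path"); } // stand-in for a CUDA kernel call

void Forward() {
#ifdef GALOIS_ENABLE_GPU
  if (device_personality == DevicePersonality::GPU_CUDA) {
    ForwardGPU();
  } else {
#endif
    ForwardCPU();
#ifdef GALOIS_ENABLE_GPU
  }
#endif
}

int main() { Forward(); } // prints "CPU path" unless the personality is switched
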
diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index c03970c868..75ec167f78 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -3,13 +3,27 @@ #include "galois/Galois.h" #include "galois/Version.h" #include "GNNBench/Input.h" +#ifdef GALOIS_ENABLE_GPU +#include "galois/CUDAUtilHostDecls.h" +#endif //////////////////////////////////////////////////////////////////////////////// // CLI //////////////////////////////////////////////////////////////////////////////// +extern llvm::cl::opt num_threads; extern llvm::cl::opt num_epochs; +#ifdef GALOIS_ENABLE_GPU +std::string personality_str(DevicePersonality p); +extern llvm::cl::opt num_nodes; +extern llvm::cl::opt personality_set; + +namespace internal { +void heteroSetup(); +}; +#endif + //////////////////////////////////////////////////////////////////////////////// // Init functions //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 1a178c583d..aa059c60f6 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -4,12 +4,36 @@ namespace cll = llvm::cl; cll::opt num_threads("t", cll::desc("Number of threads (default 1)"), cll::init(1)); -cll::opt num_runs("runs", cll::desc("Number of runs (default 1)"), - cll::init(1)); cll::opt num_epochs("epochs", cll::desc("Number of epochs (default 50)"), cll::init(50)); +#ifdef GALOIS_ENABLE_GPU +std::string personality_str(DevicePersonality p) { + switch (p) { + case DevicePersonality::CPU: + return "CPU"; + case DevicePersonality::GPU_CUDA: + return "GPU_CUDA"; + default: + GALOIS_LOG_ASSERT(false && "Invalid personality"); + break; + } + return ""; +} + +cll::opt num_nodes( + "numNodes", + cll::desc("Num of physical nodes with devices (default = num of hosts): " + "detect GPU to use for each host automatically"), + cll::init(-1)); +cll::opt personality_set( + "pset", + cll::desc("String specifying personality for hosts on each physical " + "node. 
'c'=CPU, 'g'=GPU (default 'c')"), + cll::init("c")); +#endif + cll::opt stat_file("statFile", cll::desc("Optional output file to print stats to")); @@ -65,7 +89,6 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "CommandLine", cmdout.str()); galois::runtime::reportParam("GNNBench", "Threads", num_threads); galois::runtime::reportParam("GNNBench", "Hosts", net.Num); - galois::runtime::reportParam("GNNBench", "Runs", num_runs); galois::runtime::reportParam("GNNBench", "Run_UUID", galois::runtime::getRandUUID()); galois::runtime::reportParam("GNNBench", "InputDirectory", input_directory); @@ -78,4 +101,48 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, char name[256]; gethostname(name, 256); galois::runtime::reportParam("GNNBench", "Hostname", name); + +#ifdef GALOIS_ENABLE_GPU + internal::heteroSetup(); +#endif +} + +#ifdef GALOIS_ENABLE_GPU +void internal::heteroSetup() { + const unsigned my_host_id = galois::runtime::getHostID(); + + auto& net = galois::runtime::getSystemNetworkInterface(); + + if (num_nodes == -1) { + num_nodes = net.Num; + } + + GALOIS_LOG_ASSERT((net.Num % num_nodes) == 0); + + device_personality = DevicePersonality::CPU; + if (personality_set.length() == (net.Num / num_nodes)) { + switch (personality_set.c_str()[my_host_id % (net.Num / num_nodes)]) { + case 'g': + galois::gInfo(my_host_id, " chooses GPU"); + device_personality = DevicePersonality::GPU_CUDA; + break; + case 'c': + galois::gInfo(my_host_id, " chooses CPU"); + device_personality = DevicePersonality::CPU; + break; + } + + if (device_personality == DevicePersonality::GPU_CUDA) { + gpudevice = get_gpu_device_id(personality_set, num_nodes); + } else { + gpudevice = -1; + } + + SetCUDADeviceId(gpudevice); + } else { + galois::gWarn( + "Command line option -pset ignored because its string length is not " + "equal to the number of processes/hosts on each physical node"); + } } +#endif From 0f99d7cbf9d78abc70d10be9767275610c7af801 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:34:41 -0600 Subject: [PATCH 466/660] libgnn: SAINT sampling, dense layer, var renames 1) Adds GraphSAINT random walk sampling; not exactly the same, but the idea is. 2) Adds a dense layer: GCN layer without any aggregation (just a weight multiply) 3) Renames some variables so that false is the default state. 
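
For item 1, the GraphSAINTSample added in GNNGraph.cpp below picks training nodes as walk roots (with replacement), walks a fixed number of steps, and marks every training node it visits as part of the subgraph. A rough standalone sketch of that idea over a toy adjacency list; the function name and the plain mt19937 generator are illustrative, while the real code walks the partitioned CSR with per-thread rand48 state:

#include <cstddef>
#include <random>
#include <vector>

// Marks a subgraph by short random walks rooted at training nodes.
std::vector<bool>
SampleByRandomWalk(const std::vector<std::vector<size_t>>& adj,
                   const std::vector<size_t>& training_nodes,
                   const std::vector<bool>& is_training, size_t num_roots,
                   size_t walk_depth, std::mt19937& rng) {
  std::vector<bool> in_subgraph(adj.size(), false);
  std::uniform_int_distribution<size_t> pick_root(0, training_nodes.size() - 1);
  for (size_t r = 0; r < num_roots; ++r) {
    // roots are training nodes chosen uniformly, with replacement
    size_t current = training_nodes[pick_root(rng)];
    in_subgraph[current] = true;
    for (size_t d = 0; d < walk_depth; ++d) {
      const std::vector<size_t>& nbrs = adj[current];
      if (nbrs.empty()) {
        break; // dead end: this walk stops early
      }
      std::uniform_int_distribution<size_t> pick_edge(0, nbrs.size() - 1);
      size_t candidate = nbrs[pick_edge(rng)];
      // only a training neighbour is added and walked from; otherwise the
      // step is skipped, so a walk may cover fewer than walk_depth nodes
      if (is_training[candidate]) {
        in_subgraph[candidate] = true;
        current = candidate;
      }
    }
  }
  return in_subgraph;
}
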
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.h | 6 +- libgnn/include/galois/layers/DenseLayer.h | 54 ++++++++ libgnn/include/galois/layers/GNNLayer.h | 21 +-- libgnn/src/GraphNeuralNetwork.cpp | 20 ++- libgnn/src/graphs/GNNGraph.cpp | 75 ++++++++++- libgnn/src/layers/DenseLayer.cpp | 127 ++++++++++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 39 +++--- 8 files changed, 306 insertions(+), 37 deletions(-) create mode 100644 libgnn/include/galois/layers/DenseLayer.h create mode 100644 libgnn/src/layers/DenseLayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index e0d90216e2..83fcc327cf 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,6 +6,7 @@ set(sources src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp + src/layers/DenseLayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp ) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7b55b84162..3978661a54 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -153,9 +153,13 @@ class GNNGraph { //! (the meaning of on and off depends on how it is used; for now, it is used //! to indicate subgraph presence); droprate controls chance of being dropped //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) - void UniformNodeSample(); + void UniformNodeSample() { UniformNodeSample(0.5); } void UniformNodeSample(float droprate); + //! Use the sampling method present in GraphSAINT + void GraphSAINTSample() { GraphSAINTSample(3000, 2); }; + void GraphSAINTSample(size_t num_roots, size_t walk_depth); + //! Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } //! Makes a node "not sampled"; used for debugging/testing diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h new file mode 100644 index 0000000000..d9918f8c2e --- /dev/null +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -0,0 +1,54 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +namespace galois { + +//! Just does a linear xform with no convolution over graph +class DenseLayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! weight matrix + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config); + + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : DenseLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + + // Parent functions + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + // Pointer with size versions + PointerWithSize p_in_temp_1_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. 
+ galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan/loc) replace with dense gpu object + GCNGPUAllocations gpu_object_; +#endif +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index c4cc29290f..e387441b8f 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -16,8 +16,10 @@ enum class GNNLayerType { //! Invalid placeholder kInvalid, //! GCN - kGraphConvolutional - // TODO SAGE and GAT + kGraphConvolutional, + //! Dense linear xform layer + kDense + // TODO GAT }; //! Supported output layer types in the GNN @@ -39,15 +41,14 @@ struct GNNLayerDimensions { struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; - //! True if dropout is to be done at beginning of forward phase - bool do_dropout{false}; + //! Turns off dropout of weights if enabled + bool disable_dropout{false}; //! Rate at which to drop things if dropout is on float dropout_rate{0.5}; - //! True if some activation function is to be called done at end of forward - //! phase - bool do_activation{false}; - //! True if normalization is to occur during multiplies - bool do_normalization{false}; + //! True to disable activation function for intermediate layers + bool disable_activation{false}; + //! True if normalization is disabled to occur during multiplies + bool disable_normalization{false}; //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation bool disable_aggregate_after_update{false}; @@ -79,7 +80,7 @@ class GNNLayer { //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } - void DisableActivation() { config_.do_activation = false; } + void DisableActivation() { config_.disable_activation = true; } //! Initializes all layer weights to 1. This is used as a debug function for //! testing. 
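
The DenseLayer declared above is the graph convolutional layer with the aggregation step removed, so its forward transform reduces to one row-major matrix multiply of the (input_rows x input_columns) features by the (input_columns x output_columns) weights; the implementation further below issues this through galois::CBlasSGEMM. A naive standalone sketch of that transform, with illustrative function and argument names:

#include <cstddef>
#include <vector>

// output = input (rows x in_cols) * weights (in_cols x out_cols), row-major.
void DenseForward(size_t rows, size_t in_cols, size_t out_cols,
                  const std::vector<float>& input,
                  const std::vector<float>& weights,
                  std::vector<float>& output) {
  output.assign(rows * out_cols, 0.0f);
  for (size_t r = 0; r < rows; ++r) {
    for (size_t k = 0; k < in_cols; ++k) {
      const float in_rk = input[r * in_cols + k];
      for (size_t c = 0; c < out_cols; ++c) {
        output[r * out_cols + c] += in_rk * weights[k * out_cols + c];
      }
    }
  }
}
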
diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 5eac909e18..8192b3f087 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,6 +1,7 @@ #include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" #include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/DenseLayer.h" #include "galois/layers/SoftmaxLayer.h" #include "galois/layers/SigmoidLayer.h" @@ -48,14 +49,22 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( layer_dims.input_columns, layer_dims.output_columns); } #endif - if (i == config_.num_intermediate_layers() - 1) { - // last layer before output layer should never have activation - gnn_layers_.back()->DisableActivation(); - } + break; + case GNNLayerType::kDense: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) dense layer gpu +#endif break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + + if (i == config_.num_intermediate_layers() - 1) { + // last layer before output layer should never have activation + gnn_layers_.back()->DisableActivation(); + } } // create the output layer @@ -117,7 +126,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { if (config_.do_sampling()) { // subgraph sample every epoch - graph_->UniformNodeSample(); + // graph_->UniformNodeSample(); + graph_->GraphSAINTSample(); graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c102fc8283..4f12dad28c 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -177,8 +177,6 @@ void galois::graphs::GNNGraph::AggregateSync( } #endif -void galois::graphs::GNNGraph::UniformNodeSample() { UniformNodeSample(0.8); } - void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { galois::do_all( galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { @@ -189,6 +187,74 @@ void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { // them } +// TODO(loc) does not work in a distributed setting: assumes the partitioned +// graph is the entire graph +void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, + size_t walk_depth) { + // reset sample + galois::do_all(galois::iterate(begin(), end()), + [&](size_t n) { partitioned_graph_->getData(n) = 0; }); + + galois::on_each([&](size_t thread_id, size_t num_threads) { + size_t my_start = 0; + size_t my_end = 0; + std::tie(my_start, my_end) = + galois::block_range(size_t{0}, num_roots, thread_id, num_threads); + size_t thread_roots = my_end - my_start; + size_t train_range = global_training_mask_range_.size; + // init RNG + drand48_data seed_struct; + srand48_r(sample_rng_.GetRandomNumber() * thread_id * num_threads, + &seed_struct); + + for (size_t root_num = 0; root_num < thread_roots; root_num++) { + // pick a random training node root at random (with replacement); + size_t root = 0; + while (true) { + long int rand_num; + lrand48_r(&seed_struct, &rand_num); + root = global_training_mask_range_.begin + (rand_num % train_range); + if (IsValidForPhase(root, GNNPhase::kTrain)) { + break; + } + } + // mark this root as sampled + SetSampledNode(root); + assert(IsInSampledGraph(root)); + + // sample more nodes based on depth of the walk + for (size_t 
current_depth = 0; current_depth < walk_depth; + current_depth++) { + // pick random edge, mark sampled, swap roots + EdgeIterator first_edge = EdgeBegin(root); + size_t num_edges = std::distance(first_edge, EdgeEnd(root)); + if (num_edges == 0) { + break; + } + + // must select training neighbor: if it doesn't, then ignore and + // continue + // To prevent infinite loop in case node has NO training neighbor, + // this implementation will not loop until one is found and will + // not find full depth if it doesn't find any training nodes randomly + long int rand_num; + lrand48_r(&seed_struct, &rand_num); + EdgeIterator selected_edge = first_edge + (rand_num % num_edges); + size_t candidate_dest = EdgeDestination(selected_edge); + + // TODO(loc) another possibility is to just pick it anyways regardless + // but don't mark it as sampled, though this would lead to disconnected + // graph + if (IsValidForPhase(candidate_dest, GNNPhase::kTrain)) { + SetSampledNode(candidate_dest); + assert(IsInSampledGraph(candidate_dest)); + root = candidate_dest; + } + } + } + }); +} + void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -432,9 +498,10 @@ void galois::graphs::GNNGraph::CalculateFullNormFactor() { galois::iterate(static_cast(0), partitioned_graph_->size()), [&](size_t local_id) { // translate lid into gid to get global degree - size_t global_id = partitioned_graph_->getGID(local_id); + size_t global_id = partitioned_graph_->getGID(local_id); + // +1 because simulated self edge size_t global_degree = whole_graph_.edge_end(global_id) - - whole_graph_.edge_begin(global_id); + whole_graph_.edge_begin(global_id) + 1; // only set if non-zero if (global_degree != 0) { norm_factors_[local_id] = diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp new file mode 100644 index 0000000000..b767805a6a --- /dev/null +++ b/libgnn/src/layers/DenseLayer.cpp @@ -0,0 +1,127 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/DenseLayer.h" + +galois::DenseLayer::DenseLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + layer_type_ = galois::GNNLayerType::kDense; + p_in_temp_1_ = PointerWithSize(in_temp_1_); + GALOIS_LOG_VERBOSE("Dense initialized"); +} + +const galois::PointerWithSize +galois::DenseLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + 
DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + } + + // FW + UpdateEmbeddings(input_data, p_forward_output_matrix_.data()); + + if (!config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; +} + +galois::PointerWithSize galois::DenseLayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!config_.disable_activation) { + ActivationDerivative(input_gradient); + } + + if (layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + + // W' = F^T (FW)' + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, prev_layer_input.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + // sync weight gradients; note aggregation sync occurs in the function call + // already + WeightGradientSyncSum(); + + if (!config_.disable_dropout && layer_number_ != 0) { + DoDropoutDerivative(); + } + + return p_backward_output_matrix_; +} + +void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 07f69cee6e..23c2affde7 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -54,7 +54,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); // first, dropout - if (config_.do_dropout && (layer_phase_ == GNNPhase::kTrain)) { + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = 
p_in_temp_1_.data(); } @@ -78,7 +78,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( // TODO synchronization of aggregation functions - if (config_.do_activation) { + if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); } @@ -95,7 +95,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation - if (config_.do_activation) { + if (!config_.disable_activation) { ActivationDerivative(input_gradient); } @@ -180,7 +180,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // WeightGradientSyncAverage(); WeightGradientSyncSum(); - if (config_.do_dropout && layer_number_ != 0) { + if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); } @@ -194,9 +194,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( pts) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AggregateAllGPU(graph_.GetGPUGraph(), graph_.size(), - column_length, node_embeddings, - aggregate_output, config_.do_normalization); + gpu_object_.AggregateAllGPU( + graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); graph_.AggregateSync(aggregate_output, column_length, layer_number_); } else { #endif @@ -217,7 +217,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first - // TODO(loc) can init to self as well to add to self for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } @@ -238,10 +237,16 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } GNNFloat source_norm = 0.0; - if (config_.do_normalization) { + if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); } + // init to self + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i]; + } + // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); @@ -263,7 +268,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; - if (config_.do_normalization) { + if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); // scale the value on the destination by the combined norm term assert(pts->getLocal()->size() == column_length); @@ -288,14 +293,14 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // GNNFloat* intermediate = pts->getLocal()->data(); // GNNFloat norm_scale = source_norm * source_norm; // for (size_t i = 0; i < column_length; i++) { - // intermediate[i] = - // norm_scale * node_embeddings[index_to_src_feature + i]; - //} - //// add self + // intermediate[i] = + // norm_scale * node_embeddings[index_to_src_feature + i]; + // } + // // add self // galois::VectorAdd(column_length, - // &aggregate_output[index_to_src_feature], - // intermediate, - // &aggregate_output[index_to_src_feature]); + // &aggregate_output[index_to_src_feature], + // intermediate, + // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync From 781b0cb439da41e5578bbad6d4315c0c2f7a7577 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:38:40 -0600 Subject: [PATCH 467/660] GALOIS_LOG_VASSERT Steals VASSERT from Katana 
so that assertions can be easier to understand. --- libsupport/include/galois/Logging.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/libsupport/include/galois/Logging.h b/libsupport/include/galois/Logging.h index 8621233fdf..674a9b65fa 100644 --- a/libsupport/include/galois/Logging.h +++ b/libsupport/include/galois/Logging.h @@ -111,4 +111,13 @@ void LogLine(LogLevel level, const char* file_name, int line_no, F fmt_string, } \ } while (0) +#define GALOIS_LOG_VASSERT(cond, fmt_string, ...) \ + do { \ + if (!(cond)) { \ + ::galois::LogLine(::galois::LogLevel::Error, __FILE__, __LINE__, \ + FMT_STRING(fmt_string), ##__VA_ARGS__); \ + ::std::abort(); \ + } \ + } while (0) + #endif From e0a6f8cc1842b1d2f91e8812f16f6bbe718f1f93 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 14:42:13 -0600 Subject: [PATCH 468/660] gcn-app allows specifying layers; var changes CLI option to specify layer types for gcn-dist (which should now be called gnn-dist probably). Changes variable names to reflect libgnn var name changes. --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 +- lonestar/libgnnbench/include/GNNBench/Input.h | 3 +- lonestar/libgnnbench/src/Input.cpp | 84 ++++++++++++------- 3 files changed, 58 insertions(+), 31 deletions(-) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index a7eb0a4bae..65fe1338cc 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -9,7 +9,7 @@ int main(int argc, char* argv[]) { galois::StatTimer init_timer("InitializationTime"); init_timer.start(); std::unique_ptr gnn = - InitializeGraphNeuralNetwork(galois::GNNLayerType::kGraphConvolutional); + InitializeGraphNeuralNetwork(); gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index dc62b19d50..bb417a90f2 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -19,5 +19,4 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); //! Using command line args above, create a GNN using some specified layer type //! as the intermediate layer.
-std::unique_ptr -InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type); +std::unique_ptr InitializeGraphNeuralNetwork(); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index dea458a6b3..3e602f8f74 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -30,16 +30,29 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list - layer_sizes("layerSizes", - cll::desc("Comma separated list of numbers specifying " - "intermediate layer sizes (does not include output)"), - cll::CommaSeparated); +llvm::cl::list layer_sizes( + "layerSizes", + cll::desc( + "Comma separated list of numbers specifying " + "intermediate layer sizes (does not include output); default sizes are " + "16 until last layer which is the size of the # of labels"), + cll::CommaSeparated); + +llvm::cl::list cl_layer_types( + "layerTypes", + cll::desc("Comma separated list of layer types specifying " + "intermediate layers (does not include output)"), + cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", + "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kDense, "dense", + "Dense Layer")), + cll::CommaSeparated); -llvm::cl::opt do_dropout( - "doDropout", - cll::desc("If true (on by default), does dropout of input during training"), - cll::init(true)); +llvm::cl::opt + disable_dropout("disableDropout", + cll::desc("If true (off by default), disables dropout of " + "layer weights during training"), + cll::init(false)); llvm::cl::opt dropout_rate( "dropoutRate", @@ -47,17 +60,17 @@ llvm::cl::opt dropout_rate( "0.1, then 10 percent chance of dropping) (default 0.5)"), cll::init(0.5)); -llvm::cl::opt - do_activation("doActivation", - cll::desc("If true (off by default), does activation at the " - "end of an intermediate layer"), - cll::init(false)); +llvm::cl::opt disable_activation( + "disableActivation", + cll::desc("If true (off by default), disable activation at the " + "end of an intermediate layers"), + cll::init(false)); -llvm::cl::opt - do_normalization("doNormalization", - cll::desc("If true (on by default), normalizes vertex " - "features based on their degree"), - cll::init(true)); +llvm::cl::opt disable_normalization( + "disableNormalization", + cll::desc("If true (off by default), disable normalizing vertex " + "features based on their degree"), + cll::init(false)); llvm::cl::opt output_layer_type( "outputLayer", cll::desc("Type of output layer"), @@ -103,6 +116,25 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { } } +//! Initializes the vector of layer sizes from command line args + graph +std::vector CreateLayerTypesVector() { + std::vector layer_types; + if (!cl_layer_types.size()) { + // default is all GCN layers + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); + } + } else { + GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, + "Number layer types should be {} not {}", num_layers, + cl_layer_types.size()); + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(cl_layer_types[i]); + } + } + return layer_types; +} + //! Initializes the vector of layer sizes from command line args + graph std::vector CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { @@ -139,10 +171,10 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { //! 
Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; - layer_config.do_dropout = do_dropout; + layer_config.disable_dropout = disable_dropout; layer_config.dropout_rate = dropout_rate; - layer_config.do_activation = do_activation; - layer_config.do_normalization = do_normalization; + layer_config.disable_activation = disable_activation; + layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; layer_config.inductive_training_ = do_inductive_training; return layer_config; } @@ -184,17 +216,13 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { return std::make_unique(opt_sizes, num_layers); } -std::unique_ptr -InitializeGraphNeuralNetwork(galois::GNNLayerType layer_type) { +std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( input_directory, input_name, partition_scheme, !multiclass_labels); // create layer types vector - std::vector layer_types; - for (size_t i = 0; i < num_layers; i++) { - layer_types.push_back(layer_type); - } + std::vector layer_types = CreateLayerTypesVector(); // sizes std::vector layer_sizes_vector = CreateLayerSizesVector(gnn_graph.get()); From 3112b9e4ec40f228c50759ce0bb5f700a6de8702 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Feb 2021 15:47:32 -0600 Subject: [PATCH 469/660] Fix libgnn tests; disable self aggregation option Tests broke after changing variable names and adding self aggregation in a previous commit. This commit adds a config option for self aggregation and fixes tests. --- libgnn/include/galois/layers/GNNLayer.h | 10 ++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 8 +++++--- libgnn/test/convlayer-test.cpp | 13 ++++++++----- libgnn/test/epoch-test.cpp | 6 +++--- libgnn/test/gnnfb-test.cpp | 1 + libgnn/test/multilabel-epoch-test.cpp | 6 +++--- libgnn/test/sample-test.cpp | 1 + lonestar/libgnnbench/src/Input.cpp | 6 ++++++ 8 files changed, 37 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e387441b8f..68d2107456 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -52,11 +52,21 @@ struct GNNLayerConfig { //! If this is false, aggregate may occur after multiply if # of input columns //! is higher than output columns to do less work in aggregation bool disable_aggregate_after_update{false}; + //! If on, do not aggregate the self vector during aggregation + bool disable_self_aggregate{false}; //! Graph sampling flag in use or not bool do_sampling{false}; //! Inductive layer means for aggregation all non-training nodes are ignored bool inductive_training_{false}; // TODO activation type; for now default is softmax + + //!
Sets settings such that testing is easy + void DebugConfig() { + disable_activation = true; + disable_normalization = true; + disable_dropout = true; + disable_self_aggregate = true; + } }; // Tried to avoid inheritance, but keeping track of heterogeneous layers diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 23c2affde7..1b2778fe6f 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -242,9 +242,11 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } // init to self - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i]; + if (!config_.disable_self_aggregate) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i]; + } } // loop through all destinations to grab the feature to aggregate diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 136953378d..bcada6c4ed 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -53,6 +53,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // create the layer, no norm factor std::unique_ptr layer_0 = @@ -69,9 +70,11 @@ int main() { // since norm factors aren't invovled it is possible to do full assertions // 7 x 2 GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); - GALOIS_LOG_ASSERT(layer_0_forward_output[0] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); - GALOIS_LOG_ASSERT(layer_0_forward_output[2] == 6); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 6, "{} should be 6", + layer_0_forward_output[2]); GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 6); GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 12); GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 12); @@ -207,9 +210,9 @@ int main() { ////////////////////////////////////////////////////////////////////////////// galois::GNNLayerConfig config; - config.do_dropout = true; - config.do_activation = true; - config.do_normalization = true; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; config.disable_aggregate_after_update = false; // finally, just make sure dropout and activation run without crashes diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index d8a27cc13b..2dbaea3372 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -22,9 +22,9 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.disable_dropout = false; + layer_config.disable_activation = false; + layer_config.disable_normalization = false; // XXX Activation kills accuracy compared to old code, esp. 
for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 224204bceb..091c6f01c8 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -25,6 +25,7 @@ int main() { std::vector layer_output_sizes = {4, 7, 7}; galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // note GNNLayerConfig is passed in; use a config that does not do anything // extra like dropout or activation and the like so that input is easier to // verify diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp index 3fb96f8c81..7626abda1d 100644 --- a/libgnn/test/multilabel-epoch-test.cpp +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -22,9 +22,9 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.disable_dropout = false; + layer_config.disable_activation = false; + layer_config.disable_normalization = false; // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSigmoid, diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index ead938e5aa..063ff80ca5 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -27,6 +27,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); // choose a few sample nodes test_graph.SetSampledNode(0); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 3e602f8f74..47ca1bfe0c 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -92,6 +92,11 @@ llvm::cl::opt disable_agg_after_update( "after update optimization"), cll::init(false)); +llvm::cl::opt disable_self_aggregate( + "disableSelfAggregation", + cll::desc("If true (off by default), disables aggregate of self feature"), + cll::init(false)); + llvm::cl::opt do_graph_sampling("doGraphSampling", cll::desc("If true (off by default), sample nodes for " @@ -176,6 +181,7 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.disable_activation = disable_activation; layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; + layer_config.disable_self_aggregate = disable_self_aggregate; layer_config.inductive_training_ = do_inductive_training; return layer_config; } From 6607147502f5c05e24d374695d2481cbf15089b5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 17 Feb 2021 17:48:24 -0600 Subject: [PATCH 470/660] Partial SAGE layer + self loop norm fix in GCN Adds incomplete SAGE layer implementation: the mean aggregation (done via the norms being 1 / degree) is done in the SAGE layer. What needs to be done next is the concat part of it. (next commit will take care of this) Also fixes a minor thing in GCN self-loop aggregation where norm^2 needs to be applied to the self feature. 
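For reference, a minimal standalone sketch of the two normalization schemes described above, assuming a toy CSR graph (the Aggregate helper, the row_start/dests arrays, and the flat feature layout are made up for this note and are not the libgnn code). GCN scales a neighbor feature by 1 / sqrt(deg(src) * deg(dst)) and, when self aggregation is on, the self feature by the squared source norm 1 / deg(src); the SAGE-style mean aggregation instead scales the plain neighbor sum by 1 / deg(src).

#include <cassert>
#include <cmath>
#include <cstddef>
#include <vector>

// Toy CSR graph: row_start has num_nodes + 1 entries, dests holds edge
// destinations, feats is row-major (num_nodes x dim). Assumes a symmetric
// graph so every edge destination has nonzero degree.
std::vector<float> Aggregate(const std::vector<std::size_t>& row_start,
                             const std::vector<std::size_t>& dests,
                             const std::vector<float>& feats, std::size_t dim,
                             bool gcn_norm) {
  assert(!row_start.empty());
  std::size_t num_nodes = row_start.size() - 1;
  assert(feats.size() == num_nodes * dim);
  std::vector<float> out(num_nodes * dim, 0.0f);
  for (std::size_t src = 0; src < num_nodes; ++src) {
    float d_src = static_cast<float>(row_start[src + 1] - row_start[src]);
    if (d_src == 0.0f) {
      continue; // isolated node: nothing to aggregate
    }
    if (gcn_norm) {
      // self feature scaled by source_norm^2 = 1 / deg(src) (the fix above)
      for (std::size_t k = 0; k < dim; ++k) {
        out[src * dim + k] += feats[src * dim + k] / d_src;
      }
    }
    for (std::size_t e = row_start[src]; e < row_start[src + 1]; ++e) {
      std::size_t dst = dests[e];
      float d_dst = static_cast<float>(row_start[dst + 1] - row_start[dst]);
      // GCN: symmetric sqrt norm; SAGE: mean over neighbors, i.e. 1 / deg(src)
      float scale = gcn_norm ? 1.0f / std::sqrt(d_src * d_dst) : 1.0f / d_src;
      for (std::size_t k = 0; k < dim; ++k) {
        out[src * dim + k] += scale * feats[dst * dim + k];
      }
    }
  }
  return out;
}

The concat step mentioned above then combines this aggregate with the untouched self features through a second weight matrix (conceptually output = self * W1 + aggregate * W2) rather than folding the self feature into the neighbor sum.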
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/graphs/GNNGraph.h | 6 + libgnn/include/galois/layers/GNNLayer.h | 2 + libgnn/include/galois/layers/SAGELayer.h | 99 +++++ libgnn/src/GraphNeuralNetwork.cpp | 8 + libgnn/src/graphs/GNNGraph.cpp | 3 + libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 +- libgnn/src/layers/SAGELayer.cpp | 352 ++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 2 + 9 files changed, 475 insertions(+), 1 deletion(-) create mode 100644 libgnn/include/galois/layers/SAGELayer.h create mode 100644 libgnn/src/layers/SAGELayer.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 83fcc327cf..82454b1301 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -9,6 +9,7 @@ set(sources src/layers/DenseLayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp + src/layers/SAGELayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3978661a54..4dafda2afb 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -95,6 +95,9 @@ class GNNGraph { return partitioned_graph_->getEdgeDst(ei); }; GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + //! Degree norm (1 / degree) of current functional graph (e.g., sampled, + //! inductive graph, etc); calculated whenever norm factor is calculated + GNNFloat DegreeNorm(GraphNode n) const { return degree_norm_[n]; } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, @@ -287,6 +290,9 @@ class GNNGraph { //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; + //! Normalization constant based on degrees (unlike nomral norm factors + //! it's only division without a square root) + std::vector degree_norm_; //! RNG for subgraph sampling galois::PerThreadRNG sample_rng_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 68d2107456..3be7908ad7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -17,6 +17,8 @@ enum class GNNLayerType { kInvalid, //! GCN kGraphConvolutional, + //! Sage layer: same as GCN except with mean aggregation and concat + kSAGE, //! Dense linear xform layer kDense // TODO GAT diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h new file mode 100644 index 0000000000..a3fd5ecac6 --- /dev/null +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -0,0 +1,99 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +#ifdef GALOIS_ENABLE_GPU +// TODO(loc/hochan) +#endif + +namespace galois { + +struct SAGELayerConfig { + // TODO(loc) relevant options here + bool todo; +}; + +//! Same as GCN layer except for the following: +//! - Mean aggregation; no symmetric norm with sqrts used (this +//! ends up performing better for some graphs) +//! - Concatination of the self: rather than aggregating self +//! feature it is concatinated (i.e. dimensions are doubled) +class SAGELayer : public GNNLayer { +public: + //! Initializes the variables of the base class and also allocates additional + //! memory for temporary matrices. Also initializes sync substrate for the + //! 
weight matrix + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, + const SAGELayerConfig& sage_config); + + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : SAGELayer(layer_num, graph, dimensions, config, SAGELayerConfig()) {} + + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), + SAGELayerConfig()) {} + + // Parent functions + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final; + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) final; + +private: + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. + galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + + //! CPU aggregation + void AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); + + //! Performs aggregation for all nodes of the graph given the length of the + //! vector to aggregate, the features themselves, an output array, and per + //! thread storage for the intermediate scaling via norm factor + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts); + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); + + //! Do embedding update via mxm with this layer's weights (forward) + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! 
Calculate graident via mxm with last layer's gradients (backward) + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) + GCNGPUAllocations gpu_object_; +#endif +}; + +} // namespace galois diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 8192b3f087..40e3c8a7e1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -4,6 +4,7 @@ #include "galois/layers/DenseLayer.h" #include "galois/layers/SoftmaxLayer.h" #include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SAGELayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -48,6 +49,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, galois::runtime::getSystemNetworkInterface().Num, layer_dims.input_columns, layer_dims.output_columns); } +#endif + break; + case GNNLayerType::kSAGE: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) sage layer gpu #endif break; case GNNLayerType::kDense: diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4f12dad28c..af3ef00baf 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -487,6 +487,7 @@ void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); norm_factors_.resize(partitioned_graph_->size(), 0.0); + degree_norm_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); } @@ -506,6 +507,7 @@ void galois::graphs::GNNGraph::CalculateFullNormFactor() { if (global_degree != 0) { norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(global_degree)); + degree_norm_[local_id] = 1.0 / static_cast(global_degree); } }, galois::loopname("CalculateFullNormFactor")); @@ -569,6 +571,7 @@ void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, // only set if non-zero if (degree != 0) { norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); + degree_norm_[local_id] = 1.0 / static_cast(degree); } }, galois::loopname("CalculateSpecialNormFactor")); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 1b2778fe6f..81bebfd8e2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -245,7 +245,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (!config_.disable_self_aggregate) { for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i]; + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp new file mode 100644 index 0000000000..a7e6d4c5f6 --- /dev/null +++ b/libgnn/src/layers/SAGELayer.cpp @@ -0,0 +1,352 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/SAGELayer.h" + +galois::SAGELayer::SAGELayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config, + const SAGELayerConfig&) + : GNNLayer(layer_num, graph, dimensions, config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + size_t num_input_elements = + 
layer_dimensions_.input_rows * layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + // TODO temp2 does not need to be initialized in all circumstances + in_temp_2_.resize(num_input_elements, 0); + + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + out_temp_.resize(num_output_elements, 0); + layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + // TODO + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.Allocate(num_input_elements, num_output_elements); + // init pointers with size + p_in_temp_1_ = + PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + p_in_temp_2_ = + PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + p_out_temp_ = + PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + // TODO concat + } +#endif + + GALOIS_LOG_VERBOSE("SAGE layer initialized"); +} + +const galois::PointerWithSize galois::SAGELayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == + (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); + assert(p_in_temp_1_.size() == input_embeddings.size()); + assert(p_in_temp_2_.size() == input_embeddings.size()); + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + } + + // flip aggregate/update if dimensions favor it (do less work) + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(layer_dimensions_.input_columns, input_data, + p_in_temp_2_.data(), &input_column_intermediates_); + UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + } else { + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) + AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), + p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + // TODO synchronization of aggregation functions + + if (!config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + Activation(); + } + + assert(p_forward_output_matrix_.size() == + (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; +} + +galois::PointerWithSize galois::SAGELayer::BackwardPhase( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + assert(layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!config_.disable_activation) { + ActivationDerivative(input_gradient); + } + + // AFW = O + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + 
assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + assert(p_in_temp_1_.size() == + layer_dimensions_.input_columns * layer_dimensions_.input_rows); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + // TODO if training A, then A' compute here if layer # is 0 + // dot product of edges that exist in A + } + // weight gradient calculation + // TODO(loc) put this in a function to put the ifdef in there +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, p_in_temp_2_.data(), + input_gradient->data(), p_layer_weight_gradients_.data()); + } else { +#endif + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + p_in_temp_2_.data(), input_gradient->data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + p_out_temp_.data(), &output_column_intermediates_, true); + if (layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); + } + // TODO put this in a function + // W' = F^T (FW)' +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, prev_layer_input.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); + } else { +#endif + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + prev_layer_input.data(), p_out_temp_.data(), + p_layer_weight_gradients_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // sync weight gradients; note aggregation sync occurs in the function call + // already + // TODO figure out how to do this with GPUs + // WeightGradientSyncAverage(); + WeightGradientSyncSum(); + + if (!config_.disable_dropout && layer_number_ != 0) { + DoDropoutDerivative(); + } + + return p_backward_output_matrix_; +} + +void galois::SAGELayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); +} + +void galois::SAGELayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts, + bool is_backward) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AggregateAllGPU( + 
graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); + graph_.AggregateSync(aggregate_output, column_length, layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::SAGELayer::AggregateAllCPU( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward) { + size_t num_nodes = graph_.size(); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // check if node is part of sampled graph; ignore after 0'ing if not + // sampled + if (!graph_.IsInSampledGraph(src)) + return; + } + } + + GNNFloat source_norm = 0.0; + if (!config_.disable_normalization) { + source_norm = graph_.DegreeNorm(src); + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + size_t dst = graph_.EdgeDestination(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } + } + + size_t index_to_dst_feature = dst * column_length; + + if (!config_.disable_normalization) { + GNNFloat norm_scale; + if (!is_backward) { + norm_scale = source_norm; + } else { + norm_scale = graph_.DegreeNorm(dst); + } + + // scale the value on the destination by the combined norm term + assert(pts->getLocal()->size() == column_length); + GNNFloat* intermediate = pts->getLocal()->data(); + for (size_t i = 0; i < column_length; i++) { + intermediate[i] = + norm_scale * node_embeddings[index_to_dst_feature + i]; + } + // add intermediate instead of original feature + galois::VectorAdd( + column_length, &aggregate_output[index_to_src_feature], + intermediate, &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length); +} + +void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, 
node_embeddings, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + +void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 47ca1bfe0c..c48f0b41b4 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -44,6 +44,8 @@ llvm::cl::list cl_layer_types( "intermediate layers (does not include output)"), cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kSAGE, "sage", + "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense Layer")), cll::CommaSeparated); From a0a14278f6b4e21868e49d5f978ab1a77da8f0ec Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 18 Feb 2021 15:35:49 -0600 Subject: [PATCH 471/660] SAGE layer: forward phase Implement the forward concat phase for the SAGE layer; maintains another weight matrix solely for multiplying with the self features. Conceptually W = W1 | W2: since this is the case, you can multiply the self matrix separately from the aggregated one and just sum it into the aggregated linear xform. This code has been unit tested; the test will be pushed in later once full layer is done. Next up is backward phase. --- libgnn/include/galois/layers/SAGELayer.h | 66 +++++++++++++++-------- libgnn/src/layers/SAGELayer.cpp | 69 ++++++++++++++++++++---- 2 files changed, 102 insertions(+), 33 deletions(-) diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index a3fd5ecac6..9dcd53d9c6 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -8,10 +8,12 @@ namespace galois { struct SAGELayerConfig { - // TODO(loc) relevant options here - bool todo; + bool disable_concat{false}; }; +// TODO(loc) move common functionality with GCN layer to common parent class +// (e.g. inits): cleans up Dense code a bit as well + //! Same as GCN layer except for the following: //! - Mean aggregation; no symmetric norm with sqrts used (this //! 
ends up performing better for some graphs) @@ -35,6 +37,12 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} + void InitSelfWeightsTo1() { + if (layer_weights_2_.size()) { + layer_weights_2_.assign(layer_weights_2_.size(), 1); + } + } + // Parent functions const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; @@ -44,27 +52,6 @@ class SAGELayer : public GNNLayer { PointerWithSize* input_gradient) final; private: - // 2 temporaries the size of the forward input; used for dropout and - // aggregation (if either are required) - std::vector in_temp_1_; - std::vector in_temp_2_; - // Temporary matrix the size of the output of the forward pass; used if - // an intermediate op occurs before writing to the final output matrix - std::vector out_temp_; - - // Pointer with size versions - PointerWithSize p_in_temp_1_; - PointerWithSize p_in_temp_2_; - PointerWithSize p_out_temp_; - - // Each thread has a vector of size # input columns or # output columns for - // storing intermediate results during aggregation. - // The one used depeneds on if aggregation occurs before or after the mxm. - galois::substrate::PerThreadStorage> - input_column_intermediates_; - galois::substrate::PerThreadStorage> - output_column_intermediates_; - //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, @@ -87,9 +74,42 @@ class SAGELayer : public GNNLayer { //! Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + //! Same as above but uses the second set of weights (self feature weights) + void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, + GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + //! SAGE config params + SAGELayerConfig sage_config_; + + // second set of weights for the concat that may occur + std::vector layer_weights_2_; + std::vector layer_weight_gradients_2_; + PointerWithSize p_layer_weights_2_; + PointerWithSize p_layer_weight_gradients_2_; + + // 2 temporaries the size of the forward input; used for dropout and + // aggregation (if either are required) + std::vector in_temp_1_; + std::vector in_temp_2_; + // Temporary matrix the size of the output of the forward pass; used if + // an intermediate op occurs before writing to the final output matrix + std::vector out_temp_; + + // Pointer with size versions + PointerWithSize p_in_temp_1_; + PointerWithSize p_in_temp_2_; + PointerWithSize p_out_temp_; + + // Each thread has a vector of size # input columns or # output columns for + // storing intermediate results during aggregation. + // The one used depeneds on if aggregation occurs before or after the mxm. 
+ galois::substrate::PerThreadStorage> + input_column_intermediates_; + galois::substrate::PerThreadStorage> + output_column_intermediates_; + #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) GCNGPUAllocations gpu_object_; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index a7e6d4c5f6..461f23bd99 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -6,10 +6,24 @@ galois::SAGELayer::SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, - const SAGELayerConfig&) - : GNNLayer(layer_num, graph, dimensions, config), + const SAGELayerConfig& sage_config) + : GNNLayer(layer_num, graph, dimensions, config), sage_config_(sage_config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + if (!sage_config_.disable_concat) { + // there are now 2 weight matrices used: one for self, one for aggregation + // abstractly it's one matrix: W = W1 | W2 + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + layer_weights_2_.resize(num_weight_elements); + layer_weight_gradients_2_.resize(num_weight_elements, 0); + GlorotBengioInit(&layer_weights_2_); + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); + } + size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); @@ -20,9 +34,9 @@ galois::SAGELayer::SAGELayer(size_t layer_num, layer_dimensions_.input_rows * layer_dimensions_.output_columns; GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); out_temp_.resize(num_output_elements, 0); - layer_type_ = galois::GNNLayerType::kGraphConvolutional; + layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU - // TODO + // TODO(loc/hochan) GPU SAGE if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size @@ -38,13 +52,26 @@ galois::SAGELayer::SAGELayer(size_t layer_num, p_in_temp_2_ = PointerWithSize(in_temp_2_); p_out_temp_ = PointerWithSize(out_temp_); #ifdef GALOIS_ENABLE_GPU - // TODO concat + // TODO concat parameters } #endif GALOIS_LOG_VERBOSE("SAGE layer initialized"); } +void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, + galois::PointerWithSize* out) { + assert(in.size() == out->size()); + assert((in.size() % num_nodes) == 0); + size_t column_size = in.size() / num_nodes; + // split matrix to threads + galois::do_all(galois::iterate(size_t{0}, num_nodes), [&](size_t node) { + size_t my_offset = node * column_size; + galois::VectorAdd(column_size, &(in[my_offset]), + &((out->data())[my_offset]), &(out->data()[my_offset])); + }); +} + const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { GALOIS_LOG_VERBOSE("Calling forward phase"); @@ -62,6 +89,9 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( input_data = p_in_temp_1_.data(); } + // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part + // which is done regardless + // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { @@ -79,7 +109,14 @@ const 
galois::PointerWithSize galois::SAGELayer::ForwardPhase( &output_column_intermediates_); } - // TODO synchronization of aggregation functions + if (!sage_config_.disable_concat) { + // FW1 is unaffected by the agg/update flip, so can to it + // separately + SelfFeatureUpdateEmbeddings(input_data, p_out_temp_.data()); + // add result to the output matrix: FW1 + AFW2 + MatrixAdd(layer_dimensions_.input_rows, p_out_temp_, + &p_forward_output_matrix_); + } if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); @@ -176,10 +213,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( #endif } - // sync weight gradients; note aggregation sync occurs in the function call - // already - // TODO figure out how to do this with GPUs - // WeightGradientSyncAverage(); WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { @@ -311,6 +344,7 @@ void galois::SAGELayer::AggregateAllCPU( void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { #ifdef GALOIS_ENABLE_GPU + // TODO self change if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -328,6 +362,21 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, #endif } +void galois::SAGELayer::SelfFeatureUpdateEmbeddings( + const GNNFloat* node_embeddings, GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + // TODO self change +#endif + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output); +#ifdef GALOIS_ENABLE_GPU +} +#endif +} + void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { assert(p_layer_weights_.size() == From 0b43a3a7165184c8265a5e4f97d31882ad85c8a6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 19 Feb 2021 12:41:57 -0600 Subject: [PATCH 472/660] train/val/test splits for non-complete masks Code before did not support masks that were not complete (i.e., assumes that range given in file is complete meaning if in range it is part of that set). Code now checks to make sure this is the case before doing that; if not, it uses the mask instead (slower, but correct). --- libgnn/include/galois/graphs/GNNGraph.h | 21 ++++++++++++- libgnn/src/graphs/GNNGraph.cpp | 41 ++++++++++++++++++++++--- 2 files changed, 57 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 4dafda2afb..b69eb43ea2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -136,7 +136,13 @@ class GNNGraph { //! lid in question is valid for the current phase (i.e., it is part of //! a training, validation, or test phase mask) bool IsValidForPhase(const unsigned lid, - const galois::GNNPhase current_phase) const; + const galois::GNNPhase current_phase) const { + if (!incomplete_masks_) { + return IsValidForPhaseCompleteRange(lid, current_phase); + } else { + return IsValidForPhaseMasked(lid, current_phase); + } + } ////////////////////////////////////////////////////////////////////////////// @@ -228,6 +234,15 @@ class GNNGraph { //! degree access void InitNormFactor(); + //! Used if ranges for a mask are complete (if in range, it's part of mask). 
+ bool IsValidForPhaseCompleteRange(const unsigned lid, + const galois::GNNPhase current_phase) const; + + //! Used if ranges for a mask are incomplete, meaning I actually have to + //! check the mask. + bool IsValidForPhaseMasked(const unsigned lid, + const galois::GNNPhase current_phase) const; + ////////////////////////////////////////////////////////////////////////////// // Accuracy ////////////////////////////////////////////////////////////////////////////// @@ -288,6 +303,10 @@ class GNNGraph { //! in this class GNNRange global_testing_mask_range_; + //! If true, then node splits of train/val/test aren't complete (i.e. + //! falling in range != part of that set) + bool incomplete_masks_{false}; + //! Normalization constant based on structure of the graph (degrees) std::vector norm_factors_; //! Normalization constant based on degrees (unlike nomral norm factors diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index af3ef00baf..919d7340e4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -100,8 +100,9 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, #endif } -bool galois::graphs::GNNGraph::IsValidForPhase( +bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( const unsigned lid, const galois::GNNPhase current_phase) const { + // only use ranges if they're complete // convert to gid first size_t gid = partitioned_graph_->getGID(lid); @@ -123,9 +124,9 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } // if within range, it is valid - // TODO there is an assumption here that ranges are contiguous; may not - // necessarily be the case in all inputs in which case using the mask is safer - // (but less cache efficient) + // there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is + // required (but less cache efficient) if (range_to_use->begin <= gid && gid < range_to_use->end) { return true; } else { @@ -133,6 +134,28 @@ bool galois::graphs::GNNGraph::IsValidForPhase( } } +bool galois::graphs::GNNGraph::IsValidForPhaseMasked( + const unsigned lid, const galois::GNNPhase current_phase) const { + // select mask to use based on phase + const std::vector* mask_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + mask_to_use = &local_training_mask_; + break; + case GNNPhase::kValidate: + mask_to_use = &local_validation_mask_; + break; + case GNNPhase::kTest: + mask_to_use = &local_testing_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + mask_to_use = nullptr; + } + + return (*mask_to_use)[lid]; +} + void galois::graphs::GNNGraph::AggregateSync( GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { // set globals for the sync substrate @@ -425,6 +448,16 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); + if (local_sample_count != mask_range->size) { + // overlapping masks: need to actually check the masks rather than use + // ranges + if (!incomplete_masks_) { + galois::gInfo( + "Masks are not contained in range: must actually check mask"); + } + incomplete_masks_ = true; + } + return local_sample_count; } From c80ede803cac9384d642e3db9b19ecda072940ac Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 19 Feb 2021 13:16:40 -0600 Subject: [PATCH 473/660] GNNLayer: optimize function made virtual Made optimize layer function able to be overridden by child classes because they may be training more than just the single weight matrix (e.g., sage 
trains 2 matrices). --- libgnn/include/galois/layers/GNNLayer.h | 6 +++++- libgnn/src/layers/GNNLayer.cpp | 6 ------ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 3be7908ad7..7b8737e204 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -152,7 +152,11 @@ class GNNLayer { //! Given an optimizer, update the weights in this layer based on gradients //! stored in the layer - void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + virtual void OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, + trainable_layer_number); + } //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 9da77a004f..6deab8e682 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -202,12 +202,6 @@ void galois::GNNLayer::ActivationDerivative( galois::loopname("ReLU-Derivative")); } -void galois::GNNLayer::OptimizeLayer(BaseOptimizer* optimizer, - size_t trainable_layer_number) { - optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, - trainable_layer_number); -} - void galois::GNNLayer::WeightGradientSyncSum() { // XXX bitset gradient_sync_substrate_->sync( From bd25d8d8d2c0e9bfd00737640f35bfff2aaf8465 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 22 Feb 2021 17:15:57 -0600 Subject: [PATCH 474/660] L2 normalization layer + test Adds an implementation of an l2 normalization layer (taken from GraphSAINT, but I verified it manually via pen/paper derivation). A test is added to ensure the math being done per row is correct as well. --- libgnn/CMakeLists.txt | 5 +- libgnn/include/galois/layers/GNNLayer.h | 5 +- libgnn/include/galois/layers/L2NormLayer.h | 50 +++++++++ libgnn/src/layers/L2NormLayer.cpp | 121 +++++++++++++++++++++ libgnn/test/CMakeLists.txt | 6 +- libgnn/test/l2norm-layer-test.cpp | 85 +++++++++++++++ 6 files changed, 268 insertions(+), 4 deletions(-) create mode 100644 libgnn/include/galois/layers/L2NormLayer.h create mode 100644 libgnn/src/layers/L2NormLayer.cpp create mode 100644 libgnn/test/l2norm-layer-test.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 82454b1301..b59cccef93 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -3,13 +3,14 @@ set(sources src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp src/graphs/GNNGraph.cpp + src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp - src/layers/DenseLayer.cpp + src/layers/L2NormLayer.cpp + src/layers/SAGELayer.cpp src/layers/SigmoidLayer.cpp src/layers/SoftmaxLayer.cpp - src/layers/SAGELayer.cpp ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 7b8737e204..0039683ad4 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -20,7 +20,9 @@ enum class GNNLayerType { //! Sage layer: same as GCN except with mean aggregation and concat kSAGE, //! Dense linear xform layer - kDense + kDense, + //! 
L2 normalization layer + kL2Norm // TODO GAT }; @@ -129,6 +131,7 @@ class GNNLayer { galois::GNNOutputLayerType output_layer_type() const { return output_layer_type_; } + size_t layer_number() const { return layer_number_; } //! Conducts the forward phase given the input to this layer which //! ultimately leads to an output (classfication of node labels) at the end diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h new file mode 100644 index 0000000000..176c88700e --- /dev/null +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -0,0 +1,50 @@ +#pragma once +#include "galois/layers/GNNLayer.h" + +#ifdef GALOIS_ENABLE_GPU +// TODO(loc/hochan) +#endif + +namespace galois { + +//! Applies L2 norm to rows of the input +class L2NormLayer : public GNNLayer { +public: + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions) + : L2NormLayer(layer_num, graph, dimensions, + GNNLayerConfig{.allocate_weights = false}) {} + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, dimensions, config) { + layer_type_ = galois::GNNLayerType::kL2Norm; + // input/output columns must be equivalent in a softmax + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + GALOIS_LOG_VERBOSE("L2 norm initialized"); + } + + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings); + + PointerWithSize + BackwardPhase(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); + +private: + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings); + + PointerWithSize + BackwardPhaseCPU(const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient); + + //! No op + void OptimizeLayer(BaseOptimizer*, size_t) { return; }; + +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) +#endif +}; + +} // namespace galois diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp new file mode 100644 index 0000000000..a29fccab1d --- /dev/null +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -0,0 +1,121 @@ +#include "galois/layers/L2NormLayer.h" +const galois::PointerWithSize +galois::L2NormLayer::ForwardPhase( + const galois::PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return ForwardPhaseCPU(input_embeddings); +} + +const galois::PointerWithSize +galois::L2NormLayer::ForwardPhaseCPU( + const galois::PointerWithSize input_embeddings) { + forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + // for each row, get square root of squared sums then normalize + const size_t feature_length = layer_dimensions_.input_columns; + // TODO(loc) make sure this works in distributed setting as well + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned row) { + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) + return; + } + + if (graph_.IsValidForPhase(row, layer_phase_)) { + size_t row_offset = row * feature_length; + float running_square_sum = 0.0; + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(input_embeddings[row_index], 2); + } + + // make sure running sum isn't too small + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + + // sqrt of sums, then divide row by it + float sqrt_squares = std::pow(running_square_sum, 0.5); + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + forward_output_matrix_[row_index] = + input_embeddings[row_index] / sqrt_squares; + } + } + }, + galois::loopname("L2ForwardNormalization")); + + return forward_output_matrix_; +} + +galois::PointerWithSize galois::L2NormLayer::BackwardPhase( + const PointerWithSize prev_layer_input, + PointerWithSize* input_gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return BackwardPhaseCPU(prev_layer_input, input_gradient); +} + +galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( + galois::PointerWithSize prev_layer_input, + galois::PointerWithSize* input_gradient) { + backward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + const size_t feature_length = layer_dimensions_.input_columns; + + // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of + // the row (since l2 norm affects entire row) + // The math itself can be derived using quotient/chain rule on each element + // of the normalized row + galois::do_all( + galois::iterate(graph_.begin_owned(), graph_.end_owned()), + [&](const unsigned row) { + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) + return; + } + + if (graph_.IsValidForPhase(row, layer_phase_)) { + size_t row_offset = row * feature_length; + // note: if you work this out on paper it turns out that terms that + // seem extra in the way this is calculated below simply get canceled + // out, so this ends up working out This implementation is taken from + // the IPDPS GraphSAINT implementation: I (loc) have confirmed the + // math checks out + float running_square_sum = 0.0; + float mult_with_input = 0.0; + + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(prev_layer_input[row_index], 2); + // gradient multiplied with corresponding input; subtraction because + // derivative math ends up working out that way + mult_with_input -= + prev_layer_input[row_index] * (*input_gradient)[row_index]; + } + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + assert(running_square_sum != 0.0); + + // denominator for all gradients is just the square sum to the -3/2'd + // power since this is -, all we have to do is multiply it later + // rather than divide + float denominator = std::pow(running_square_sum, -1.5); + assert(denominator != 0.0); + + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + backward_output_matrix_[row_index] = + denominator * + (prev_layer_input[row_index] * mult_with_input + + (*input_gradient)[row_index] * running_square_sum); + } + } + }, + galois::loopname("L2Backward")); + + return PointerWithSize(backward_output_matrix_); +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9e10da1246..4f2eca0295 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -6,7 +6,11 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) - + + add_executable(l2norm-layer-test l2norm-layer-test.cpp) + target_link_libraries(l2norm-layer-test galois_gnn) + add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) + add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp new file mode 100644 index 0000000000..a66c419a7f --- /dev/null +++ b/libgnn/test/l2norm-layer-test.cpp @@ -0,0 +1,85 @@ +#include "galois/Logging.h" +#include "galois/GNNMath.h" +#include "galois/layers/L2NormLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // input/output columns must be same in softmax + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 2; + dimension_0.output_columns = 2; + + std::vector l2_input(14, 0.0); + l2_input[0] = 4; + l2_input[1] = 3; + l2_input[2] = 4; + l2_input[3] = 3; + l2_input[4] = 4; + l2_input[5] = 3; + l2_input[6] = 4; + l2_input[7] = 3; + l2_input[8] = 4; + l2_input[9] = 3; + l2_input[10] = 4; + l2_input[11] = 3; + l2_input[12] = 4; + l2_input[13] = 3; + + auto l2_layer = + std::make_unique(2, test_graph, dimension_0); + galois::PointerWithSize normed = + l2_layer->ForwardPhase(l2_input); + + // only go up to 5 because training set + for (size_t row = 0; row < 5; row++) { + GALOIS_LOG_VASSERT(std::abs(normed[row * 2] - 0.8) < 0.0001, + "input 4 should become 0.8 not {}, index {}", + normed[row * 2], row * 2); + GALOIS_LOG_VASSERT(std::abs(normed[row * 2 + 1] - 0.6) < 0.0001, + "input 3 should become 0.6 not {}, index {}", + normed[row * 2 + 1], row * 2 + 1); + } + // only go up to 5 because training set + for (size_t row = 5; row < 7; row++) { + GALOIS_LOG_VASSERT(std::abs(normed[row * 2] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2); + GALOIS_LOG_VASSERT(std::abs(normed[row * 2 + 1] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2 + 1); + } + + // backward + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + galois::PointerWithSize grads = + l2_layer->BackwardPhase(l2_input, 
&dummy_ones); + float out_4 = (-3.0 / 125.0); + float out_3 = (4.0 / 125.0); + for (size_t row = 0; row < 5; row++) { + GALOIS_LOG_VASSERT(std::abs(grads[row * 2] - out_4) < 0.0001, + "index {} grad 4 gradient should be {} not {}", row * 2, + out_4, grads[row * 2]); + GALOIS_LOG_VASSERT(std::abs(grads[row * 2 + 1] - out_3) < 0.0001, + "index {} grad 3 gradient should be {} not {}", + row * 2 + 1, out_3, grads[row * 2 + 1]); + } + + for (size_t row = 5; row < 7; row++) { + GALOIS_LOG_VASSERT(std::abs(grads[row * 2] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2); + GALOIS_LOG_VASSERT(std::abs(grads[row * 2 + 1] - 0.0) < 0.0001, + "index {} should be 0, not part of train", row * 2 + 1); + } + + return 0; +} From 0b7c2eb8fbb1a9bdba7135d3db31a749689a4855 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 23 Feb 2021 14:01:40 -0600 Subject: [PATCH 475/660] SAGE Layer completed + tests This commit finishes up the implementation of the SAGE layer's backward phase. One thing to note is that if dropout is enabled, then the dropout matrix needs to be used during backward phase computations and not the original. This needs to be fixed in the other existing layers. GNNMath has a new version of the SGEMM routine that aggregates directly into the output matrix: this is useful for the SAGE split matrix. --- libgnn/include/galois/GNNMath.h | 5 + libgnn/include/galois/layers/SAGELayer.h | 13 ++ libgnn/src/GNNMath.cpp | 13 +- libgnn/src/GraphNeuralNetwork.cpp | 28 ++- libgnn/src/layers/SAGELayer.cpp | 88 ++++++-- libgnn/test/CMakeLists.txt | 4 + libgnn/test/sage-layer-test.cpp | 272 +++++++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 13 +- 8 files changed, 407 insertions(+), 29 deletions(-) create mode 100644 libgnn/test/sage-layer-test.cpp diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 231d437836..9e17a448fc 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -82,4 +82,9 @@ void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, size_t input_rows, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate); + } // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 9dcd53d9c6..a489913ef5 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -43,6 +43,11 @@ class SAGELayer : public GNNLayer { } } + //! Returns the 2nd set of weight gradients + const PointerWithSize GetLayerWeightGradients2() { + return p_layer_weight_gradients_2_; + } + // Parent functions const PointerWithSize ForwardPhase(const PointerWithSize input_embeddings) final; @@ -79,9 +84,17 @@ class SAGELayer : public GNNLayer { GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + //! Same as above but uses the second set of weights (self feature weights) + void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, + GNNFloat* output); + + //! override parent function: optimizes the second set of weights as well + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); //! 
SAGE config params SAGELayerConfig sage_config_; + //! Need own optimizer for the 2nd weight matrix + std::unique_ptr second_weight_optimizer_; // second set of weights for the concat that may occur std::vector layer_weights_2_; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index dcaaf31a42..38af349a8c 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -87,6 +87,15 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { + CBlasSGEMM(trans_a, trans_b, input_rows, input_columns, output_columns, a, b, + output, false); +} + +void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = @@ -94,6 +103,6 @@ void galois::CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, // do the MM // TODO roll our own sgemm rather than use 3rd party? cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, - input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, 0.0, output, - output_columns); + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, + accumulate ? 1.0 : 0.0, output, output_columns); } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 40e3c8a7e1..0905713dd8 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -1,10 +1,11 @@ #include "galois/GNNMath.h" #include "galois/GraphNeuralNetwork.h" -#include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/DenseLayer.h" -#include "galois/layers/SoftmaxLayer.h" -#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/L2NormLayer.h" #include "galois/layers/SAGELayer.h" +#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SoftmaxLayer.h" galois::GraphNeuralNetwork::GraphNeuralNetwork( std::unique_ptr graph, @@ -56,6 +57,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, *graph_, layer_dims, config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu +#endif + break; + case GNNLayerType::kL2Norm: + gnn_layers_.push_back(std::move(std::make_unique( + i, *graph_, layer_dims, config_.default_layer_config()))); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) l2 layer gpu #endif break; case GNNLayerType::kDense: @@ -68,10 +76,18 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + } - if (i == config_.num_intermediate_layers() - 1) { - // last layer before output layer should never have activation - gnn_layers_.back()->DisableActivation(); + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + galois::gDebug("Disabling activation on layer ", + (*back_iter)->layer_number(), "\n"); + (*back_iter)->DisableActivation(); + break; } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 461f23bd99..79e757e93c 100644 --- 
a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -22,6 +22,9 @@ galois::SAGELayer::SAGELayer(size_t layer_num, p_layer_weights_2_ = PointerWithSize(layer_weights_2_); p_layer_weight_gradients_2_ = PointerWithSize(layer_weight_gradients_2_); + // initialize the optimizer + std::vector weight_size = {num_weight_elements}; + second_weight_optimizer_ = std::make_unique(weight_size, 1); } size_t num_input_elements = @@ -112,10 +115,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( if (!sage_config_.disable_concat) { // FW1 is unaffected by the agg/update flip, so can to it // separately - SelfFeatureUpdateEmbeddings(input_data, p_out_temp_.data()); - // add result to the output matrix: FW1 + AFW2 - MatrixAdd(layer_dimensions_.input_rows, p_out_temp_, - &p_forward_output_matrix_); + SelfFeatureUpdateEmbeddings(input_data, p_forward_output_matrix_.data()); } if (!config_.disable_activation) { @@ -125,6 +125,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + return p_forward_output_matrix_; } @@ -138,7 +139,29 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( ActivationDerivative(input_gradient); } + // if dropout was used, use the dropout matrix for the input + galois::PointerWithSize input_to_use; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_to_use = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_to_use = prev_layer_input; + } + // AFW = O + if (!sage_config_.disable_concat) { + // Fw1 + AFW2 = O; self feature has own weight matrix and makes own + // contribution to gradients which is handled in this block + // !!!! do this early because p_in_temp may get overwritten later + // if update occurs before aggregate !!! 
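+ // In matrix terms this computes (self weight gradient) = F^T O', where F is
+ // the (possibly dropout'd) input features and O' is the incoming gradient;
+ // (input_columns x input_rows) times (input_rows x output_columns) matches
+ // the dimensions of the second (self feature) weight matrix.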
+ galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_to_use.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); + } // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -157,8 +180,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), &input_column_intermediates_, true); - // TODO if training A, then A' compute here if layer # is 0 - // dot product of edges that exist in A } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there @@ -180,8 +201,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif } else { - // TODO at this point, out_temp contains memoized FW - // can use it to get A' = O' (FW)^T // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' @@ -195,24 +214,35 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // TODO put this in a function // W' = F^T (FW)' + // input to use is not overwritten in this branch so it's safe to use #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_to_use.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, + layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_to_use.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif } + if (!sage_config_.disable_concat) { + if (layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + } + + // TODO(loc) sync both weight matrices WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { @@ -371,7 +401,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output); + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU } #endif @@ -399,3 +429,31 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, } #endif } + +void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { + assert(p_layer_weights_.size() == + layer_dimensions_.input_columns * layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + // TODO gpu self +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + // true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + 
layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU +#endif +} + +void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, + size_t trainable_layer_number) { + optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, + trainable_layer_number); + if (!sage_config_.disable_concat) { + second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, + p_layer_weights_2_, 0); + } +} diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 4f2eca0295..853c5a22f9 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -7,6 +7,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(convlayer-test galois_gnn) add_test(NAME convlayer-test COMMAND convlayer-test) + add_executable(sage-layer-test sage-layer-test.cpp) + target_link_libraries(sage-layer-test galois_gnn) + add_test(NAME sage-layer-test COMMAND sage-layer-test) + add_executable(l2norm-layer-test l2norm-layer-test.cpp) target_link_libraries(l2norm-layer-test galois_gnn) add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp new file mode 100644 index 0000000000..dadc8b0096 --- /dev/null +++ b/libgnn/test/sage-layer-test.cpp @@ -0,0 +1,272 @@ +//! @file sage-layer-test.cpp +//! Sage layer test + +#include "galois/Logging.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); + galois::SAGELayerConfig scon; + scon.disable_concat = false; + + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, dimension_0, dcon, + scon); + layer_0->InitAllWeightsTo1(); + // sage weights for self + layer_0->InitSelfWeightsTo1(); + + // make sure it runs in a sane manner + const galois::PointerWithSize layer_0_forward_output = + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 9, "{} should be 6", + layer_0_forward_output[2]); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 45); 
+ GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 33); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + galois::PointerWithSize layer_0_backward_output = + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + //////////////////////////////////////////////////////////////////////////////// + //// sanity check layer 0 backward output; all 0 because layer 0 + //////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 3 + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); + GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + + galois::PointerWithSize layer_0_weight_gradients = + layer_0->GetLayerWeightGradients(); + galois::PointerWithSize layer_0_weight_gradients_2 = + layer_0->GetLayerWeightGradients2(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_0_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_0_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[4] == 21); + + layer_0.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + + auto layer_1 = std::make_unique(1, test_graph, dimension_0, + dcon, scon); + layer_1->InitAllWeightsTo1(); + layer_1->InitSelfWeightsTo1(); + + galois::PointerWithSize layer_1_forward_output = + 
layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_1_forward_output[0] == 3, "{} should be 3", + layer_1_forward_output[0]); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_1_forward_output[2] == 9, "{} should be 6", + layer_1_forward_output[2]); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 33); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 4); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); + + galois::PointerWithSize layer_1_weight_gradients_2 = + layer_1->GetLayerWeightGradients2(); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_1_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[2] == 
21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[4] == 21); + + layer_1.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + config.disable_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + auto layer_2 = std::make_unique(1, test_graph, dimension_0, + config, scon); + galois::PointerWithSize l2_fo = + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + galois::PointerWithSize l2_bo = + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + + return 0; +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index c48f0b41b4..dbddb552e2 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -42,12 +42,13 @@ llvm::cl::list cl_layer_types( "layerTypes", cll::desc("Comma separated list of layer types specifying " "intermediate layers (does not include output)"), - cll::values(clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", - "Graph Convolutional Layer (default)"), - clEnumValN(galois::GNNLayerType::kSAGE, "sage", - "SAGE layer (GCN with concat + mean)"), - clEnumValN(galois::GNNLayerType::kDense, "dense", - "Dense Layer")), + cll::values( + clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", + "Graph Convolutional Layer (default)"), + clEnumValN(galois::GNNLayerType::kSAGE, "sage", + "SAGE layer (GCN with concat + mean)"), + clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), + clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::CommaSeparated); llvm::cl::opt From 8ee70932ef08ce870a771e06ca48061c57ee7279 Mon Sep 17 00:00:00 2001 
From: Loc Hoang Date: Tue, 23 Feb 2021 14:21:11 -0600 Subject: [PATCH 476/660] Fixed dropout derivative bug in Dense/GCN layers Backward prop if dropout is on needs to be done with the dropout'd matrix and not the original one. This is already fixed in the SAGE layer (and was the reason why SAGE was giving me issues for a week). --- libgnn/src/layers/DenseLayer.cpp | 12 ++++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 21 +++++++++++++------ 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index b767805a6a..b2da6bf010 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -67,10 +67,20 @@ galois::PointerWithSize galois::DenseLayer::BackwardPhase( p_backward_output_matrix_.data()); } + galois::PointerWithSize input_data; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } + // W' = F^T (FW)' galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); // sync weight gradients; note aggregation sync occurs in the function call // already diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 81bebfd8e2..70e37ab23c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -100,6 +100,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // AFW = O + galois::PointerWithSize input_data; + if (!config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -160,15 +169,15 @@ galois::GraphConvolutionalLayer::BackwardPhase( if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, prev_layer_input.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - prev_layer_input.data(), p_out_temp_.data(), - p_layer_weight_gradients_.data()); + galois::CBlasSGEMM(CblasTrans, CblasNoTrans, + layer_dimensions_.input_columns, + layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif From 95d8c079e0f909a117b58d32afb75cce25c85734 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:08:14 -0600 Subject: [PATCH 477/660] VectorMultAdd Fused multiply-add of vectors in GNNMath. 
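To be concrete about the semantics: the new routine is an element-wise
multiply-accumulate, output[i] = a[i] + b[i] * b_scale. A minimal scalar sketch
of what the vectorized path computes (GNNFloat is the library's floating point
typedef; the reference name here is only for illustration):

    // Scalar reference for galois::VectorMulAdd; the AVX2 path in this patch
    // does the same arithmetic eight floats at a time and uses this loop for
    // the unaligned tail.
    void VectorMulAddReference(size_t length, const GNNFloat* a,
                               const GNNFloat* b, const GNNFloat b_scale,
                               GNNFloat* output) {
      for (size_t i = 0; i < length; ++i) {
        output[i] = a[i] + b[i] * b_scale;
      }
    }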
--- libgnn/include/galois/GNNMath.h | 4 ++++ libgnn/src/GNNMath.cpp | 36 +++++++++++++++++++++++++++++++++ 2 files changed, 40 insertions(+) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 9e17a448fc..e32d062cc5 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -13,6 +13,10 @@ size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Can be called in parallel sections as its sigle threaded code void VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +//! Given 2 float array pointers, do element wise addition of length elements +//! while scaling the second vector with a multiplier +void VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + const GNNFloat b_scale, GNNFloat* output); //! Does a softmax operation on the input vector and saves result to output //! vector; single threaded so it can be called in a parallel section diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 38af349a8c..fe14198d83 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -35,6 +35,8 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, output[i] = a[i] + b[i]; } #else + galois::gWarn("No vectorization support on this machine! Falling back to " + "simple for loop"); // no vector -> trivial loop add for (size_t i = 0; i < length; ++i) { output[i] = a[i] + b[i]; @@ -42,6 +44,40 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, #endif } +void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, + const GNNFloat b_scale, GNNFloat* output) { +#ifdef __AVX2__ + constexpr size_t vectorization_length = + 8; // for 32-bit floating point in AVX2; TODO AVX512 + // can only do up to a particular multiple due to alignment + // create scale vector for b + __m128 scale_vec_half = _mm_set_ps(b_scale, b_scale, b_scale, b_scale); + __m256 scale_vec_main = _mm256_castps128_ps256(scale_vec_half); + scale_vec_main = _mm256_insertf128_ps(scale_vec_main, scale_vec_half, 1); + + const size_t aligned_end = length - length % vectorization_length; + // do add via vector ops + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm256_storeu_ps( + &output[i], + _mm256_add_ps(_mm256_loadu_ps(&a[i]), + _mm256_mul_ps(scale_vec_main, _mm256_loadu_ps(&b[i])))); + } + + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#else + galois::gWarn("No vectorization support on this machine! Falling back to " + "simple for loop"); + // no vector -> trivial loop add + for (size_t i = 0; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#endif +} + void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, GNNFloat* output) { const GNNFloat max_element = From 9cba4e28a6bf775cf5a1ca9b7a413d49e8be6519 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:14:30 -0600 Subject: [PATCH 478/660] GCN layer aggregation uses fused multiply add Gets rid of the need for per thread storage (very slow) and on demand alloc of an intermediate vector by using a fused multiply add for aggregation. Also adds a few timers to the layer. 
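Concretely, the per-edge inner update in AggregateAllCPU goes from staging a
scaled copy of the destination row in per-thread storage to a single fused
call; the before/after pattern, consolidated from the hunk below:

    // Before: per-thread temporary plus two passes over the row.
    //   for (size_t i = 0; i < column_length; i++)
    //     intermediate[i] = norm_scale * node_embeddings[index_to_dst_feature + i];
    //   galois::VectorAdd(column_length, &aggregate_output[index_to_src_feature],
    //                     intermediate, &aggregate_output[index_to_src_feature]);
    // After: one pass, no temporary, same arithmetic.
    galois::VectorMulAdd(column_length, &aggregate_output[index_to_src_feature],
                         &node_embeddings[index_to_dst_feature], norm_scale,
                         &aggregate_output[index_to_src_feature]);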
--- .../galois/layers/GraphConvolutionalLayer.h | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 46 +++++++++---------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index c677389df7..47980dcd0c 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -32,6 +32,7 @@ class GraphConvolutionalLayer : public GNNLayer { PointerWithSize* input_gradient) final; private: + static const constexpr char* kRegionName = "GCNLayer"; // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; @@ -71,7 +72,6 @@ class GraphConvolutionalLayer : public GNNLayer { void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); - #ifdef GALOIS_ENABLE_GPU GCNGPUAllocations gpu_object_; #endif diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 70e37ab23c..bbf42a47b0 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -44,6 +44,8 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( const galois::PointerWithSize galois::GraphConvolutionalLayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); @@ -85,6 +87,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + timer.stop(); return p_forward_output_matrix_; } @@ -92,6 +95,9 @@ galois::PointerWithSize galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { + galois::StatTimer timer("BackwardPhase", kRegionName); + timer.start(); + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation @@ -193,6 +199,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( DoDropoutDerivative(); } + timer.stop(); return p_backward_output_matrix_; } @@ -201,6 +208,9 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { + galois::StatTimer timer("Aggregate", kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AggregateAllGPU( @@ -213,12 +223,13 @@ void galois::GraphConvolutionalLayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts) { + galois::substrate::PerThreadStorage>*) { size_t num_nodes = graph_.size(); galois::do_all( @@ -282,17 +293,10 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); - // scale the value on the destination by the combined norm term - assert(pts->getLocal()->size() == column_length); - GNNFloat* intermediate = 
pts->getLocal()->data(); - for (size_t i = 0; i < column_length; i++) { - intermediate[i] = - norm_scale * node_embeddings[index_to_dst_feature + i]; - } - // add intermediate instead of original feature - galois::VectorAdd( + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], - intermediate, &aggregate_output[index_to_src_feature]); + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { // add dst feature to aggregate output galois::VectorAdd(column_length, @@ -301,18 +305,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( &aggregate_output[index_to_src_feature]); } } - - // GNNFloat* intermediate = pts->getLocal()->data(); - // GNNFloat norm_scale = source_norm * source_norm; - // for (size_t i = 0; i < column_length; i++) { - // intermediate[i] = - // norm_scale * node_embeddings[index_to_src_feature + i]; - // } - // // add self - // galois::VectorAdd(column_length, - // &aggregate_output[index_to_src_feature], - // intermediate, - // &aggregate_output[index_to_src_feature]); }, galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync @@ -321,6 +313,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( void galois::GraphConvolutionalLayer::UpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXform", kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( @@ -337,10 +332,14 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXform", kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -360,4 +359,5 @@ void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } From c89ad4638dca6b2cb465de85105a3057224b326a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:40:57 -0600 Subject: [PATCH 479/660] Preliminary timers Some timers added to GNN and GNNLayer --- libgnn/src/GraphNeuralNetwork.cpp | 4 ++++ libgnn/src/layers/GNNLayer.cpp | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 0905713dd8..0badb4f312 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -148,6 +148,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { + const std::string t_name = "Epoch" + std::to_string(epoch); + galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); + epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch // graph_->UniformNodeSample(); @@ -176,6 +179,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); } + epoch_timer.stop(); // TODO validation and test as necessary } graph_->CalculateFullNormFactor(); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 6deab8e682..2018c4f5c5 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp 
@@ -138,6 +138,8 @@ void galois::GNNLayer::DoDropoutCPU( void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { + galois::StatTimer timer("ForwardDropout", "GNNLayer"); + timer.start(); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, @@ -148,9 +150,12 @@ void galois::GNNLayer::DoDropout( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GNNLayer::DoDropoutDerivative() { + galois::StatTimer timer("BackwardDropout", "GNNLayer"); + timer.start(); assert(backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -172,9 +177,13 @@ void galois::GNNLayer::DoDropoutDerivative() { #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::GNNLayer::Activation() { + galois::StatTimer timer("ForwardActivation", "GNNLayer"); + timer.start(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( @@ -184,10 +193,14 @@ void galois::GNNLayer::Activation() { std::max(forward_output_matrix_.at(i), static_cast(0)); }, galois::loopname("ReLU")); + timer.stop(); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + timer.start(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output is greater than 0 @@ -200,6 +213,7 @@ void galois::GNNLayer::ActivationDerivative( : static_cast(0); }, galois::loopname("ReLU-Derivative")); + timer.stop(); } void galois::GNNLayer::WeightGradientSyncSum() { From c279dff4d7685365489560fb349cc4d41ad49a1a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 24 Feb 2021 16:50:15 -0600 Subject: [PATCH 480/660] SAGE aggregation: fused multiply add Like GCN layer, get rid of need for intermediate vector with fused multiply add. 
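As with the GCN change, this is safe because scaling into a temporary and then
adding it in performs exactly the same per-element multiply and add as the
fused call; only the intermediate store (and the per-thread buffer behind it)
goes away, so results are unchanged. With acc the source row being accumulated
into, x the destination row, and s the norm scale:

    // old: tmp[i] = s * x[i]; acc[i] += tmp[i]   is element-wise identical to
    // new: acc[i] += s * x[i]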
--- libgnn/src/layers/SAGELayer.cpp | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 79e757e93c..dfbd006eba 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -284,7 +284,7 @@ void galois::SAGELayer::AggregateAll( void galois::SAGELayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts, + galois::substrate::PerThreadStorage>*, bool is_backward) { size_t num_nodes = graph_.size(); @@ -346,17 +346,10 @@ void galois::SAGELayer::AggregateAllCPU( norm_scale = graph_.DegreeNorm(dst); } - // scale the value on the destination by the combined norm term - assert(pts->getLocal()->size() == column_length); - GNNFloat* intermediate = pts->getLocal()->data(); - for (size_t i = 0; i < column_length; i++) { - intermediate[i] = - norm_scale * node_embeddings[index_to_dst_feature + i]; - } - // add intermediate instead of original feature - galois::VectorAdd( + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], - intermediate, &aggregate_output[index_to_src_feature]); + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { // add dst feature to aggregate output galois::VectorAdd(column_length, From 34199abaee13d25aab0859cbe35bdd9abe12a02c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 25 Feb 2021 18:22:05 -0600 Subject: [PATCH 481/660] Fix GNN CVC by fixing the edge master function Edge assign function for GNN's CVC was wrong: need to account for non-contiguous rows. This commit fixes that. Also adds aggregate sync test for CVC (long overdue) and adds the original CVC as an option to the gcn app. --- .../galois/graphs/GenericPartitioners.h | 12 +- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/src/graphs/GNNGraph.cpp | 7 +- libgnn/test/aggregate-sync-test.cpp | 161 +++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 6 +- 5 files changed, 180 insertions(+), 8 deletions(-) diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index 006faea862..b02d2c9594 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -981,6 +981,11 @@ class GnnCVC : public galois::graphs::CustomMasterAssignment { //! Returns the grid column ID of the specified host unsigned gridColumnID(unsigned id) const { return (id % numColumnHosts); } + //! Find the row of a particular node + unsigned getRowOfNode(uint64_t gid) const { + return gridRowID(retrieveMaster(gid)); + } + //! 
Find the column of a particular node unsigned getColumnOfNode(uint64_t gid) const { return gridColumnID(retrieveMaster(gid)); @@ -1009,9 +1014,10 @@ class GnnCVC : public galois::graphs::CustomMasterAssignment { uint32_t retrieveMaster(uint32_t gid) const { return _globalHostMap[gid]; } - uint32_t getEdgeOwner(uint32_t, uint32_t dst, uint64_t) const { - int i = getColumnOfNode(dst); - return _h_offset + i; + uint32_t getEdgeOwner(uint32_t src, uint32_t dst, uint64_t) const { + unsigned blockedRowOffset = getRowOfNode(src) * numColumnHosts; + unsigned cyclicColumnOffset = getColumnOfNode(dst); + return blockedRowOffset + cyclicColumnOffset; } bool noCommunication() { return false; } diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index b69eb43ea2..5f4a337845 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -28,7 +28,7 @@ struct GNNRange { namespace graphs { //! Possible partitioning schemes for the GNN graph -enum class GNNPartitionScheme { kOEC, kCVC }; +enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! XXX class GNNGraph { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 919d7340e4..cd76d118e0 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -23,6 +23,9 @@ LoadPartition(const std::string& input_directory, case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + case galois::graphs::GNNPartitionScheme::kOCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -74,7 +77,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, sync_substrate_ = std::make_unique>( *partitioned_graph_, host_id_, - galois::runtime::getSystemNetworkInterface().Num, false); + galois::runtime::getSystemNetworkInterface().Num, false, + partitioned_graph_->cartesianGrid()); // read in entire graph topology ReadWholeGraph(dataset_name); @@ -163,7 +167,6 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; // XXX bitset setting - // call sync sync_substrate_->sync( "GraphAggregateSync"); } diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index d13674f1a2..888f2ca69f 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -22,6 +22,11 @@ int main() { test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); } } + for (auto own = test_graph->begin_owned(); own != test_graph->end_owned(); + own++) { + galois::gPrint(test_graph->host_prefix(), "Node owned GID ", + test_graph->GetGID(*own), "\n"); + } // create same layer from convlayer-test and make sure result is the same even // in multi-host environment @@ -31,6 +36,7 @@ int main() { dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; l_config.disable_aggregate_after_update = false; + l_config.DebugConfig(); // create the layer, no norm factor std::unique_ptr layer_0 = @@ -197,6 +203,159 @@ int main() { GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth); } } + ////////////////////////////////////////////////////////////////////////////// + auto test_graph_2 = std::make_unique( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + // print edges for sanity + 
for (size_t node = 0; node < test_graph_2->size(); node++) { + for (auto e = test_graph_2->EdgeBegin(node); + e != test_graph_2->EdgeEnd(node); e++) { + galois::gPrint( + test_graph_2->host_prefix(), "Edge ", test_graph_2->GetGID(node), " ", + test_graph_2->GetGID(test_graph_2->EdgeDestination(e)), "\n"); + } + } + for (auto own = test_graph_2->begin_owned(); own != test_graph_2->end_owned(); + own++) { + galois::gPrint(test_graph_2->host_prefix(), "Node owned GID ", + test_graph_2->GetGID(*own), "\n"); + } + + // create same layer from convlayer-test and make sure result is the same even + // in multi-host environment + dimension_0.input_rows = test_graph_2->size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + l_config.disable_aggregate_after_update = false; + l_config.DebugConfig(); + + // create the layer, no norm factor + layer_0 = std::make_unique( + 0, *(test_graph_2.get()), dimension_0, l_config); + layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + layer_0_forward_output = + layer_0->ForwardPhase(test_graph_2->GetLocalFeatures()); + + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_0_forward_output[row * 2 + c] == ground_truth, + "{} Row {} GID {} need to be {} not {}", + test_graph_2->host_prefix(), row, global_row, + ground_truth, layer_0_forward_output[row * 2 + c]); + } + } + + layer_1 = std::make_unique( + 1, *(test_graph_2.get()), dimension_0, l_config); + layer_1->InitAllWeightsTo1(); + layer_1_forward_output = + layer_1->ForwardPhase(test_graph_2->GetLocalFeatures()); + + // same check for forward as before + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_ASSERT(layer_1_forward_output[row * 2 + c] == ground_truth); + } + } + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(test_graph_2->size() * 2, 1); + layer_1_backward_output = + layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + galois::GNNFloat ground_truth = 0.0; - // XXX TODO CVC + switch (global_row) { + case 0: + case 6: + ground_truth = 2; + break; + case 1: + case 2: + case 3: + case 4: + case 5: + ground_truth = 4; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + + // size 3 columns + for (size_t c = 0; 
c < 3; c++) {
+      GALOIS_LOG_ASSERT((layer_1_backward_output)[row * 3 + c] == ground_truth);
+    }
+  }
 }
diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp
index dbddb552e2..e02adb56dc 100644
--- a/lonestar/libgnnbench/src/Input.cpp
+++ b/lonestar/libgnnbench/src/Input.cpp
@@ -21,7 +21,9 @@ llvm::cl::opt partition_scheme(
     cll::values(clEnumValN(galois::graphs::GNNPartitionScheme::kOEC, "oec",
                            "Outgoing Edge-Cut (default)"),
                 clEnumValN(galois::graphs::GNNPartitionScheme::kCVC, "cvc",
-                           "Cartesian Vertex-Cut")),
+                           "Cartesian Vertex-Cut"),
+                clEnumValN(galois::graphs::GNNPartitionScheme::kOCVC, "ocvc",
+                           "Original Cartesian Vertex-Cut")),
     cll::init(galois::graphs::GNNPartitionScheme::kOEC));

 llvm::cl::opt num_layers(
@@ -118,6 +120,8 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) {
     return "oec";
   case galois::graphs::GNNPartitionScheme::kCVC:
     return "cvc";
+  case galois::graphs::GNNPartitionScheme::kOCVC:
+    return "ocvc";
   default:
     GALOIS_LOG_FATAL("Invalid partitioning scheme");
     return "";

From f71e418ecc2016e0ac7acf2b696e35d2690dc108 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Thu, 25 Feb 2021 20:28:13 -0600
Subject: [PATCH 482/660] Support for non-existent labels (-1)

If any bit of a label bitset for single-class graphs is -1, then that
node has no label. This is now supported in the code.
---
 libgnn/include/galois/graphs/GNNGraph.h | 7 ++++++-
 libgnn/src/graphs/GNNGraph.cpp          | 6 ++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h
index 5f4a337845..02cef8e621 100644
--- a/libgnn/include/galois/graphs/GNNGraph.h
+++ b/libgnn/include/galois/graphs/GNNGraph.h
@@ -109,7 +109,12 @@ class GNNGraph {
   //! class labels.
   GNNFloat GetSingleClassLabel(const unsigned lid) const {
     assert(using_single_class_labels_);
-    return local_ground_truth_labels_[lid];
+    if (local_ground_truth_labels_[lid] != num_label_classes_) {
+      return local_ground_truth_labels_[lid];
+    } else {
+      GALOIS_LOG_FATAL(
+          "should not get the label of a node that has no ground truth");
+    }
   }

   //!
Returns pointer to start of ground truth vector for some local id assuming diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cd76d118e0..3ec30ec57f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -330,6 +330,12 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, label_stream >> cur_bit; if (has_single_class_label) { + // no label + if (cur_bit == = -1) { + local_ground_truth_labels_[cur_lid] = num_label_classes_; + break; + } + // in single class, only 1 bit is set in bitset; that represents the // class to take if (cur_bit != 0) { From 2543fb976c458fdfb9be2db3313ac7be783f75f8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Feb 2021 15:01:41 -0600 Subject: [PATCH 483/660] (attempted) -1 fixing for graph label reading Untested -1 reading fix; issue was unsigned being used to load -1 --- libgnn/src/graphs/GNNGraph.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 3ec30ec57f..fd87c08fb6 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -323,7 +323,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); // read line as bitset of 0s and 1s std::istringstream label_stream(read_line); - unsigned cur_bit; + int cur_bit; // bitset size is # of label classes for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { // read a bit @@ -331,7 +331,7 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, if (has_single_class_label) { // no label - if (cur_bit == = -1) { + if (cur_bit == -1) { local_ground_truth_labels_[cur_lid] = num_label_classes_; break; } From 5bff7cced8f30822b7a09d29fe85b5bda0fd2296 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 27 Feb 2021 18:29:10 -0600 Subject: [PATCH 484/660] Val/test intervals added to training + timers Can now specify intervals where training will evaluate val/test set and return accuracy in order to track progress. Added a lot more kinds of timers and stats that break down epoch time and accuracy for the stat file (prep for paper result collection). --- libgnn/include/galois/GraphNeuralNetwork.h | 6 + libgnn/src/GraphNeuralNetwork.cpp | 132 +++++++++++++++------ lonestar/gnn/distributed/gcn/gcn-dist.cpp | 4 - lonestar/libgnnbench/src/Input.cpp | 14 ++- 4 files changed, 115 insertions(+), 41 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ae860d0d32..3df6fbe94e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -103,6 +103,10 @@ class GraphNeuralNetworkConfig { bool do_sampling_{false}; //! Inductive = training ignores test/val set bool inductive_training_{false}; + //! Interval to run validation set on network at; 0 = no run + unsigned validation_interval_{0}; + //! Interval to run testing set on network at; 0 = no run + unsigned test_interval_{0}; private: //! Number of layers to construct in the GNN not including the output @@ -198,6 +202,8 @@ class GraphNeuralNetwork { #ifdef GALOIS_ENABLE_GPU //! 
Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; + // Used to copy predictions from gpu over + std::vector cpu_pred_; #endif }; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 0badb4f312..7955d9e92f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -131,7 +131,6 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); - std::vector cpu_pred; float train_accuracy{0.f}; /* @@ -142,68 +141,113 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } */ + bool altered_norm_factor = + config_.inductive_training_ || config_.do_sampling(); + if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::string t_name = "Epoch" + std::to_string(epoch); + const std::string t_name = "TrainEpoch" + std::to_string(epoch); + const std::string t_name_acc = t_name + "Accuracy"; galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch - // graph_->UniformNodeSample(); graph_->GraphSAINTSample(); graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); GradientPropagation(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (cpu_pred.size() != predictions.size()) { - cpu_pred.resize(predictions.size()); - } + epoch_timer.stop(); - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred, predictions); - train_accuracy = GetGlobalAccuracy(cpu_pred); - } else { -#endif - train_accuracy = GetGlobalAccuracy(predictions); -#ifdef GALOIS_ENABLE_GPU - } -#endif + train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, + train_accuracy); } - epoch_timer.stop(); - // TODO validation and test as necessary + + bool do_validate = config_.validation_interval_ + ? epoch % config_.validation_interval_ == 0 + : false; + bool do_test = + config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + + // get real norm factor back if altered by sampling or inductive training + if ((do_validate || do_test) && altered_norm_factor) { + graph_->CalculateFullNormFactor(); + } + + if (do_validate) { + const std::string v_name = "ValEpoch" + std::to_string(epoch); + const std::string v_name_acc = v_name + "Accuracy"; + galois::StatTimer val_epoch_timer(v_name.c_str(), "GraphNeuralNetwork"); + + val_epoch_timer.start(); + SetLayerPhases(galois::GNNPhase::kValidate); + const PointerWithSize val_pred = DoInference(); + val_epoch_timer.stop(); + + float val_acc = GetGlobalAccuracy(val_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, + "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, + val_acc); + } + } + + if (do_test) { + const std::string test_name = "TestEpoch" + std::to_string(epoch); + const std::string test_name_acc = test_name + "Accuracy"; + galois::StatTimer test_epoch_timer(test_name.c_str(), + "GraphNeuralNetwork"); + + test_epoch_timer.start(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + test_epoch_timer.stop(); + + float test_acc = GetGlobalAccuracy(test_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, + test_acc); + } + } + + if (do_validate || do_test) { + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + // get back inductive norm factor as necessary; sampling norm is handled + // at beginning of every iteration + if (config_.inductive_training_ && !config_.do_sampling()) { + graph_->CalculateSpecialNormFactor(false, true); + } + } + } + + if (altered_norm_factor) { + graph_->CalculateFullNormFactor(); } - graph_->CalculateFullNormFactor(); + // check test accuracy - galois::StatTimer acc_timer("FinalAccuracyTest"); - acc_timer.start(); + galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); - float global_accuracy{0.0}; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred, predictions); - global_accuracy = GetGlobalAccuracy(cpu_pred); - } else { -#endif - global_accuracy = GetGlobalAccuracy(predictions); -#ifdef GALOIS_ENABLE_GPU - } -#endif - acc_timer.stop(); + float global_accuracy = GetGlobalAccuracy(predictions); + test_timer.stop(); if (this_host == 0) { galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + galois::runtime::reportStat_Single("GraphNeuralNetwork", + "FinalTestAccuracy", global_accuracy); } return global_accuracy; @@ -223,7 +267,23 @@ galois::GraphNeuralNetwork::DoInference() { float galois::GraphNeuralNetwork::GetGlobalAccuracy( PointerWithSize predictions) { - return graph_->GetGlobalAccuracy(predictions, phase_, config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred_.size() != predictions.size()) { + cpu_pred_.resize(predictions.size()); + } + + // TODO get rid of CPU copy here if possible + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred_, predictions); + return graph_->GetGlobalAccuracy(cpu_pred_, phase_, config_.do_sampling()); + } else { +#endif + 
return graph_->GetGlobalAccuracy(predictions, phase_, + config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + } +#endif } void galois::GraphNeuralNetwork::GradientPropagation() { diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index 65fe1338cc..e3dd1cac77 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -15,11 +15,7 @@ int main(int argc, char* argv[]) { galois::StatTimer compute_timer("Timer_0"); compute_timer.start(); - - galois::StatTimer train_timer("TrainingTime"); - train_timer.start(); gnn->Train(num_epochs); - train_timer.stop(); compute_timer.stop(); return 0; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index e02adb56dc..d8975204c5 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,6 +114,16 @@ llvm::cl::opt "all non-train nodes are ignored"), cll::init(false)); +llvm::cl::opt + val_interval("valInterval", + cll::desc("# of epochs to test validation set (default 0)"), + cll::init(0)); + +llvm::cl::opt + test_interval("testInterval", + cll::desc("# of epochs to test test set (default 0)"), + cll::init(0)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -245,7 +255,9 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; + gnn_config.inductive_training_ = do_inductive_training; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From b7a01bdb56a9f78fb98c9833710d64cf255614f7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Mar 2021 15:57:11 -0600 Subject: [PATCH 485/660] Fixed aggregate sync in GNNs The buffer wrapper I was using to save a copy of data during sync is incorrect as the memory can potentially get written before it is actually serialized into a message. This was leading to inconsistent and wrong results for GNN training. This commit changes it to use a gstl vector. There is now some copy overhead due to this, but the tradeoff is actual correct execution. Some small fixes to the aggregate sync test as well. --- .../graphs/GraphAggregationSyncStructures.h | 19 ++++++++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 15 +++++++---- libgnn/test/aggregate-sync-test.cpp | 25 +++++++++++++++---- 3 files changed, 44 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 62a5ab14cb..e5dcb970af 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -17,16 +17,18 @@ extern unsigned layer_number_to_sync; #endif struct GNNSumAggregate { - using ValTy = galois::BufferWrapper; + using ValTy = galois::gstl::Vector; //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
// assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec( - &gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_], - gnn_matrix_to_sync_column_length_); + ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + extracted_vec[i] = + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; + } // move constructor should kick in here to avoid return copy return extracted_vec; } @@ -43,8 +45,15 @@ struct GNNSumAggregate { return true; } - //! do nothing (waste of a write) + //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} + // Reset is here in case anyone wants to bring it back + // static void reset(uint32_t node_id, char&) { + // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + // 0; + // } + //} //! element wise set static void setVal(uint32_t node_id, char&, ValTy y) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index bbf42a47b0..7d7667a624 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -230,7 +230,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*) { - size_t num_nodes = graph_.size(); + size_t num_nodes = graph_.size(); + size_t last_master = *(graph_.end_owned()); + assert(0 == *(graph_.begin_owned())); galois::do_all( galois::iterate(static_cast(0), num_nodes), @@ -263,10 +265,13 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // init to self if (!config_.disable_self_aggregate) { - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i] * source_norm * - source_norm; + // only aggregate self once on master + if (src < last_master) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; + } } } diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 888f2ca69f..7025331029 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -6,8 +6,7 @@ int main() { galois::DistMemSys G; if (galois::runtime::getSystemNetworkInterface().Num == 1) { - GALOIS_LOG_ERROR("This test should be run with multiple hosts/processes"); - exit(1); + GALOIS_LOG_WARN("This test should be run with multiple hosts/processes!"); } auto test_graph = std::make_unique( @@ -233,10 +232,26 @@ int main() { layer_0 = std::make_unique( 0, *(test_graph_2.get()), dimension_0, l_config); layer_0->InitAllWeightsTo1(); + // make sure it runs in a sane manner + // galois::PointerWithSize layer_0_forward_output = layer_0_forward_output = layer_0->ForwardPhase(test_graph_2->GetLocalFeatures()); + for (size_t row = 0; row < test_graph_2->size(); row++) { + // row -> GID + size_t global_row = test_graph_2->GetGID(row); + + if (global_row == 1) { + galois::gPrint(test_graph_2->host_prefix(), "GID ", global_row, " local ", + row, " value ", layer_0_forward_output[row * 2], "\n"); + } + if (global_row == 4) { + galois::gPrint(test_graph_2->host_prefix(), "GID ", global_row, " local ", + row, " value ", layer_0_forward_output[row * 2], "\n"); + } + } + for (size_t row = 
0; row < test_graph_2->size(); row++) { // row -> GID size_t global_row = test_graph_2->GetGID(row); @@ -325,10 +340,10 @@ int main() { } } - // since layer isn't 0 anymore, backward phase will actually return something - dummy_ones_v.assign(test_graph_2->size() * 2, 1); + std::vector dummy_ones_v2(test_graph_2->size() * 2, 1); + galois::PointerWithSize dummy_ones2(dummy_ones_v2); layer_1_backward_output = - layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones); + layer_1->BackwardPhase(test_graph_2->GetLocalFeatures(), &dummy_ones2); for (size_t row = 0; row < test_graph_2->size(); row++) { // row -> GID From c983f2cd4e278b15e0f14071393c6a61910ec2af Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 1 Mar 2021 17:29:34 -0600 Subject: [PATCH 486/660] Func to sync layer weight matrix; layer init Added a (currently unused) function that synchronizes entire weight matrix with a set operation. Currently unused because Bengio init will do the same thing on each host. Changed the seed used by Bengio init to include layer number as well to make it so each layer has a different weight set to start with. --- libgnn/include/galois/layers/GNNLayer.h | 3 +++ .../galois/layers/GradientSyncStructures.h | 27 +++++++++++++++++++ libgnn/src/layers/GNNLayer.cpp | 24 ++++++++++++++++- 3 files changed, 53 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 0039683ad4..cfbf81fc51 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -225,6 +225,9 @@ class GNNLayer { PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; + //! Synchronizes all weights (used in distributed setting) + void SyncInitialWeights(); + //! Wrapper over gradient matrix to make it compatible with Gluon std::unique_ptr gradient_sync_interface_; //! Synchronization substrate for the weight gradients diff --git a/libgnn/include/galois/layers/GradientSyncStructures.h b/libgnn/include/galois/layers/GradientSyncStructures.h index 32b7a85b82..ad76f514cd 100644 --- a/libgnn/include/galois/layers/GradientSyncStructures.h +++ b/libgnn/include/galois/layers/GradientSyncStructures.h @@ -34,4 +34,31 @@ struct WeightGradientSummation { static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } }; +struct WeightGradientSet { + using ValTy = GNNFloat; + static ValTy extract(uint32_t, ValTy& weight) { return weight; } + static bool reduce(uint32_t, ValTy&, ValTy) { return true; } + + //! reset weight to 0 + static void reset(uint32_t, ValTy& weight) { weight = 0.0; } + + //! 
save weight + static void setVal(uint32_t, ValTy& weight, ValTy y) { weight = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } +}; + } // namespace galois diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 2018c4f5c5..6831ccb0b7 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -80,7 +80,7 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + layer_dimensions_.input_columns); // TODO this seed should be configurable - std::default_random_engine rng(1); + std::default_random_engine rng(1 + layer_number_); std::uniform_real_distribution dist(-max, max); for (size_t i = 0; i < vector_to_init->size(); i++) { @@ -238,3 +238,25 @@ void galois::GNNLayer::WeightGradientSyncAverage() { galois::loopname("WeightGradientSyncAverageDivide")); } } + +void galois::GNNLayer::SyncInitialWeights() { + if (galois::runtime::getSystemNetworkInterface().Num == 1) { + return; + } +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) + GALOIS_LOG_FATAL("Need to implement GPU version of this"); +#endif + // copy weights over to gradients + for (size_t i = 0; i < layer_weights_.size(); i++) { + layer_weight_gradients_[i] = layer_weights_[i]; + } + // sync "gradients" with a set only (reduction ignored) + gradient_sync_substrate_->sync( + "InitialSync"); + // copy "gradients" (actually weights) back to weight matrix + for (size_t i = 0; i < layer_weights_.size(); i++) { + layer_weights_[i] = layer_weight_gradients_[i]; + layer_weight_gradients_[i] = 0; + } +} From 921347891550702d201c1b23cac77c94b551c5b3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Mar 2021 13:19:11 -0600 Subject: [PATCH 487/660] SAGE paired matrix init The 2 matrices used by SAGE layer are technically 1 whole matrix. This commit adds a pair glorot bengio init to work on them as a pair. --- libgnn/include/galois/layers/GNNLayer.h | 4 ++++ libgnn/src/layers/GNNLayer.cpp | 29 ++++++++++++++++++++++++- libgnn/src/layers/SAGELayer.cpp | 5 ++++- 3 files changed, 36 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index cfbf81fc51..ecd79bec34 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -263,6 +263,10 @@ class GNNLayer { //! Code inspired DGL and TinyDNN void GlorotBengioInit(std::vector* vector_to_init); + //! Init 2 things as one unit; used for SAGE + void PairGlorotBengioInit(std::vector* vector1, + std::vector* vector2); + //! 
Randomly init a float vector using the class's random init RNG
   void RandomInitVector(std::vector* vector_to_init);

diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp
index 6831ccb0b7..9f228cad25 100644
--- a/libgnn/src/layers/GNNLayer.cpp
+++ b/libgnn/src/layers/GNNLayer.cpp
@@ -79,7 +79,6 @@ galois::GNNLayer::GNNLayer(size_t layer_num,
 void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
   float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns +
                                          layer_dimensions_.input_columns);
-  // TODO this seed should be configurable
   std::default_random_engine rng(1 + layer_number_);
   std::uniform_real_distribution dist(-max, max);

@@ -93,6 +92,34 @@ void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
 #endif
 }

+void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1,
+                                            std::vector* vector2) {
+  // multiplied by 2 here because 2 pieces are 1 unit
+  float max =
+      std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) +
+                                 layer_dimensions_.input_columns);
+  assert(vector1->size() ==
+         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
+  assert(vector2->size() ==
+         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
+  std::default_random_engine rng(1 + layer_number_);
+  std::uniform_real_distribution dist(-max, max);
+
+  for (size_t i = 0; i < vector1->size(); i++) {
+    (*vector1)[i] = dist(rng);
+  }
+  for (size_t i = 0; i < vector2->size(); i++) {
+    (*vector2)[i] = dist(rng);
+  }
+#ifdef GALOIS_ENABLE_GPU
+  // TODO
+  GALOIS_LOG_FATAL("TODO: copy both not 1");
+  if (device_personality == DevicePersonality::GPU_CUDA) {
+    CopyLayerWeightsToGPU();
+  }
+#endif
+}
+
 void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) {
   galois::do_all(
       galois::iterate(static_cast(0), vector_to_init->size()),
diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp
index dfbd006eba..c9b2a16da7 100644
--- a/libgnn/src/layers/SAGELayer.cpp
+++ b/libgnn/src/layers/SAGELayer.cpp
@@ -17,7 +17,10 @@ galois::SAGELayer::SAGELayer(size_t layer_num,
       layer_dimensions_.input_columns * layer_dimensions_.output_columns;
   layer_weights_2_.resize(num_weight_elements);
   layer_weight_gradients_2_.resize(num_weight_elements, 0);
-  GlorotBengioInit(&layer_weights_2_);
+
+  // reinit both weight matrices as one unit
+  PairGlorotBengioInit(&layer_weights_, &layer_weights_2_);
+
   // update the pointers to them as well as realloc will require it
   p_layer_weights_2_ = PointerWithSize(layer_weights_2_);
   p_layer_weight_gradients_2_ =

From e3bbbc32167ade693d775ade0c0146ba4c4fbb67 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Tue, 2 Mar 2021 14:15:36 -0600
Subject: [PATCH 488/660] SAGE distribution: sync 2nd set of weights

SAGE layer now supports distributed execution: this commit adds the
second sync required for the 2nd set of weights to the backward pass
---
 libgnn/include/galois/layers/SAGELayer.h | 15 +++++++++++++++
 libgnn/src/layers/SAGELayer.cpp          | 11 ++++++++++-
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h
index a489913ef5..431a7f0696 100644
--- a/libgnn/include/galois/layers/SAGELayer.h
+++ b/libgnn/include/galois/layers/SAGELayer.h
@@ -1,5 +1,6 @@
 #pragma once
 #include "galois/layers/GNNLayer.h"
+#include "galois/layers/GradientSyncStructures.h"

 #ifdef GALOIS_ENABLE_GPU
 // TODO(loc/hochan)
@@ -91,6 +92,14 @@ class SAGELayer : public GNNLayer {
   //!
override parent function: optimizes the second set of weights as well void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + //! Sync second set of weight gradients + void WeightGradientSyncSum2() { + // TODO bitset + gradient_sync_substrate_2_ + ->sync( + "WeightGradients2Sync"); + } + //! SAGE config params SAGELayerConfig sage_config_; //! Need own optimizer for the 2nd weight matrix @@ -102,6 +111,12 @@ class SAGELayer : public GNNLayer { PointerWithSize p_layer_weights_2_; PointerWithSize p_layer_weight_gradients_2_; + //! Wrapper over 2nd gradient matrix to make it compatible with Gluon + std::unique_ptr gradient_sync_interface_2_; + //! Synchronization substrate for the 2nd weight gradients + std::unique_ptr> + gradient_sync_substrate_2_; + // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index c9b2a16da7..3c8184faee 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -28,6 +28,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // initialize the optimizer std::vector weight_size = {num_weight_elements}; second_weight_optimizer_ = std::make_unique(weight_size, 1); + + // initialize sync substrate for second set + gradient_sync_interface_2_ = + std::make_unique(layer_weight_gradients_2_); + gradient_sync_substrate_2_ = std::make_unique< + galois::graphs::GluonSubstrate>( + *gradient_sync_interface_2_, + galois::runtime::getSystemNetworkInterface().ID, + galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_input_elements = @@ -165,6 +174,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( input_to_use.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); } + WeightGradientSyncSum2(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -245,7 +255,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } } - // TODO(loc) sync both weight matrices WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { From 97ca9be9dd3aeeb51ffa23b960c458afaff99165 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 2 Mar 2021 16:56:32 -0600 Subject: [PATCH 489/660] Cleanup some prints in cusp and gnn; ogbn100m Lots of trace prints/debug prints being printed that this commit cleans up. Also adds the training boundaries for the ogbn-100M dataset. --- .../include/galois/graphs/DistributedGraph.h | 26 +++--- libcusp/include/galois/graphs/NewGeneric.h | 79 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 8 +- libgnn/src/layers/GNNLayer.cpp | 4 +- libgnn/src/layers/GluonGradientInterface.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cpp | 17 +++- 6 files changed, 75 insertions(+), 63 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index bf70bbf3e2..d13350f848 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -60,16 +60,16 @@ enum MASTERS_DISTRIBUTION { * @tparam NodeTy type of node data for the graph * @tparam EdgeTy type of edge data for the graph */ -template +template class DistGraph { private: //! 
Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; - using GraphTy = galois::graphs::LC_CSR_Graph; + using GraphTy = + galois::graphs::LC_CSR_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -393,10 +393,10 @@ class DistGraph { galois::runtime::reportStatCond_Tmax( GRNAME, "MasterDistTime", timer.get()); - galois::gPrint( + galois::gDebug( "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), - " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)\n"); + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); return numNodes_to_divide; } @@ -866,9 +866,7 @@ class DistGraph { /** * Deallocates underlying LC CSR Graph */ - void deallocate() { - graph.deallocate(); - } + void deallocate() { graph.deallocate(); } /** * Sort the underlying LC_CSR_Graph by ID (destinations) @@ -882,10 +880,10 @@ class DistGraph { galois::no_stats(), galois::loopname("CSREdgeSort"), galois::steal()); } -//////////////////////////////////////////////////////////////////////////////// -// what follows are GNN functions; some are not great (e.g. expose arrays) -// TODO figure out better way to do this -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // what follows are GNN functions; some are not great (e.g. expose arrays) + // TODO figure out better way to do this + //////////////////////////////////////////////////////////////////////////////// EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); } NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); } }; diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 771c5b5143..c29127d9e6 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -119,6 +119,10 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(86618); + } else if (filename.find("ogbn-papers100M") != std::string::npos) { + // this is entire graph: amazon's mask isn't contiguous + bps.push_back(602); + bps.push_back(111052523); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing @@ -224,9 +228,8 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.start(); if (readFromFile) { - galois::gPrint("[", base_DistGraph::id, - "] Reading local graph from file ", localGraphFileName, - "\n"); + galois::gDebug("[", base_DistGraph::id, + "] Reading local graph from file ", localGraphFileName); base_DistGraph::read_local_graph_from_file(localGraphFileName); Tgraph_construct.stop(); return; @@ -312,7 +315,7 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 - galois::gPrint("[", base_DistGraph::id, "] Starting graph reading.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting graph reading."); galois::graphs::BufferedGraph bufGraph; bufGraph.resetReadCounters(); galois::StatTimer graphReadTimer("GraphReading", GRNAME); @@ -321,18 +324,16 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); graphReadTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Reading graph complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Reading graph 
complete."); if (graphPartitioner->masterAssignPhase()) { // loop over all nodes, determine where neighbors are, assign masters galois::StatTimer phase0Timer("Phase0", GRNAME); - galois::gPrint("[", base_DistGraph::id, - "] Starting master assignment.\n"); + galois::gDebug("[", base_DistGraph::id, "] Starting master assignment."); phase0Timer.start(); phase0(bufGraph, cuspAsync, stateRounds); phase0Timer.stop(); - galois::gPrint("[", base_DistGraph::id, - "] Master assignment complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Master assignment complete."); } galois::StatTimer inspectionTimer("EdgeInspection", GRNAME); @@ -447,13 +448,14 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::initializeSpecificRanges(); Tgraph_construct.stop(); - galois::gPrint("[", base_DistGraph::id, "] Graph construction complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", (uint32_t)stateRounds); } + galois::gPrint("[", base_DistGraph::id, "] Dist graph constructed\n"); } private: @@ -1363,7 +1365,7 @@ class NewDistGraphGeneric : public DistGraph { if (async) { if (base_DistGraph::id == 0) { - galois::gPrint("Using asynchronous master determination sends.\n"); + galois::gDebug("Using asynchronous master determination sends."); } hostFinished.resize(base_DistGraph::numHosts); @@ -1381,8 +1383,8 @@ class NewDistGraphGeneric : public DistGraph { #endif if (base_DistGraph::id == 0) { - galois::gPrint("Number of BSP sync rounds in master assignment: ", - stateRounds, "\n"); + galois::gDebug("Number of BSP sync rounds in master assignment: ", + stateRounds); } // galois::PerThreadTimer ptt( @@ -1484,9 +1486,9 @@ class NewDistGraphGeneric : public DistGraph { base_DistGraph::increment_evilPhase(); } - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Local master assignment " - "complete.\n"); + "complete."); // one more step: let masters know of nodes they own (if they don't // have the node locally then this is the only way they will learn about @@ -1498,7 +1500,7 @@ class NewDistGraphGeneric : public DistGraph { recvMastersToOwners(); p0master2ownerTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Received my master mappings.\n"); + galois::gDebug("[", base_DistGraph::id, "] Received my master mappings."); base_DistGraph::increment_evilPhase(); @@ -1543,11 +1545,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // get incoming mirrors ready for creation uint32_t additionalMirrorCount = incomingMirrors.count(); @@ -1646,7 +1647,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = 
base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1677,10 +1678,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1698,7 +1699,7 @@ class NewDistGraphGeneric : public DistGraph { void edgeCutLoad(GraphTy& graph, galois::graphs::BufferedGraph& bGraph) { if (base_DistGraph::id == 0) { - galois::gPrint("Loading edge-data while creating edges\n"); + galois::gDebug("Loading edge-data while creating edges"); } uint64_t globalOffset = base_DistGraph::gid2host[base_DistGraph::id].first; @@ -1728,10 +1729,10 @@ class NewDistGraphGeneric : public DistGraph { galois::steal(), galois::no_stats()); timer.stop(); - galois::gPrint("[", base_DistGraph::id, + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", timer.get_usec() / 1000000.0f, " seconds to read ", bGraph.getBytesRead(), " bytes (", - bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)\n"); + bGraph.getBytesRead() / (float)timer.get_usec(), " MBPS)"); } /** @@ -1764,11 +1765,10 @@ class NewDistGraphGeneric : public DistGraph { inspectionTimer.stop(); // report edge inspection time uint64_t allBytesRead = bufGraph.getBytesRead(); - galois::gPrint( - "[", base_DistGraph::id, - "] Edge inspection time: ", inspectionTimer.get_usec() / 1000000.0f, - " seconds to read ", allBytesRead, " bytes (", - allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)\n"); + galois::gDebug("[", base_DistGraph::id, "] Edge inspection time: ", + inspectionTimer.get_usec() / 1000000.0f, " seconds to read ", + allBytesRead, " bytes (", + allBytesRead / (float)inspectionTimer.get_usec(), " MBPS)"); // old inspection barrier // galois::runtime::getHostBarrier().wait(); @@ -2138,7 +2138,7 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); - galois::gPrint("[", base_DistGraph::id, "] Inspection sends complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection sends complete."); } /** @@ -2218,8 +2218,7 @@ class NewDistGraphGeneric : public DistGraph { } } - galois::gPrint("[", base_DistGraph::id, - "] Inspection receives complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection receives complete."); } /** @@ -2246,7 +2245,7 @@ class NewDistGraphGeneric : public DistGraph { inspectIncomingNodes(hasIncomingEdge, prefixSumOfEdges); finalizeInspection(prefixSumOfEdges); - galois::gPrint("[", base_DistGraph::id, "] Inspection mapping complete.\n"); + galois::gDebug("[", base_DistGraph::id, "] Inspection mapping complete."); return prefixSumOfEdges; } @@ -2598,9 +2597,9 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::BufferedGraph& bufGraph) { if (base_DistGraph::id == 0) { if (std::is_void::value) { - fprintf(stderr, "Loading void edge-data while creating edges.\n"); + galois::gDebug("Loading void edge-data while creating edges."); } else { - fprintf(stderr, "Loading edge-data while creating edges.\n"); + galois::gDebug(stderr, "Loading edge-data while creating edges."); } } @@ -2625,10 +2624,10 @@ class NewDistGraphGeneric : public DistGraph { loadEdgeTimer.stop(); - galois::gPrint("[", base_DistGraph::id, "] Edge loading time: 
", + galois::gDebug("[", base_DistGraph::id, "] Edge loading time: ", loadEdgeTimer.get_usec() / 1000000.0f, " seconds to read ", bufBytesRead, " bytes (", - bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)\n"); + bufBytesRead / (float)loadEdgeTimer.get_usec(), " MBPS)"); } // Edge type is not void. (i.e. edge data exists) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fd87c08fb6..4efae3c429 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -166,7 +166,7 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - // XXX bitset setting + // TODO(loc) bitset setting sync_substrate_->sync( "GraphAggregateSync"); } @@ -184,7 +184,7 @@ void galois::graphs::GNNGraph::AggregateSync( gnn_matrix_to_sync_column_length_ = matrix_column_size; cuda_ctx_for_sync = cuda_ctx_; layer_number_to_sync = layer_number; - // XXX bitset setting + // TODO bitset setting // call sync cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), layer_number); @@ -297,7 +297,9 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, size_t num_nodes; file_stream >> num_nodes >> num_label_classes_ >> std::ws; assert(num_nodes == partitioned_graph_->globalSize()); - galois::gPrint("Number of label classes is ", num_label_classes_, "\n"); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } // allocate memory for labels if (has_single_class_label) { diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 9f228cad25..92c1fa3250 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -244,7 +244,7 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { - // XXX bitset + // TODO bitset gradient_sync_substrate_->sync( "WeightGradientsSync"); } @@ -252,7 +252,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { void galois::GNNLayer::WeightGradientSyncAverage() { size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; if (num_hosts > 1) { - // XXX bitset + // TODO bitset // sum, then average by dividing all by num hosts (every host participates // in sync) gradient_sync_substrate_->sync( diff --git a/libgnn/src/layers/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp index 31770afb4e..74e612af17 100644 --- a/libgnn/src/layers/GluonGradientInterface.cpp +++ b/libgnn/src/layers/GluonGradientInterface.cpp @@ -44,6 +44,6 @@ galois::GluonGradientInterface::GluonGradientInterface( } } - galois::gInfo("[", my_host, "] Weight gradients: this host owns ", - begin_master_, " to ", end_master_); + galois::gDebug("[", my_host, "] Weight gradients: this host owns ", + begin_master_, " to ", end_master_); } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f541b43a18..a268089b33 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,7 +8,12 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - // TODO(loc) once needed for accuracy debugging, print out loss +#ifndef NDEBUG + galois::DGAccumulator loss_accum; + galois::DGAccumulator handled; + loss_accum.reset(); + handled.reset(); +#endif galois::do_all( 
galois::iterate(graph_.begin_owned(), graph_.end_owned()), @@ -22,7 +27,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &forward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -36,11 +40,20 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); +#ifndef NDEBUG + loss_accum += input_loss_[i]; + handled += 1; +#endif } }, // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); +#ifndef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); + size_t t = handled.reduce(); + galois::gPrint("Loss is ", reduced_loss / t, "\n"); +#endif return forward_output_matrix_; } From 7b0d246f6ef22477955b44111d7ce0efc369c7bf Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 3 Mar 2021 13:36:13 -0600 Subject: [PATCH 490/660] aggregate sync test for gnn runs via ctest Added CMake code to run the aggregation sync test with multiple processes. Also changed a few of the other tests to make them run for less time. TODO: weight-sync test is broken, need to fix + add other partitioning policies. --- libgnn/test/CMakeLists.txt | 16 ++++++++++++++-- libgnn/test/epoch-test.cpp | 2 +- libgnn/test/gnngraph-test.cpp | 6 +++--- 3 files changed, 18 insertions(+), 6 deletions(-) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 853c5a22f9..b1170e2d16 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,6 +2,15 @@ add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) +# multihost testing things +set(hosts) +set(host 12) +while (${host} GREATER 1) + list(APPEND hosts ${host}) + math(EXPR host "${host} - 1") +endwhile() +list(APPEND hosts "1") + if (NOT GALOIS_ENABLE_GPU) add_executable(convlayer-test convlayer-test.cpp) target_link_libraries(convlayer-test galois_gnn) @@ -47,13 +56,16 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(multilabel-epoch-test galois_gnn) add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) - # TODO figure out how to make this test run in parallel add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) - #add_test(NAME aggregate-sync-test COMMAND GALOIS_DO_NOT_BIND_THREADS=1 mpirun -n=4 ./aggregate-sync-test) + foreach(host_count ${hosts}) + add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} aggregate-sync-test) + set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() add_executable(weight-sync-test weight-sync-test.cpp) target_link_libraries(weight-sync-test galois_gnn) + # TODO multi host tests add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index 2dbaea3372..ed665684f1 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -43,7 +43,7 @@ int main() { // increasing galois::StatTimer main_timer("Timer_0"); main_timer.start(); - for (size_t epoch = 0; epoch < 100; epoch++) { + for (size_t epoch = 0; epoch < 25; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); 
gnn->GradientPropagation();
     galois::gPrint("Epoch ", epoch, ": Accuracy is ",
diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp
index 5aa4d72ddf..101540f4d5 100644
--- a/libgnn/test/gnngraph-test.cpp
+++ b/libgnn/test/gnngraph-test.cpp
@@ -14,12 +14,12 @@ int main() {
                  galois::runtime::getSystemNetworkInterface().ID,
                  num_threads);

-  // multi level reading tested in another test
+  // note multi level reading tested in another test
   GALOIS_LOG_VERBOSE("reddit with single label, oec");
-  galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kOEC,
+  galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC,
                            true);
   GALOIS_LOG_VERBOSE("reddit with single label, cvc");
-  galois::graphs::GNNGraph("reddit", galois::graphs::GNNPartitionScheme::kCVC,
+  galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC,
                            true);

   return 0;

From f2960b1ac2a5f4c20ddb497bc4e278b15e0f14071393c6a61910ec2af Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Wed, 3 Mar 2021 16:48:20 -0600
Subject: [PATCH 491/660] GNN replication factor fix

Replication factor stat was wrong for GNNs because multiple sync
substrates were being created that all used the same replication
factor name. Resolve issue by only reporting it if the object the
substrate is created for is a graph (should only be 1 graph in any
execution).
---
 libcusp/include/galois/graphs/DistributedGraph.h |  3 +++
 .../include/galois/graphs/GluonSubstrate.h       | 27 +++++++++++--------
 .../galois/layers/GluonGradientInterface.h       |  2 ++
 3 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h
index d13350f848..42b659fa67 100644
--- a/libcusp/include/galois/graphs/DistributedGraph.h
+++ b/libcusp/include/galois/graphs/DistributedGraph.h
@@ -886,6 +886,9 @@ class DistGraph {
   ////////////////////////////////////////////////////////////////////////////////
   EdgeIndexTy* row_start_ptr() { return graph.row_start_ptr(); }
   NodeIndexTy* edge_dst_ptr() { return graph.edge_dst_ptr(); }
+
+  //! Used by substrate to determine if some stats are to be reported
+  bool is_a_graph() const { return true; }
 };

 template
class GluonSubstrate : public galois::runtime::GlobalObject {
 private:
+  bool is_a_graph_{false};
+
   //!
Synchronization type
   enum SyncType {
     syncReduce,   //!< Reduction sync
@@ -340,16 +342,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject {
    */
   void reportProxyStats(uint64_t global_total_mirror_nodes,
                         uint64_t GALOIS_UNUSED(global_total_owned_nodes)) {
-    float replication_factor =
-        (float)(global_total_mirror_nodes + userGraph.globalSize()) /
-        (float)userGraph.globalSize();
-    galois::runtime::reportStat_Single(RNAME, "ReplicationFactor",
-                                       replication_factor);
+    if (is_a_graph_) {
+      float replication_factor =
+          (float)(global_total_mirror_nodes + userGraph.globalSize()) /
+          (float)userGraph.globalSize();
+      galois::runtime::reportStat_Single(RNAME, "ReplicationFactor",
+                                         replication_factor);

-    galois::runtime::reportStatCond_Single(
-        RNAME, "TotalNodes", userGraph.globalSize());
-    galois::runtime::reportStatCond_Single(
-        RNAME, "TotalGlobalMirrorNodes", global_total_mirror_nodes);
+      galois::runtime::reportStatCond_Single(
+          RNAME, "TotalNodes", userGraph.globalSize());
+      galois::runtime::reportStatCond_Single(
+          RNAME, "TotalGlobalMirrorNodes", global_total_mirror_nodes);
+    }
   }

   ////////////////////////////////////////////////////////////////////////////////
@@ -431,12 +435,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject {
         substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0),
         num_round(0), currentBVFlag(nullptr),
         mirrorNodes(userGraph.getMirrorNodes()) {
+    is_a_graph_ = _userGraph.is_a_graph();
     if (cartesianGrid.first != 0 && cartesianGrid.second != 0) {
       GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts,
                     "Cartesian split doesn't equal number of hosts");
       if (id == 0) {
-        galois::gInfo("Gluon optimizing communication for 2-D cartesian cut: ",
-                      cartesianGrid.first, " x ", cartesianGrid.second);
+        galois::gDebug("Gluon optimizing communication for 2-D cartesian cut: ",
+                       cartesianGrid.first, " x ", cartesianGrid.second);
       }
       isCartCut = true;
     } else {
diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h
index 473151efcd..a41ca0cb4d 100644
--- a/libgnn/include/galois/layers/GluonGradientInterface.h
+++ b/libgnn/include/galois/layers/GluonGradientInterface.h
@@ -59,6 +59,8 @@ class GluonGradientInterface {
   unsigned getEdgeData(uint32_t) const { return 0; }
   void deallocate() const {};

+  bool is_a_graph() const { return false; }
+
 private:
   //! Reference to gradients that can get synchronized
   std::vector& gradients_;

From 660e5ea968ee46dc01d006c2f06651437571c3e2 Mon Sep 17 00:00:00 2001
From: Loc Hoang
Date: Wed, 3 Mar 2021 17:07:49 -0600
Subject: [PATCH 492/660] Timer changes for layers and GNN

- GNN: single train/val/test timer that becomes more fine-grained as
  necessary (e.g. when reporting a test/val forward phase).
- Added timers to SAGE.
- Separated forward/backward aggregation timers.
- Chunk size 1 for do_all aggregation --- .../galois/layers/GraphConvolutionalLayer.h | 5 +++ libgnn/include/galois/layers/SAGELayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 37 +++++++++++-------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 24 ++++++++++-- libgnn/src/layers/SAGELayer.cpp | 37 ++++++++++++++++++- 5 files changed, 82 insertions(+), 22 deletions(-) diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 47980dcd0c..4c884c129f 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -67,6 +67,11 @@ class GraphConvolutionalLayer : public GNNLayer { AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts); + void + AggregateAll(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts, + bool is_backward); //! Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 431a7f0696..b72e9dca50 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -58,6 +58,7 @@ class SAGELayer : public GNNLayer { PointerWithSize* input_gradient) final; private: + static const constexpr char* kRegionName = "SAGELayer"; //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7955d9e92f..b31a31ecd1 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -147,12 +147,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } + galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); + galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); + galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { - const std::string t_name = "TrainEpoch" + std::to_string(epoch); - const std::string t_name_acc = t_name + "Accuracy"; - galois::StatTimer epoch_timer(t_name.c_str(), "GraphNeuralNetwork"); epoch_timer.start(); if (config_.do_sampling()) { // subgraph sample every epoch @@ -166,6 +166,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { train_accuracy = GetGlobalAccuracy(predictions); if (this_host == 0) { + const std::string t_name_acc = + "TrainEpoch" + std::to_string(epoch) + "Accuracy"; galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, @@ -184,44 +186,43 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } if (do_validate) { - const std::string v_name = "ValEpoch" + std::to_string(epoch); - const std::string v_name_acc = v_name + "Accuracy"; - galois::StatTimer val_epoch_timer(v_name.c_str(), "GraphNeuralNetwork"); - - val_epoch_timer.start(); + validation_timer.start(); SetLayerPhases(galois::GNNPhase::kValidate); const PointerWithSize val_pred = DoInference(); - val_epoch_timer.stop(); + validation_timer.stop(); float val_acc = 
GetGlobalAccuracy(val_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, "\n"); + const std::string v_name_acc = + "ValEpoch" + std::to_string(epoch) + "Accuracy"; galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, val_acc); } } if (do_test) { - const std::string test_name = "TestEpoch" + std::to_string(epoch); - const std::string test_name_acc = test_name + "Accuracy"; - galois::StatTimer test_epoch_timer(test_name.c_str(), - "GraphNeuralNetwork"); - - test_epoch_timer.start(); + epoch_test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); - test_epoch_timer.stop(); + epoch_test_timer.stop(); float test_acc = GetGlobalAccuracy(test_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Accuracy"; galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, test_acc); } } if (do_validate || do_test) { + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + "GraphNeuralNetwork", "ElapsedTrainTimeEpoch" + std::to_string(epoch), + epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); // get back inductive norm factor as necessary; sampling norm is handled @@ -232,6 +233,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } + uint64_t average_epoch_time = epoch_timer.get() / num_epochs; + galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", + average_epoch_time); + if (altered_norm_factor) { graph_->CalculateFullNormFactor(); } diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 7d7667a624..419d813119 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -132,7 +132,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), p_backward_output_matrix_.data(), - &input_column_intermediates_); + &input_column_intermediates_, true); // TODO if training A, then A' compute here if layer # is 0 // dot product of edges that exist in A } @@ -162,7 +162,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( // required in this case for the weight gradient calculation // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_); + p_out_temp_.data(), &output_column_intermediates_, true); if (layer_number_ != 0) { // derivative for update // backout = F' @@ -208,7 +208,22 @@ void galois::GraphConvolutionalLayer::AggregateAll( GNNFloat* aggregate_output, [[maybe_unused]] galois::substrate::PerThreadStorage>* pts) { - galois::StatTimer timer("Aggregate", kRegionName); + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); +} + +void galois::GraphConvolutionalLayer::AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + [[maybe_unused]] galois::substrate::PerThreadStorage>* + pts, + bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); timer.start(); #ifdef 
GALOIS_ENABLE_GPU @@ -311,7 +326,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } }, - galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3c8184faee..f8632dd7f0 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -89,7 +89,9 @@ void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - GALOIS_LOG_VERBOSE("Calling forward phase"); + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); + assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); @@ -138,12 +140,17 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + timer.stop(); + return p_forward_output_matrix_; } galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { + galois::StatTimer timer("BackwardPhase", kRegionName); + timer.start(); + assert(layer_phase_ == GNNPhase::kTrain); // derivative of activation @@ -261,6 +268,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( DoDropoutDerivative(); } + timer.stop(); return p_backward_output_matrix_; } @@ -278,6 +286,15 @@ void galois::SAGELayer::AggregateAll( [[maybe_unused]] galois::substrate::PerThreadStorage>* pts, bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + timer.start(); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AggregateAllGPU( @@ -291,6 +308,7 @@ void galois::SAGELayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::AggregateAllCPU( @@ -371,13 +389,16 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); // aggregate sync graph_.AggregateSync(aggregate_output, column_length); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXForm", kRegionName); + timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change if (device_personality == DevicePersonality::GPU_CUDA) { @@ -395,10 +416,13 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::SelfFeatureUpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("SelfForwardXForm", kRegionName); + timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change #endif @@ -410,10 +434,14 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif +timer.stop(); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXForm", 
kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -433,10 +461,14 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #ifdef GALOIS_ENABLE_GPU } #endif + timer.stop(); } void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("SelfBackwardXForm", kRegionName); + timer.start(); + assert(p_layer_weights_.size() == layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU @@ -451,6 +483,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU #endif + timer.stop(); } void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, From 89604a5a0179cba629dd691471acd578d112df7b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 4 Mar 2021 16:11:43 -0600 Subject: [PATCH 493/660] Forcefully change last layer to match label classes Rather than forcing user to specify the last layer's size correctly, this commit adds code that will automatically overwrite the layer size with the correct size. --- lonestar/libgnnbench/src/Input.cpp | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d8975204c5..0bc508963d 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -169,6 +169,15 @@ CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { } // verify user satisfies last intermediate layer needing to have same size // as # label classes + if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + galois::gWarn( + "Size of last layer (", layer_sizes_vector.back(), + ") is not equal to # label classes: forcefully changing it to ", + gnn_graph->GetNumLabelClasses()); + layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + } + GALOIS_LOG_ASSERT(layer_sizes_vector.back() == gnn_graph->GetNumLabelClasses()); } else { From 1bd09da74741682db44c95fdb351754487caec82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 14:41:58 -0600 Subject: [PATCH 494/660] Dist. acc consistency, weight sync collective Replaces the weight sync Gluon call with MPI collectives: much more efficient as it removes the need for point to point. Will go in and delete the Gluon substrate for weight in another commit later. No-dropout distributed execution should now result in 100% consistent accuracy with single host runs. The issue is that you can use a masked gradient for the weight grad calculation, but the feat grad calc requires that you have the non-masked version since you may need mirror info. Before this commit, the masked version was used everywhere which would lead to inconsistency down the line. This has been corrected in both GCN and SAGE. The output layers need a change too (loop over all nodes rather than masters only). Added back-conv-test, which tests to make sure weights/feats gradients passed back from a GCN layer are consistent/correct. This does not catch some corner cases as I found during debugging due to the tester graph being too small. Also do not allocate ptemp2 if it's not required to get some space savings. (significant if # nodes and feature length is high).
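Illustrative sketch (not part of this patch): the collective mentioned above is an in-place element-wise sum of the local weight gradients across hosts. A minimal standalone version is below, assuming an MPI build; the buffer size and names are made up for illustration only.

    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char** argv) {
      MPI_Init(&argc, &argv);
      int host_id = 0;
      MPI_Comm_rank(MPI_COMM_WORLD, &host_id);

      // each host holds its own flattened weight-gradient matrix
      std::vector<float> weight_gradients(4, static_cast<float>(host_id + 1));

      // element-wise sum over all hosts; afterwards every host holds the same
      // summed gradients, so the following optimizer step is identical on all
      // hosts, which is what the accuracy-consistency argument above relies on
      MPI_Allreduce(MPI_IN_PLACE, weight_gradients.data(),
                    static_cast<int>(weight_gradients.size()), MPI_FLOAT,
                    MPI_SUM, MPI_COMM_WORLD);

      if (host_id == 0) {
        std::printf("gradient[0] after all-reduce: %f\n", weight_gradients[0]);
      }
      MPI_Finalize();
      return 0;
    }

A single all-reduce like this replaces the per-gradient point-to-point exchange, which is where the efficiency claim in the message comes from.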
--- libgnn/include/galois/graphs/GNNGraph.h | 6 +- libgnn/include/galois/layers/GNNLayer.h | 7 +- libgnn/include/galois/layers/SAGELayer.h | 7 +- libgnn/src/layers/GNNLayer.cpp | 89 +++++++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 16 +- libgnn/src/layers/SAGELayer.cpp | 93 ++++++--- libgnn/src/layers/SigmoidLayer.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cpp | 4 +- libgnn/test/CMakeLists.txt | 9 +- libgnn/test/back-conv-test.cpp | 176 ++++++++++++++++++ libgnn/test/weight-sync-test.cpp | 45 ----- 11 files changed, 341 insertions(+), 115 deletions(-) create mode 100644 libgnn/test/back-conv-test.cpp delete mode 100644 libgnn/test/weight-sync-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 02cef8e621..071b33aeac 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -67,14 +67,16 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } + size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } + size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } + //! Node begin for all local nodes NodeIterator begin() const { return partitioned_graph_->allNodesRange().begin(); } //! Node end for all local nodes NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } - //! Return GID of some local node - size_t GetGID(unsigned lid) const { return partitioned_graph_->getGID(lid); } NodeIterator begin_owned() const { return partitioned_graph_->masterNodesRange().begin(); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ecd79bec34..27fd1ac0c7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -281,6 +281,8 @@ class GNNLayer { PointerWithSize* output_matrix); //! Apply the derivative of dropout to the backward phase output void DoDropoutDerivative(); + void ReconstructDropoutMatrix(const PointerWithSize input_to_drop, + PointerWithSize* output_matrix); //! Does some activation function based on configuration on forward output //! matrix @@ -290,9 +292,6 @@ class GNNLayer { //! Synchronize weight gradients with a summation void WeightGradientSyncSum(); - //! Synchronize weight gradients with a summation, then locally divide all - //! weights to get an average - void WeightGradientSyncAverage(); #ifdef GALOIS_ENABLE_GPU //! Object that holds all GPU allocated pointers to memory related to layers @@ -302,6 +301,8 @@ class GNNLayer { base_gpu_object_.CopyToWeights(layer_weights_); } #endif + + void MaskGradientNonMasters(PointerWithSize* gradients); }; } // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index b72e9dca50..6825812315 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -94,12 +94,7 @@ class SAGELayer : public GNNLayer { void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); //! Sync second set of weight gradients - void WeightGradientSyncSum2() { - // TODO bitset - gradient_sync_substrate_2_ - ->sync( - "WeightGradients2Sync"); - } + void WeightGradientSyncSum2(); //! 
SAGE config params SAGELayerConfig sage_config_; diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 92c1fa3250..1084bf9010 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -180,6 +180,33 @@ void galois::GNNLayer::DoDropout( timer.stop(); } +void galois::GNNLayer::ReconstructDropoutMatrix( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); + timer.start(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // TODO(hochan) + GALOIS_LOG_FATAL("Implement me"); + } else { +#endif + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("ReconstructDropout")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); +} + void galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); timer.start(); @@ -244,26 +271,30 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { - // TODO bitset + galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); + t.start(); +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) collectives here rather than gluon sync if possible like the + // CPU code + // preferably without needing to do a gpu->cpu copy + galois::gWarn( + "GPU still using inefficient point to point comms for weight sync"); gradient_sync_substrate_->sync( "WeightGradientsSync"); -} - -void galois::GNNLayer::WeightGradientSyncAverage() { - size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; - if (num_hosts > 1) { - // TODO bitset - // sum, then average by dividing all by num hosts (every host participates - // in sync) - gradient_sync_substrate_->sync( - "WeightGradientsSyncAverage"); - galois::do_all( - galois::iterate(static_cast(0), layer_weight_gradients_.size()), - [&](size_t weight_index) { - layer_weight_gradients_[weight_index] /= num_hosts; - }, - galois::loopname("WeightGradientSyncAverageDivide")); +#else + // TODO(loc) remove this limitation later; can just do a loop over the weight + // matrix + if (p_layer_weight_gradients_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + static_cast(p_layer_weight_gradients_.size()), MPI_FLOAT, + MPI_SUM, MPI_COMM_WORLD); +#endif + t.stop(); } void galois::GNNLayer::SyncInitialWeights() { @@ -271,7 +302,7 @@ void galois::GNNLayer::SyncInitialWeights() { return; } #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) + // TODO(loc/hochan); not required at the moment however GALOIS_LOG_FATAL("Need to implement GPU version of this"); #endif // copy weights over to gradients @@ -287,3 +318,25 @@ void galois::GNNLayer::SyncInitialWeights() { layer_weight_gradients_[i] = 0; } } + +void galois::GNNLayer::MaskGradientNonMasters( + PointerWithSize* gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) mask away the **non** masters on gpu + GALOIS_LOG_FATAL("implement this"); +#else + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.size(); + size_t 
row_index = layer_dimensions_.output_columns; + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#endif +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 419d813119..8d3c6754a2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -11,8 +11,10 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); - // TODO temp2 does not need to be initialized in all circumstances - in_temp_2_.resize(num_input_elements, 0); + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + in_temp_2_.resize(num_input_elements, 0); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -50,7 +52,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); - assert(p_in_temp_2_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on @@ -78,8 +79,6 @@ galois::GraphConvolutionalLayer::ForwardPhase( &output_column_intermediates_); } - // TODO synchronization of aggregation functions - if (!config_.disable_activation) { GALOIS_LOG_VERBOSE("Doing activation"); Activation(); @@ -88,6 +87,7 @@ galois::GraphConvolutionalLayer::ForwardPhase( assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); timer.stop(); + return p_forward_output_matrix_; } @@ -138,6 +138,7 @@ galois::GraphConvolutionalLayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there + MaskGradientNonMasters(input_gradient); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -169,8 +170,9 @@ galois::GraphConvolutionalLayer::BackwardPhase( UpdateEmbeddingsDerivative(p_out_temp_.data(), p_backward_output_matrix_.data()); } - // TODO put this in a function // W' = F^T (FW)' + MaskGradientNonMasters(&p_out_temp_); + // TODO put this in a function #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -191,8 +193,6 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already - // TODO figure out how to do this with GPUs - // WeightGradientSyncAverage(); WeightGradientSyncSum(); if (!config_.disable_dropout && layer_number_ != 0) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index f8632dd7f0..194563610d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -42,8 +42,11 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); - // TODO temp2 does not need to be initialized in all 
circumstances - in_temp_2_.resize(num_input_elements, 0); + // only need to allocate if input <= output because not used otherwise + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + in_temp_2_.resize(num_input_elements, 0); + } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -74,17 +77,27 @@ galois::SAGELayer::SAGELayer(size_t layer_num, GALOIS_LOG_VERBOSE("SAGE layer initialized"); } -void MatrixAdd(size_t num_nodes, galois::PointerWithSize in, - galois::PointerWithSize* out) { - assert(in.size() == out->size()); - assert((in.size() % num_nodes) == 0); - size_t column_size = in.size() / num_nodes; - // split matrix to threads - galois::do_all(galois::iterate(size_t{0}, num_nodes), [&](size_t node) { - size_t my_offset = node * column_size; - galois::VectorAdd(column_size, &(in[my_offset]), - &((out->data())[my_offset]), &(out->data()[my_offset])); - }); +void galois::SAGELayer::WeightGradientSyncSum2() { + galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); + t.start(); +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) collectives here rather than gluon sync if possible like the + // CPU code + GALOIS_LOG_FATAL("implement me"); +#else + // TODO(loc) remove this limitation later; can just do a loop over the weight + // matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + static_cast(p_layer_weight_gradients_2_.size()), MPI_FLOAT, + MPI_SUM, MPI_COMM_WORLD); +#endif + t.stop(); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( @@ -95,7 +108,6 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_in_temp_1_.size() == input_embeddings.size()); - assert(p_in_temp_2_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on @@ -170,30 +182,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // AFW = O - if (!sage_config_.disable_concat) { - // Fw1 + AFW2 = O; self feature has own weight matrix and makes own - // contribution to gradients which is handled in this block - // !!!! do this early because p_in_temp may get overwritten later - // if update occurs before aggregate !!! 
- galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_to_use.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - } - WeightGradientSyncSum2(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { if (layer_number_ != 0) { + // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() == layer_dimensions_.input_rows * layer_dimensions_.output_columns); assert(p_in_temp_1_.size() == layer_dimensions_.input_columns * layer_dimensions_.input_rows); // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph @@ -203,6 +206,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // weight gradient calculation // TODO(loc) put this in a function to put the ifdef in there + // ---masked--- + MaskGradientNonMasters(input_gradient); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -224,6 +229,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' + // --unmasked-- AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); if (layer_number_ != 0) { @@ -235,6 +241,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // TODO put this in a function // W' = F^T (FW)' // input to use is not overwritten in this branch so it's safe to use + // --- masked ---, uses ptemp1 + MaskGradientNonMasters(&p_out_temp_); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -254,6 +262,39 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { + // Fw1 + AFW2 = O; self feature has own weight matrix and makes own + // contribution to gradients which is handled in this block + // second weight matrix: reconstruct the dropout matrix if it was + // overwritten into temp1 + if (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.disable_dropout) { + // input gradients have already been masked; need to reconstruct the + // dropout matrix which we can do since we saved the dropout mask + // save it into ptemp1 + ReconstructDropoutMatrix(prev_layer_input, &p_in_temp_1_); + // !!!NOTE!!! 
+ // If you're using dropout in the distributed setting you've already + // thrown consistency out the window anyways because distributed RNG + // will make it so each host does something different + // Therefore, this op above is nothing more than a feeble attempt + // at getting *some* notion of consistency + } + } else { + // mask original input gradients since this path masks the aggregated + // gradients only + MaskGradientNonMasters(input_gradient); + // in dropout case, ptemp1 (contained in input to use) still contains the + // dropout matrix so no need to recompute + } + + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_to_use.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); + WeightGradientSyncSum2(); + if (layer_number_ != 0) { // deal with feature gradients for the self feature here // this function will sum directly into the backward matrix diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 35f95b64a6..317811b6df 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -14,7 +14,7 @@ galois::SigmoidLayer::ForwardPhaseCPU( float_accumulator_.reset(); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { @@ -71,7 +71,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned local_node) { if (graph_.IsValidForPhase(local_node, layer_phase_)) { if (IsSampledLayer()) { diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index a268089b33..6ac09806fe 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -16,7 +16,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( #endif galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) @@ -80,7 +80,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { backward_output_matrix_.assign(backward_output_matrix_.size(), 0); galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), + galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (graph_.IsValidForPhase(i, layer_phase_)) { if (IsSampledLayer()) { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b1170e2d16..b9ef634c53 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -63,9 +63,12 @@ if (NOT GALOIS_ENABLE_GPU) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - add_executable(weight-sync-test weight-sync-test.cpp) - target_link_libraries(weight-sync-test galois_gnn) - # TODO multi host tests + add_executable(back-conv-test back-conv-test.cpp) + target_link_libraries(back-conv-test galois_gnn) + foreach(host_count ${hosts}) + add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} back-conv-test) + set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT 
"GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() add_executable(multilabel-read multilabel-read.cpp) target_link_libraries(multilabel-read galois_gnn) diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp new file mode 100644 index 0000000000..b1c9c025c6 --- /dev/null +++ b/libgnn/test/back-conv-test.cpp @@ -0,0 +1,176 @@ +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::PointerWithSize feats = + test_graph.GetLocalFeatures(); + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 0; + break; + case 1: + ground_truth = 1; + break; + case 2: + ground_truth = 2; + break; + case 3: + ground_truth = 3; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 5; + break; + case 6: + ground_truth = 6; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_VASSERT(feats[row * 3 + c] == ground_truth, "{} not {}", + ground_truth, feats[row * 2 + c]); + } + } + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph.size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.DebugConfig(); + + // dummy 1 matrix + std::vector dummy_ones_v(test_graph.size() * 2, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + + // create layer 1 for testing backward prop actually giving weights back + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, + dimension_0, dcon); + layer_1->InitAllWeightsTo1(); + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_1_forward_output[row * 2 + c] == ground_truth, + "{} not {}", ground_truth, + layer_1_forward_output[row * 2 + c]); + } + } + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 2; + break; + case 1: + ground_truth = 4; + break; + case 2: + ground_truth = 4; + break; + case 3: + ground_truth = 4; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 4; + break; + case 6: + ground_truth = 2; + break; + 
default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT(layer_1_backward_output[row * 3 + c] == ground_truth); + } + } + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[0] == 36, "36 not {}", + layer_1_weight_gradients[0]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[1] == 36, "36 not {}", + layer_1_weight_gradients[1]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[2] == 36, "36 not {}", + layer_1_weight_gradients[2]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[3] == 36, "36 not {}", + layer_1_weight_gradients[3]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[4] == 36, "36 not {}", + layer_1_weight_gradients[4]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[5] == 36, "36 not {}", + layer_1_weight_gradients[5]); + + layer_1.reset(); + + return 0; +} diff --git a/libgnn/test/weight-sync-test.cpp b/libgnn/test/weight-sync-test.cpp deleted file mode 100644 index 4c2c01f844..0000000000 --- a/libgnn/test/weight-sync-test.cpp +++ /dev/null @@ -1,45 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GraphNeuralNetwork.h" -#include "galois/layers/GraphConvolutionalLayer.h" - -int main() { - galois::DistMemSys G; - - if (galois::runtime::getSystemNetworkInterface().Num == 4) { - GALOIS_LOG_ERROR("This test should be run with 4 hosts/processes"); - exit(1); - } - - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - - // create same layer from convlayer-test and make sure result is the same even - // in multi-host environment - galois::GNNLayerDimensions dimension_0; - dimension_0.input_rows = test_graph->size(); - dimension_0.input_columns = 3; - dimension_0.output_columns = 2; - galois::GNNLayerConfig dcon; - - dcon.disable_aggregate_after_update = false; - // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, dcon); - layer_0->InitAllWeightsTo1(); - - // backward pass checking; check the gradients out - std::vector dummy_ones_v(test_graph->size() * 2, 1); - galois::PointerWithSize dummy_ones(dummy_ones_v); - layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - - // gradient verification; average - // host 0 has 18, 1 has 21, 2 has 12, 3 has 0s; averaged to 12.75 - const galois::PointerWithSize& grads = - layer_0->GetLayerWeightGradients(); - for (size_t i = 0; i < 6; i++) { - GALOIS_LOG_ASSERT(grads[i] == 12.75); - } - - // XXX CVC -} From 5ce1c24d09741a9ac27490058d8b51a59b097274 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 17:28:28 -0600 Subject: [PATCH 495/660] Bitset now used for GNN aggregation Adds the use of a bitset for sync for the aggregation in GCN and SAGE. Theoretically this should improve sync time significantly. 
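Illustrative sketch (not part of this patch): the aggregation bitset marks which local rows were actually written during aggregation so that only those rows need to be packed for synchronization. The stand-in below shows the idea in plain C++, assuming equal-length rows; the helper name and types are made up, while the real code uses galois::DynamicBitSet (bitset_graph_aggregate) and the Gluon sync substrate.

    #include <cstddef>
    #include <vector>

    // Pack only the rows flagged as dirty; the returned payload plus the row
    // ids in *packed_rows is what would be shipped to other hosts instead of
    // the whole matrix.
    std::vector<float> PackDirtyRows(const std::vector<float>& matrix,
                                     const std::vector<bool>& dirty,
                                     std::size_t column_length,
                                     std::vector<std::size_t>* packed_rows) {
      std::vector<float> payload;
      for (std::size_t row = 0; row < dirty.size(); ++row) {
        if (!dirty[row]) {
          continue; // row untouched by aggregation: nothing to synchronize
        }
        packed_rows->push_back(row);
        payload.insert(payload.end(),
                       matrix.begin() + row * column_length,
                       matrix.begin() + (row + 1) * column_length);
      }
      return payload;
    }

This only pays off when some proxies are never touched; as the follow-up commit notes, a topology-driven aggregation tends to touch most rows anyway.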
--- .../galois/graphs/GraphAggregationSyncStructures.h | 3 +++ libgnn/include/galois/layers/GraphConvolutionalLayer.h | 2 ++ libgnn/include/galois/layers/SAGELayer.h | 2 ++ libgnn/src/graphs/GNNGraph.cpp | 9 +++++---- libgnn/src/layers/GraphConvolutionalLayer.cpp | 2 ++ libgnn/src/layers/SAGELayer.cpp | 2 ++ 6 files changed, 16 insertions(+), 4 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index e5dcb970af..8e3db38096 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -11,6 +11,7 @@ namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; +extern galois::DynamicBitSet bitset_graph_aggregate; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -89,5 +90,7 @@ GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); #endif + +GALOIS_SYNC_STRUCTURE_BITSET(graph_aggregate); } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 4c884c129f..e44976f73b 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -7,6 +7,8 @@ namespace galois { +extern galois::DynamicBitSet graphs::bitset_graph_aggregate; + class GraphConvolutionalLayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 6825812315..59f71b9041 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -8,6 +8,8 @@ namespace galois { +extern galois::DynamicBitSet graphs::bitset_graph_aggregate; + struct SAGELayerConfig { bool disable_concat{false}; }; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4efae3c429..481784dc82 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -38,6 +38,7 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +galois::DynamicBitSet bitset_graph_aggregate; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -79,6 +80,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, *partitioned_graph_, host_id_, galois::runtime::getSystemNetworkInterface().Num, false, partitioned_graph_->cartesianGrid()); + bitset_graph_aggregate.resize(partitioned_graph_->size()); // read in entire graph topology ReadWholeGraph(dataset_name); @@ -165,10 +167,9 @@ void galois::graphs::GNNGraph::AggregateSync( // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - - // TODO(loc) bitset setting - sync_substrate_->sync( - "GraphAggregateSync"); + sync_substrate_ + ->sync( + "GraphAggregateSync"); } #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 8d3c6754a2..d2ee5ddcb0 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,6 +273,8 @@ void 
galois::GraphConvolutionalLayer::AggregateAllCPU( } } + graphs::bitset_graph_aggregate.set(src); + GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 194563610d..697722a9c5 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,6 +383,8 @@ void galois::SAGELayer::AggregateAllCPU( } } + graphs::bitset_graph_aggregate.set(src); + GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.DegreeNorm(src); From 1bf25f4e3a12716265c76e5b56121fe8c21f1f0b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 17:50:00 -0600 Subject: [PATCH 496/660] Slightly more intelligent bitset for GNN Was setting bitset even if source had no edges: changed it to be slightly more precise. Unlikely that bitset helps at all though because of topology driven nature of the aggregation operator. The hope was that mirror nodes without edges would not get sync'd, but since sync call is already writeSource those shouldn't be getting sync'd anyways. --- libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 ++- libgnn/src/layers/SAGELayer.cpp | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index d2ee5ddcb0..79c074ff2a 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,7 +273,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } - graphs::bitset_graph_aggregate.set(src); GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { @@ -282,6 +281,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // init to self if (!config_.disable_self_aggregate) { + graphs::bitset_graph_aggregate.set(src); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -295,6 +295,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { size_t dst = graph_.EdgeDestination(e); + graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { if (IsInductiveLayer()) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 697722a9c5..3ba096182f 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,7 +383,6 @@ void galois::SAGELayer::AggregateAllCPU( } } - graphs::bitset_graph_aggregate.set(src); GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { @@ -392,6 +391,7 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.EdgeDestination(e); if (layer_phase_ == GNNPhase::kTrain) { From 011fc14a35524ac6442665281ad4919d37190ed5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 6 Mar 2021 21:01:59 -0600 Subject: [PATCH 497/660] Softmax-cross-entropy derivative merged Merged the application of softmax and cross entropy derivatives into a very simple subtraction term that is apparently correct according to math gurus on the internet. 
Advantages: 1) much simpler 2) most importantly, numerical stability + correctness: previous softmax/cross-entropy derivative was causing blowup of accuracy after a few epochs where Xuhao OMP code was not. This simplifies the derivative into something that is numerically stable (and probably more correct than what was there before). --- libgnn/include/galois/GNNMath.h | 35 +++++++----- libgnn/src/GNNMath.cpp | 5 ++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 1 - libgnn/src/layers/SAGELayer.cpp | 1 - libgnn/src/layers/SoftmaxLayer.cpp | 55 ++++++++----------- 5 files changed, 49 insertions(+), 48 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index e32d062cc5..9e50295200 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -29,6 +29,7 @@ void GNNSoftmaxDerivative(const size_t vector_length, const GNNFloat* prev_output, const GNNFloat* prev_output_derivative, GNNFloat* temp_vector, GNNFloat* output); + //! Performs cross entropy given a ground truth and input and returns the loss //! value. template @@ -36,12 +37,12 @@ galois::GNNFloat GNNCrossEntropy(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input) { GNNFloat loss = 0.0; - // Note that this function works if there are multiple non-zeros in the // ground truth vector // If there is only 1 then this function is overkill and it should break - // early (i.e. single class) - // Multiclass = fine + // early (i.e. single class); in one-hot vector setting for instance + // Multiclass = fine: in fact this is meant for multiclass but also + // works for single for (size_t i = 0; i < vector_length; i++) { if (ground_truth[i] == 0.0) { if (input[i] == 1.0) { @@ -60,24 +61,28 @@ galois::GNNFloat GNNCrossEntropy(const size_t vector_length, return loss; } - //! Derivative of cross entropy; gradients saved into an output vector. 
template void GNNCrossEntropyDerivative(const size_t vector_length, const TruthType* ground_truth, const GNNFloat* input, GNNFloat* gradients) { + // TODO(loc) delete this function once I fully understand why it wasn't + // working + GALOIS_LOG_FATAL( + "DO NOT USE THIS FUNCTION; NOT CORRECT IN ALL CIRCUMSTANCES"); for (size_t i = 0; i < vector_length; i++) { - // TODO(loc) assumption: binary classifier, make explicit in function name - if (ground_truth[i]) { - gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); - } else { - if (input[i] == 1.0) { - // opposite - gradients[i] = 1.0 / static_cast(1e-10); - } else { - gradients[i] = 1.0 / (1.0 - input[i]); - } - } + gradients[i] = -ground_truth[i] / (input[i] + static_cast(1e-10)); + // if (ground_truth[i]) { + // gradients[i] = -1.0 / (input[i] + static_cast(1e-10)); + //} + // else { + // if (input[i] == 1.0) { + // // opposite + // gradients[i] = 1.0 / static_cast(1e-10); + // } else { + // gradients[i] = 1.0 / (1.0 - input[i]); + // } + //} } } diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index fe14198d83..aef3dae6dd 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -98,6 +98,11 @@ void galois::GNNSoftmaxDerivative(const size_t vector_length, const GNNFloat* prev_output, const GNNFloat* prev_output_derivative, GNNFloat* temp_vector, GNNFloat* output) { + // TODO(loc) remove this function, unnecessary as cross/softmax derivatives + // can be merged as currently done in Softmax code + // will do so in a separate commit + GALOIS_LOG_FATAL("Should not need this function anymore with simplified " + "combined derivatives in each layer"); for (size_t i = 0; i < vector_length; i++) { for (size_t j = 0; j < vector_length; j++) { temp_vector[j] = (j == i) ? prev_output[i] * (1.0 - prev_output[i]) diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 79c074ff2a..a60b1eb0c4 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -273,7 +273,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } } - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.NormFactor(src); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3ba096182f..191c02d00e 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -383,7 +383,6 @@ void galois::SAGELayer::AggregateAllCPU( } } - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.DegreeNorm(src); diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 6ac09806fe..10ed93c8ff 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -9,6 +9,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; #ifndef NDEBUG + //#ifdef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); @@ -41,6 +42,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( GNNCrossEntropy(feature_length, ground_truth_vec->data(), &forward_output_matrix_[feature_length * i]); #ifndef NDEBUG + //#ifdef NDEBUG loss_accum += input_loss_[i]; handled += 1; #endif @@ -50,6 +52,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); #ifndef NDEBUG + //#ifdef NDEBUG 
GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); galois::gPrint("Loss is ", reduced_loss / t, "\n"); @@ -81,43 +84,33 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::do_all( galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned i) { - if (graph_.IsValidForPhase(i, layer_phase_)) { + [&](const unsigned node) { + if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(node)) return; } - // create ground truth vector for this LID - // TODO maybe make this part of the graph class instead of recreating - // every time - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; - - // derivative cross entropy into norm grad - std::vector* norm_gradient = - norm_gradient_vectors_.getLocal(); - GNNCrossEntropyDerivative( - feature_length, ground_truth_vec->data(), - &(forward_output_matrix_[i * feature_length]), - norm_gradient->data()); - - // use norm grad with softmax deritave, save and return - std::vector* softmax_temp = - softmax_temp_vectors_.getLocal(); - GNNSoftmaxDerivative(feature_length, - &(forward_output_matrix_[i * feature_length]), - norm_gradient->data(), softmax_temp->data(), - &(backward_output_matrix_[i * feature_length])); + size_t correct = graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + backward_output_matrix_[node * feature_length + idx] = + forward_output_matrix_[node * feature_length + idx] - 1; + } else { + // negative class + backward_output_matrix_[node * feature_length + idx] = + forward_output_matrix_[node * feature_length + idx]; + } + } } }, - // TODO chunk size? - // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxBackward")); return PointerWithSize(backward_output_matrix_); From 8d80e9e7e03bb5d17b17ee4d5d7e0057193c5f3f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 8 Mar 2021 21:49:36 -0600 Subject: [PATCH 498/660] better serialization for vec of vectors in gluon Vector of vector serialization in Gluon was bad. This commit fixes it by serializing all vectors into a single one before sendoff so the serialization unit doesn't have to deal with it. Because this commit was made in a rush (so I can schedule tests) I did not really bother cleaning it much.
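Illustrative sketch (not part of this patch): the fix flattens the vector of per-node float vectors into one contiguous array before serialization, in the spirit of the GluonSerializeManyVecToOne loop in the diff below. A minimal stand-in, assuming every inner vector has the same length; the function name is made up for illustration.

    #include <cstddef>
    #include <cstring>
    #include <vector>

    std::vector<float>
    FlattenForSend(const std::vector<std::vector<float>>& val_vec) {
      if (val_vec.empty()) {
        return {};
      }
      const std::size_t feature_size = val_vec.front().size();
      std::vector<float> single_array(val_vec.size() * feature_size);
      for (std::size_t node = 0; node < val_vec.size(); ++node) {
        // copy one node's feature vector into its slot of the flat buffer
        std::memcpy(single_array.data() + node * feature_size,
                    val_vec[node].data(), feature_size * sizeof(float));
      }
      return single_array;
    }

On the receiving side the per-node length (the feature_size also serialized in the patch) is what lets the flat buffer be split back into per-node rows.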
--- libgluon/CMakeLists.txt | 2 +- .../include/galois/graphs/GluonSubstrate.h | 318 +++++++++++++++++- 2 files changed, 313 insertions(+), 7 deletions(-) diff --git a/libgluon/CMakeLists.txt b/libgluon/CMakeLists.txt index 3c9812e498..543e796156 100644 --- a/libgluon/CMakeLists.txt +++ b/libgluon/CMakeLists.txt @@ -18,7 +18,7 @@ target_include_directories(galois_gluon PUBLIC ) if (GALOIS_COMM_STATS) - target_compile_definitions(galois_gluon PRIVATE GALOIS_COMM_STATS=1) + target_compile_definitions(galois_gluon PUBLIC GALOIS_COMM_STATS=1) endif() if (GALOIS_USE_BARE_MPI) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 6f9c9f5b85..2ef2e0b136 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -710,6 +710,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); } + template + struct is_vector_of_vec : public std::false_type {}; + template + struct is_vector_of_vec, A>> + : public std::true_type {}; + //////////////////////////////////////////////////////////////////////////////// // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// @@ -735,11 +741,21 @@ class GluonSubstrate : public galois::runtime::GlobalObject { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); + if (is_vector_of_vec::value) { + syncExtractFloatVecHack( + loopName, x, sharedNodes[x], b, elem_size); + } else { + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); + } } else { - syncExtract(loopName, x, sharedNodes[x], - b, elem_size); + if (is_vector_of_vec::value) { + // TODO (loc) + GALOIS_LOG_FATAL("implement me"); + } else { + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); + } } std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; @@ -1876,10 +1892,164 @@ class GluonSubstrate : public galois::runtime::GlobalObject { extractSubset( loopName, indices, bit_set_count, offsets, val_vec); } + serializeMessage(loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, val_vec, b); } else { + // TODO(loc/hochan) vector gpu hack for gnns + if (data_mode == noData) { + b.resize(0); + if (!async) { + gSerialize(b, data_mode); + } + } else if (data_mode == gidsData) { + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else if (data_mode == offsetsData) { + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else if (data_mode == bitsetData) { + size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t); + b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + + (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + } else { // onlyData + b.resize(sizeof(DataCommMode) + sizeof(size_t) + + (num * sizeof(typename SyncFnTy::ValTy))); + } + } + + reportRedundantSize(loopName, syncTypeStr, num, bit_set_count, + bit_set_comm); + } else { + data_mode = noData; + b.resize(0); + if (!async) { + gSerialize(b, noData); + } + } + + Textract.stop(); + + std::string metadata_str(syncTypeStr + "MetadataMode_" + + std::to_string(data_mode) + "_" + + get_run_identifier(loopName)); + galois::runtime::reportStatCond_Single(RNAME, metadata_str, + 1); + } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + void syncExtractFloatVecHack(std::string, unsigned, std::vector&, + galois::runtime::SendBuffer&, size_t) { + // TODO(loc) cleaner way to do this + GALOIS_LOG_FATAL( + "Execution should not call float vec hack if not vector of vectors"); + } + + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + void syncExtractFloatVecHack(std::string loopName, unsigned from_id, + std::vector& indices, + galois::runtime::SendBuffer& b, + size_t elem_size) { + // TODO(loc) assumption that type in the VecTy is a vector of floats + // throughout this code; more robust solution would detect it other ways + uint32_t num = indices.size() * elem_size; + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy val_vec; // sometimes wasteful + static galois::gstl::Vector single_array; + galois::PODResizeableArray& offsets = syncOffsets; + + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string extract_timer_str(syncTypeStr + "Extract_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textract(extract_timer_str.c_str(), + RNAME); + std::string extract_alloc_timer_str(syncTypeStr + "ExtractAlloc_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractalloc( + extract_alloc_timer_str.c_str(), RNAME); + std::string extract_batch_timer_str(syncTypeStr + "ExtractBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractbatch( + extract_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + DataCommMode data_mode; + Textract.start(); + + if (num > 0) { + size_t bit_set_count = 0; + Textractalloc.start(); + b.reserve(getMaxSendBufferSize(num)); + Textractalloc.stop(); + + Textractbatch.start(); + bool batch_succeeded = extractBatchWrapper( + from_id, b, bit_set_count, data_mode); + Textractbatch.stop(); + + // GPUs have a batch function they can use; CPUs do not; therefore, + // CPUS always enter this if block + if (!batch_succeeded) { + Textractalloc.start(); + b.resize(0); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + val_vec.reserve(maxSharedSize); + bit_set_comm.resize(num); + offsets.resize(num); + val_vec.resize(num); + Textractalloc.stop(); + const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + getBitsetAndOffsets( + loopName, indices, bit_set_compute, bit_set_comm, offsets, + bit_set_count, data_mode); + + if (data_mode == onlyData) { + bit_set_count = indices.size(); + extractSubset( + loopName, indices, bit_set_count, offsets, val_vec); + } else if (data_mode != + noData) { // bitsetData or offsetsData or gidsData + extractSubset( + loopName, indices, bit_set_count, offsets, val_vec); + } + + // Vector of vectors is in val_vec + // val vec over to contiguous array of #s + size_t num_nodes = val_vec.size(); + size_t feature_size = val_vec[0].size(); + single_array.resize(num_nodes * feature_size); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t node) { + std::memcpy(&(single_array.data()[node * feature_size]), + val_vec[node].data(), feature_size * sizeof(float)); + }, + galois::loopname("GluonSerializeManyVecToOne")); + + serializeMessage(loopName, data_mode, bit_set_count, + indices, offsets, bit_set_comm, + single_array, b); + gSerialize(b, feature_size); + } else { + // TODO(loc/hochan) vector gpu hack for gnns if (data_mode == noData) { b.resize(0); if (!async) { @@ -2259,6 +2429,121 @@ class GluonSubstrate : public galois::runtime::GlobalObject { deserializeMessage(loopName, data_mode, num, buf, bit_set_count, offsets, bit_set_comm, buf_start, retval, val_vec); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + val_vec.reserve(maxSharedSize); + + galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + if (data_mode == bitsetData) { + size_t bit_set_count2; + getOffsetsFromBitset(loopName, bit_set_comm, offsets, + bit_set_count2); + assert(bit_set_count == bit_set_count2); + } + + if (data_mode == onlyData) { + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute); + } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute, buf_start); + } else if (data_mode == gidsData) { + setSubset(loopName, offsets, bit_set_count, offsets, val_vec, + bit_set_compute); + } else { // bitsetData or 
offsetsData + setSubset(loopName, sharedNodes[from_id], + bit_set_count, offsets, val_vec, + bit_set_compute); + } + // TODO: reduce could update the bitset, so it needs to be copied + // back to the device + } + } + } + + Tset.stop(); + + return retval; + } + + // TODO (loc) way too much code duplication + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + size_t syncRecvApplyVecHack(uint32_t from_id, + galois::runtime::RecvBuffer& buf, + std::string loopName) { + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string set_timer_str(syncTypeStr + "Set_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tset(set_timer_str.c_str(), RNAME); + std::string set_batch_timer_str(syncTypeStr + "SetBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tsetbatch( + set_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy val_vec; + // TODO(loc) assumes float for now + static galois::gstl::Vector single_array; + galois::PODResizeableArray& offsets = syncOffsets; + + auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; + + Tset.start(); + + if (num > 0) { // only enter if we expect message from that host + DataCommMode data_mode; + // 1st deserialize gets data mode + galois::runtime::gDeserialize(buf, data_mode); + + if (data_mode != noData) { + // GPU update call + Tsetbatch.start(); + bool batch_succeeded = + setBatchWrapper(from_id, buf, data_mode); + Tsetbatch.stop(); + + // cpu always enters this block + if (!batch_succeeded) { + size_t bit_set_count = num; + size_t buf_start = 0; + + // deserialize the rest of the data in the buffer depending on the + // data mode; arguments passed in here are mostly output vars + deserializeMessage(loopName, data_mode, num, buf, + bit_set_count, offsets, bit_set_comm, + buf_start, retval, single_array); + + // deserialize sngle array into vector of vector state again + size_t feature_size; + gDeserialize(buf, feature_size); + size_t num_nodes = single_array.size() / feature_size; + + assert(single_array.size() % feature_size == 0); + val_vec.resize(num_nodes); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t node) { + val_vec[node].resize(feature_size); + std::memcpy((void*)(val_vec[node].data()), + &(single_array[node * feature_size]), + feature_size * sizeof(float)); + }, + galois::loopname("GluonDeserializeBackToVecOfVec")); bit_set_comm.reserve(maxSharedSize); offsets.reserve(maxSharedSize); @@ -2304,6 +2589,17 @@ class GluonSubstrate : public galois::runtime::GlobalObject { return retval; } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::type* = nullptr, + typename std::enable_if::value>::type* = nullptr> + size_t syncRecvApplyVecHack(uint32_t, galois::runtime::RecvBuffer&, + std::string) { + GALOIS_LOG_FATAL("NOT SUPPORTED, should never get called"); + return 0; + } + /** * VECTOR BITSET VARIANT. 
* @@ -2498,6 +2794,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { RNAME); if (async) { + if (is_vector_of_vec::value) { + galois::gWarn("Async execution does not support the vector of vec hack " + "(most important for GNN)"); + } + size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; @@ -2526,8 +2827,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } while (!p); Twait.stop(); - syncRecvApply( - p->first, p->second, loopName); + if (is_vector_of_vec::value) { + syncRecvApplyVecHack( + p->first, p->second, loopName); + } else { + syncRecvApply( + p->first, p->second, loopName); + } } incrementEvilPhase(); } From 6c6947d245d5ac313973a53954077261393bdd87 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 9 Mar 2021 15:48:58 -0600 Subject: [PATCH 499/660] Size usage, weight sync sub removal, fixes Prints for size used on layers. Removal of sync substrate for weights since collectives are used now. Some bug fixes for mask policy in graph reading + size used + print of set sizes. Do not allocate weights for layer 0 backward. Fixes tests that check for it as well. --- libgnn/CMakeLists.txt | 1 - libgnn/include/galois/layers/GNNLayer.h | 15 ++-- .../galois/layers/GluonGradientInterface.h | 81 ------------------- libgnn/include/galois/layers/SAGELayer.h | 6 -- libgnn/src/graphs/GNNGraph.cpp | 47 +++++++---- libgnn/src/layers/GNNLayer.cpp | 65 ++++++--------- libgnn/src/layers/GluonGradientInterface.cpp | 49 ----------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 10 ++- libgnn/src/layers/SAGELayer.cpp | 25 +++--- libgnn/test/aggregate-sync-test.cpp | 13 ++- libgnn/test/convlayer-test.cpp | 23 +----- libgnn/test/sage-layer-test.cpp | 23 +----- 12 files changed, 95 insertions(+), 263 deletions(-) delete mode 100644 libgnn/include/galois/layers/GluonGradientInterface.h delete mode 100644 libgnn/src/layers/GluonGradientInterface.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index b59cccef93..ed60ae032b 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -5,7 +5,6 @@ set(sources src/graphs/GNNGraph.cpp src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp - src/layers/GluonGradientInterface.cpp src/layers/GraphConvolutionalLayer.cpp src/layers/L2NormLayer.cpp src/layers/SAGELayer.cpp diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 27fd1ac0c7..47b38a7f73 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,7 +3,6 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" -#include "galois/layers/GluonGradientInterface.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GNNLayer.cuh" @@ -225,15 +224,6 @@ class GNNLayer { PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; - //! Synchronizes all weights (used in distributed setting) - void SyncInitialWeights(); - - //! Wrapper over gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_; - //! Synchronization substrate for the weight gradients - std::unique_ptr> - gradient_sync_substrate_; - //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; //! RNG for dropout @@ -303,6 +293,11 @@ class GNNLayer { #endif void MaskGradientNonMasters(PointerWithSize* gradients); + + //! 
Does some math to get GB used by some # of floats + double FloatElementsToGB(size_t num_of_floats) const { + return num_of_floats * double{4} / (1 << 30); + } }; } // namespace galois diff --git a/libgnn/include/galois/layers/GluonGradientInterface.h b/libgnn/include/galois/layers/GluonGradientInterface.h deleted file mode 100644 index a41ca0cb4d..0000000000 --- a/libgnn/include/galois/layers/GluonGradientInterface.h +++ /dev/null @@ -1,81 +0,0 @@ -#pragma once - -#include "galois/GNNTypes.h" -#include "galois/gstl.h" -#include "galois/runtime/Network.h" - -namespace galois { - -// TODO figure out which function calls can be removed without causing compiler -// to complain - -//! Wraps a matrix and allows it to be synchronized via Gluon as it provides -//! all the functions Gluon needs. -//! Assumes the matrix is initialized the same way across all hosts (if not -//! they'll all see the same values after the first round of sync anyways) -class GluonGradientInterface { -public: - // typedefs required by GPU end to build; not actually used anywhere in this - // class (...at the moment) - // as such, dummy declarations that don't particularly make sense - // TODO will likely need to revisit once GPU substrate for this needs to be - // setup - using GraphNode = uint32_t; - using edge_iterator = boost::counting_iterator; - using EdgeType = char; - - //! Save reference to weight gradients. - //! Then setup mirror metadata for Gluon to use during setup. - GluonGradientInterface(std::vector& gradients); - - //! Size is number of weights since all hosts own everything - size_t size() const { return num_weights_; } - //! Global size is number of weights - size_t globalSize() const { return num_weights_; } - //! Return the weights owned by this host - size_t numMasters() const { return num_owned_; } - //! GID is same as LID since all hosts have all weights - uint32_t getGID(const uint32_t node_id) const { return node_id; } - //! LID is same as GID since all hosts have all weights - uint32_t getLID(const uint32_t node_id) const { return node_id; } - //! Return weight w - GNNFloat& getData(uint32_t w) const { return gradients_[w]; } - //! Return ranges for mirrors (unowned nodes) - const std::vector>& getMirrorRanges() const { - return mirror_ranges_; - } - //! Return mirror nodes for each host from this host's point of view - std::vector>& getMirrorNodes() { return mirror_nodes_; } - - ////////////////////////////////////////////////////////////////////////////// - - // for all that follow, no edges in this sync so most of this returns what - // you expect - // size_t getNumNodesWithEdges() const { return 0; } - bool is_vertex_cut() const { return false; } - unsigned edge_begin(uint32_t) const { return 0; } - unsigned edge_end(uint32_t) const { return 0; } - unsigned getEdgeDst(uint32_t) const { return 0; } - unsigned getEdgeData(uint32_t) const { return 0; } - void deallocate() const {}; - - bool is_a_graph() const { return false; } - -private: - //! Reference to gradients that can get synchronized - std::vector& gradients_; - //! number of weight gradients - size_t num_weights_; - //! number of single gradients this host is responsible for - size_t num_owned_; - //! First weight that's a master - size_t begin_master_; - //! Last weight that's a master - size_t end_master_; - //! My nodes whose's masters are on other hosts; global ids - std::vector> mirror_nodes_; - //! 
nodes that are mirrors on this host - std::vector> mirror_ranges_; -}; - -} // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 59f71b9041..9dc7007931 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -109,12 +109,6 @@ class SAGELayer : public GNNLayer { PointerWithSize p_layer_weights_2_; PointerWithSize p_layer_weight_gradients_2_; - //! Wrapper over 2nd gradient matrix to make it compatible with Gluon - std::unique_ptr gradient_sync_interface_2_; - //! Synchronization substrate for the 2nd weight gradients - std::unique_ptr> - gradient_sync_substrate_2_; - // 2 temporaries the size of the forward input; used for dropout and // aggregation (if either are required) std::vector in_temp_1_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 481784dc82..a75bb47498 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -362,6 +362,9 @@ void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, file_stream.close(); + galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); } @@ -414,6 +417,10 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( } } full_feature_set.reset(); + + galois::gInfo( + host_prefix_, "Read ", local_node_features_.size(), " features (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } @@ -438,19 +445,23 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( mask_range->end = range_end; mask_range->size = range_end - range_begin; - size_t cur_line_num = 0; + size_t cur_line_num = 0; + // valid nodes on this host size_t local_sample_count = 0; + // this tracks TOTAL # of valid nodes in this group (not necessarily valid + // ones on this host) + size_t valid_count = 0; std::string line; // each line is a number signifying if mask is set for the vertex while (std::getline(mask_stream, line)) { std::istringstream mask_stream(line); // only examine vertices/lines in range if (cur_line_num >= range_begin && cur_line_num < range_end) { - // only bother if node is local - if (partitioned_graph_->isLocal(cur_line_num)) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + valid_count++; + if (partitioned_graph_->isLocal(cur_line_num)) { masks[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } @@ -460,7 +471,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); - if (local_sample_count != mask_range->size) { + if (valid_count != mask_range->size) { // overlapping masks: need to actually check the masks rather than use // ranges if (!incomplete_masks_) { @@ -470,7 +481,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( incomplete_masks_ = true; } - return local_sample_count; + return valid_count; } void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { @@ -513,12 +524,20 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } else { // XXX i can get local sample counts from here if i need it - ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.data()); - ReadLocalMasksFromFile(dataset_name, 
"val", &global_validation_mask_range_, - local_validation_mask_.data()); - ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.data()); + size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", + &global_training_mask_range_, + local_training_mask_.data()); + size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", + &global_validation_mask_range_, + local_validation_mask_.data()); + size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", + &global_testing_mask_range_, + local_testing_mask_.data()); + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gInfo("Valid # training nodes is ", valid_train); + galois::gInfo("Valid # validation nodes is ", valid_val); + galois::gInfo("Valid # test nodes is ", valid_test); + } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 1084bf9010..cde5698a93 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,14 +9,22 @@ galois::GNNLayer::GNNLayer(size_t layer_num, : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { if (config_.allocate_weights) { - // TODO some of this does not need alloc if not used // dropout allocation; dropout is same as input - dropout_mask_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, false); + if (!config_.disable_dropout) { + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns, + false); + } // allocate memory based on layer dimensions size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weights_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_.resize(num_weight_elements, 0); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -27,22 +35,25 @@ galois::GNNLayer::GNNLayer(size_t layer_num, #endif GlorotBengioInit(&layer_weights_); - - // initialize sync substrate - gradient_sync_interface_ = - std::make_unique(layer_weight_gradients_); - gradient_sync_substrate_ = std::make_unique< - galois::graphs::GluonSubstrate>( - *gradient_sync_interface_, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); forward_output_matrix_.resize(num_output_elements, 0); - backward_output_matrix_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + if (layer_number_ != 0) { + galois::gInfo( + graph_.host_prefix(), "Creating layer ", layer_number_, + ", backward output matrix ", + layer_dimensions_.input_rows * layer_dimensions_.input_columns, " (", + FloatElementsToGB(layer_dimensions_.input_rows * + layer_dimensions_.input_columns), + " GB)"); + backward_output_matrix_.resize( + layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + } #ifdef GALOIS_ENABLE_GPU if (device_personality == 
DevicePersonality::GPU_CUDA) { base_gpu_object_.InitInOutMemory(num_output_elements, @@ -277,10 +288,6 @@ void galois::GNNLayer::WeightGradientSyncSum() { // TODO(hochan) collectives here rather than gluon sync if possible like the // CPU code // preferably without needing to do a gpu->cpu copy - galois::gWarn( - "GPU still using inefficient point to point comms for weight sync"); - gradient_sync_substrate_->sync( - "WeightGradientsSync"); #else // TODO(loc) remove this limitation later; can just do a loop over the weight // matrix @@ -297,28 +304,6 @@ void galois::GNNLayer::WeightGradientSyncSum() { t.stop(); } -void galois::GNNLayer::SyncInitialWeights() { - if (galois::runtime::getSystemNetworkInterface().Num == 1) { - return; - } -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan); not required at the moment however - GALOIS_LOG_FATAL("Need to implement GPU version of this"); -#endif - // copy weights over to gradients - for (size_t i = 0; i < layer_weights_.size(); i++) { - layer_weight_gradients_[i] = layer_weights_[i]; - } - // sync "gradients" with a set only (reduction ignored) - gradient_sync_substrate_->sync( - "InitialSync"); - // copy "gradients" (actually weights) back to weight matrix - for (size_t i = 0; i < layer_weights_.size(); i++) { - layer_weights_[i] = layer_weight_gradients_[i]; - layer_weight_gradients_[i] = 0; - } -} - void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GluonGradientInterface.cpp b/libgnn/src/layers/GluonGradientInterface.cpp deleted file mode 100644 index 74e612af17..0000000000 --- a/libgnn/src/layers/GluonGradientInterface.cpp +++ /dev/null @@ -1,49 +0,0 @@ -#include "galois/layers/GluonGradientInterface.h" - -galois::GluonGradientInterface::GluonGradientInterface( - std::vector& gradients) - : gradients_(gradients), num_weights_(gradients_.size()) { - size_t my_host = galois::runtime::getSystemNetworkInterface().ID; - size_t num_hosts = galois::runtime::getSystemNetworkInterface().Num; - - // allocate a vector for each host - mirror_nodes_.resize(num_hosts); - - // loop through distribution of weights to hosts - for (unsigned h = 0; h < num_hosts; h++) { - std::pair cur_range = - galois::block_range((size_t)0, num_weights_, h, num_hosts); - - if (h != my_host) { - // setup mirrors for the host h which is just the list of IDs - size_t current_weight = cur_range.first; - size_t last_weight = cur_range.second; - size_t num_host_weights = last_weight - current_weight; - - // set mirrors for host h - mirror_nodes_[h].reserve(num_host_weights); - for (; current_weight < last_weight; current_weight++) { - mirror_nodes_[h].push_back(current_weight); - } - } else { - // these belong to this host; save, then mirror ranges can be - // calculated from this - begin_master_ = cur_range.first; - end_master_ = cur_range.second; - num_owned_ = end_master_ - begin_master_; - - // first range is 0 to begin master - if (begin_master_ > 0) { - mirror_ranges_.emplace_back(0, begin_master_); - } - - // second range is endMaster to end - if (end_master_ < num_weights_) { - mirror_ranges_.emplace_back(end_master_, num_weights_); - } - } - } - - galois::gDebug("[", my_host, "] Weight gradients: this host owns ", - begin_master_, " to ", end_master_); -} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index a60b1eb0c4..44d2df6529 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp 
@@ -10,15 +10,23 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_1_.resize(num_input_elements, 0); if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 2 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_2_.resize(num_input_elements, 0); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 191c02d00e..9f80000bdd 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -15,7 +15,13 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // abstractly it's one matrix: W = W1 | W2 size_t num_weight_elements = layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE second layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weights_2_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE second layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_2_.resize(num_weight_elements, 0); // reinit both weight matrices as one unit @@ -28,29 +34,28 @@ galois::SAGELayer::SAGELayer(size_t layer_num, // initialize the optimizer std::vector weight_size = {num_weight_elements}; second_weight_optimizer_ = std::make_unique(weight_size, 1); - - // initialize sync substrate for second set - gradient_sync_interface_2_ = - std::make_unique(layer_weight_gradients_2_); - gradient_sync_substrate_2_ = std::make_unique< - galois::graphs::GluonSubstrate>( - *gradient_sync_interface_2_, - galois::runtime::getSystemNetworkInterface().ID, - galois::runtime::getSystemNetworkInterface().Num, false); } size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_1_.resize(num_input_elements, 0); // only need to allocate if input <= output because not used otherwise if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 2 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); in_temp_2_.resize(num_input_elements, 0); } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - 
GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); out_temp_.resize(num_output_elements, 0); layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index 7025331029..c3e3439a5e 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -96,7 +96,9 @@ int main() { // size 2 columns for (size_t c = 0; c < 2; c++) { - GALOIS_LOG_ASSERT(layer_0_forward_output[row * 2 + c] == ground_truth); + GALOIS_LOG_VASSERT(layer_0_forward_output[row * 2 + c] == ground_truth, + "should be {} not {}", ground_truth, + layer_0_forward_output[row * 2 + c]); } } @@ -111,13 +113,10 @@ int main() { layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 + // sanity check layer 0 backward output: empty ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == test_graph->size() * 3); - for (size_t i = 0; i < layer_0_backward_output.size(); i++) { - GALOIS_LOG_ASSERT((layer_0_backward_output)[i] == 0); - } + + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index bcada6c4ed..309433845b 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -103,28 +103,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); diff --git a/libgnn/test/sage-layer-test.cpp 
b/libgnn/test/sage-layer-test.cpp index dadc8b0096..39a2cd2635 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -78,28 +78,7 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // since norm factors aren't invovled it is possible to do full assertions // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); + GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 0); galois::PointerWithSize layer_0_weight_gradients = layer_0->GetLayerWeightGradients(); From 3c15b652ee6e3d87bf6008a5f07d27fc45f458e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 9 Mar 2021 16:16:18 -0600 Subject: [PATCH 500/660] 2 minor fixes: paper100M split, features size Made training split for papers100M the entire graph (mask is non-contiguous and occupies most of the graph anyways). It did not start at 0 before which was also problematic on its own. Corrected feature GB print in GNNGraph. 
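The feature GB print fix below matters because the two vectors it could read from differ in length by a factor of the feature dimension: the label array stores one value per local node, while the feature matrix stores feature_length values per node, so the old print badly understated memory use. A hedged sketch of the conversion with illustrative (not measured) numbers:

#include <cstddef>
#include <cstdio>

// Same conversion as the corrected gInfo print: number of 4-byte floats to GB.
double FloatsToGB(std::size_t num_floats) {
  return num_floats * 4.0 / (1ull << 30);
}

int main() {
  std::size_t num_nodes      = 1000000; // illustrative local node count
  std::size_t feature_length = 128;     // illustrative feature dimension
  // old print read the label count (one float per node); new one reads features
  std::printf("labels:   %.4f GB\n", FloatsToGB(num_nodes));
  std::printf("features: %.4f GB\n", FloatsToGB(num_nodes * feature_length));
  return 0;
}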
--- libcusp/include/galois/graphs/NewGeneric.h | 6 +++--- libgnn/src/graphs/GNNGraph.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index c29127d9e6..710ba82996 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -120,9 +120,9 @@ class NewDistGraphGeneric : public DistGraph { bps.push_back(0); bps.push_back(86618); } else if (filename.find("ogbn-papers100M") != std::string::npos) { - // this is entire graph: amazon's mask isn't contiguous - bps.push_back(602); - bps.push_back(111052523); + // whole graph (non-contiguous mask) + bps.push_back(0); + bps.push_back(111059956); } else { // TODO(loc) only die under certain conditions; don't die if something // is missing diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a75bb47498..1bc4cb830e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -420,7 +420,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( galois::gInfo( host_prefix_, "Read ", local_node_features_.size(), " features (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), " GB)"); + local_node_features_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } From 5f1f89696c9000d619f4071011a349f2e4cda85c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 17:12:03 -0600 Subject: [PATCH 501/660] The big GNN space saving commit (part 1) This commit does the following: - Temporary/memoization matrices have been reduced to a minimum of only 1 per layer; its size is the smaller of the input and output matrices. This change, along with distributed execution/masking, required reordering some operations so that the right things are masked at the right time and nothing is overwritten before it is used. This has made the code more annoying to read as a result. Some cleanup will be necessary. - Forward output matrices of layer n now act as the backward output matrix for layer n + 1. This is a huge space savings, but again, it required some changes to keep correctness. For example, for the activation derivative I now have to keep a bitset marking which elements were originally greater than zero (originally this was done by checking the forward output matrix itself, but now it gets overwritten so you can't do that anymore). This change required a signature change to the layer constructors. - Tests now have to pass in a "forward output" for a layer to write the backward output into. This has made things slightly more annoying to read as well. Part 2 will make it so output layers do not allocate their own output matrices and just overwrite the input matrix. This will require some function signature changes, which is why it will be done in a separate commit. Also TODO: Distributed SAGE unit tests.
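The activation memo mentioned above is needed because a layer's forward output buffer is no longer private: a later layer may overwrite it before the backward pass runs, so the ReLU derivative can no longer re-check the forward output for positivity. A minimal sketch of the pattern, using std::vector<bool> as a stand-in for galois::DynamicBitSet and plain loops instead of galois::do_all:

#include <cstddef>
#include <vector>

// Remember which entries were positive during the forward ReLU so the
// derivative can still be computed after the output buffer has been reused.
struct ReluWithMemo {
  std::vector<bool> was_positive_;

  // Forward: clamp negatives to zero and memoize which entries were positive.
  void Forward(std::vector<float>* out) {
    was_positive_.assign(out->size(), false);
    for (std::size_t i = 0; i < out->size(); i++) {
      if ((*out)[i] > 0.0f) {
        was_positive_[i] = true;
      } else {
        (*out)[i] = 0.0f;
      }
    }
  }

  // Backward: pass the gradient through only where the forward value was
  // positive; zero it elsewhere, since dReLU/dx is 0 there.
  void Backward(std::vector<float>* gradient) const {
    for (std::size_t i = 0; i < gradient->size(); i++) {
      if (!was_positive_[i]) {
        (*gradient)[i] = 0.0f;
      }
    }
  }
};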
--- libgnn/include/galois/GNNMath.h | 3 + libgnn/include/galois/layers/DenseLayer.h | 5 +- libgnn/include/galois/layers/GNNLayer.h | 15 +- .../galois/layers/GraphConvolutionalLayer.h | 6 +- libgnn/include/galois/layers/L2NormLayer.h | 7 +- libgnn/include/galois/layers/SAGELayer.h | 11 +- libgnn/include/galois/layers/SigmoidLayer.h | 4 +- libgnn/include/galois/layers/SoftmaxLayer.h | 4 +- libgnn/src/GNNMath.cpp | 6 + libgnn/src/GraphNeuralNetwork.cpp | 23 +- libgnn/src/graphs/GNNGraph.cpp | 6 +- libgnn/src/layers/DenseLayer.cpp | 10 +- libgnn/src/layers/GNNLayer.cpp | 99 +++++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 118 ++++++---- libgnn/src/layers/L2NormLayer.cpp | 7 +- libgnn/src/layers/SAGELayer.cpp | 214 ++++++++++-------- libgnn/src/layers/SigmoidLayer.cpp | 7 +- libgnn/src/layers/SoftmaxLayer.cpp | 10 +- libgnn/test/aggregate-sync-test.cpp | 21 +- libgnn/test/back-conv-test.cpp | 6 +- libgnn/test/convlayer-test.cpp | 10 +- libgnn/test/l2norm-layer-test.cpp | 7 +- libgnn/test/sage-layer-test.cpp | 16 +- libgnn/test/sample-test.cpp | 16 +- libgnn/test/sigmoidlayer-test.cpp | 7 +- libgnn/test/softmaxlayer-test.cpp | 7 +- 26 files changed, 409 insertions(+), 236 deletions(-) diff --git a/libgnn/include/galois/GNNMath.h b/libgnn/include/galois/GNNMath.h index 9e50295200..dd7ee5b479 100644 --- a/libgnn/include/galois/GNNMath.h +++ b/libgnn/include/galois/GNNMath.h @@ -7,6 +7,9 @@ namespace galois { +//! zeros out a vector of some length +void VectorZero(size_t length, GNNFloat* a); + //! Find max index in a vector of some length size_t MaxIndex(const size_t length, const GNNFloat* vector); //! Given 2 float array pointers, do element wise addition of length elements diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index d9918f8c2e..bb651ca30e 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -10,12 +10,15 @@ class DenseLayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : DenseLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + : DenseLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig()) {} // Parent functions const PointerWithSize diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 47b38a7f73..4e83cdc145 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -44,6 +44,9 @@ struct GNNLayerDimensions { struct GNNLayerConfig { //! True if weights should be allocated bool allocate_weights{true}; + //! If true, disable allocation of the output matrix (used for output layers + //! which can overwrite the input, i.e. passthrough) + bool disable_output{false}; //! Turns off dropout of weights if enabled bool disable_dropout{false}; //! Rate at which to drop things if dropout is on @@ -77,17 +80,19 @@ struct GNNLayerConfig { //! Base class for layers in a graph neural network class GNNLayer { public: - GNNLayer() = delete; //! Creation of a layer needs the # of the layer, the graph to train on, and //! 
the input/output dimensions of the MxM that occurs in the layer; config //! as well GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); //! Uses a default config GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, GNNLayerConfig()) {} + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig()) {} GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase @@ -213,8 +218,6 @@ class GNNLayer { // want to allocate memory once to avoid runtime memory allocation. //! The output of the forward phase for this layer. std::vector forward_output_matrix_; - //! The output of the backward phase for this layer. - std::vector backward_output_matrix_; // These are wrapper around the pointer for the data associated with // any GNN layer: takes a CPU or GPU pointer depending on configuration @@ -223,6 +226,7 @@ class GNNLayer { PointerWithSize p_layer_weight_gradients_; PointerWithSize p_forward_output_matrix_; PointerWithSize p_backward_output_matrix_; + galois::DynamicBitSet activation_memo_; //! RNG for matrix initialization PerThreadRNG random_init_rng_{-5.0, 5.0}; @@ -292,6 +296,9 @@ class GNNLayer { } #endif + //! Mask a input size'd matrix's rows that correspond to mirrors + void MaskInputNonMasters(PointerWithSize* input); + //! Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* gradients); //! Does some math to get GB used by some # of floats diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index e44976f73b..d7a600096d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -16,14 +16,16 @@ class GraphConvolutionalLayer : public GNNLayer { //! 
weight matrix GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); GraphConvolutionalLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GraphConvolutionalLayer(layer_num, graph, dimensions, - GNNLayerConfig()) {} + : GraphConvolutionalLayer(layer_num, graph, backward_output_matrix, + dimensions, GNNLayerConfig()) {} // Parent functions const PointerWithSize diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 176c88700e..34ac3983e1 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -11,13 +11,16 @@ namespace galois { class L2NormLayer : public GNNLayer { public: L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : L2NormLayer(layer_num, graph, dimensions, + : L2NormLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}) {} L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config) { + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config) { layer_type_ = galois::GNNLayerType::kL2Norm; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 9dc7007931..056ea748c1 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -28,17 +28,22 @@ class SAGELayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! 
weight matrix SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config); SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : SAGELayer(layer_num, graph, dimensions, config, SAGELayerConfig()) {} + : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, config, + SAGELayerConfig()) {} SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : SAGELayer(layer_num, graph, dimensions, GNNLayerConfig(), - SAGELayerConfig()) {} + : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig(), SAGELayerConfig()) {} void InitSelfWeightsTo1() { if (layer_weights_2_.size()) { diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 7efe8cd9db..5a2f9f6894 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -11,8 +11,10 @@ namespace galois { class SigmoidLayer : public GNNLayer { public: SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 7bf29272cd..444a383386 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -12,8 +12,10 @@ namespace galois { class SoftmaxLayer : public GNNLayer { public: SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, dimensions, + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}), #ifdef GALOIS_ENABLE_GPU gpu_object_(graph.GetGPUGraph()), diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index aef3dae6dd..afb3712981 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -3,6 +3,12 @@ #include "galois/GNNMath.h" #include "galois/Logging.h" +void galois::VectorZero(size_t length, GNNFloat* a) { + for (size_t i = 0; i < length; i++) { + a[i] = 0; + } +} + size_t galois::MaxIndex(const size_t length, const GNNFloat* vector) { size_t index = 0; GNNFloat cur_max = vector[0]; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b31a31ecd1..9e944d0568 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -22,6 +22,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( graph_->ResizeLayerVector(config_.num_intermediate_layers()); } #endif + // used for chaining layers together; begins as nullptr + PointerWithSize prev_output_layer(nullptr, 0); + // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { GNNLayerType layer_type = config_.intermediate_layer_type(i); @@ -43,7 +46,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( switch (layer_type) { 
case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { graph_->InitLayerVectorMetaObjects( @@ -54,21 +58,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; case GNNLayerType::kSAGE: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu #endif break; case GNNLayerType::kL2Norm: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) l2 layer gpu #endif break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, layer_dims, config_.default_layer_config()))); + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) dense layer gpu #endif @@ -76,6 +83,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + // update output layer for next layer + prev_output_layer = gnn_layers_.back()->GetForwardOutput(); } // loop backward and find last GCN/SAGE (main) layer to disable activation @@ -102,11 +111,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, output_dims))); + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); break; case (GNNOutputLayerType::kSigmoid): gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, output_dims))); + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1bc4cb830e..89cdca94e9 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -418,9 +418,9 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( } full_feature_set.reset(); - galois::gInfo( - host_prefix_, "Read ", local_node_features_.size(), " features (", - local_node_features_.size() * double{4} / (1 << 30), " GB)"); + galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), + " features (", + local_node_features_.size() * double{4} / (1 << 30), " GB)"); GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); } diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index b2da6bf010..75e715e482 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -2,11 +2,11 @@ #include "galois/GNNMath.h" #include "galois/layers/DenseLayer.h" -galois::DenseLayer::DenseLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config), +galois::DenseLayer::DenseLayer( + size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* 
backward_output_matrix, + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index cde5698a93..b88f91b631 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -4,6 +4,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), @@ -37,23 +38,29 @@ galois::GNNLayer::GNNLayer(size_t layer_num, GlorotBengioInit(&layer_weights_); } - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", forward output matrix ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - forward_output_matrix_.resize(num_output_elements, 0); + if (!config_.disable_output) { + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + forward_output_matrix_.resize(num_output_elements, 0); + } + if (layer_number_ != 0) { - galois::gInfo( - graph_.host_prefix(), "Creating layer ", layer_number_, - ", backward output matrix ", - layer_dimensions_.input_rows * layer_dimensions_.input_columns, " (", - FloatElementsToGB(layer_dimensions_.input_rows * - layer_dimensions_.input_columns), - " GB)"); - backward_output_matrix_.resize( - layer_dimensions_.input_rows * layer_dimensions_.input_columns, 0); + GALOIS_LOG_VASSERT( + backward_output_matrix->size() == + layer_dimensions_.input_rows * layer_dimensions_.input_columns, + "backward output size {} should equal input size {}", + backward_output_matrix->size(), + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr, + "layer 0 should null ptr backward output"); + GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0, + "layer 0 should size 0 backward output"); } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.InitInOutMemory(num_output_elements, @@ -68,8 +75,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = PointerWithSize( - base_gpu_object_.backward_output(), backward_output_matrix_.size()); + p_backward_output_matrix_ = *backward_output_matrix; // TODO can clear the cpu side vectors/don't use .size() since optimally // they aren't initialized } else { @@ -80,8 +86,7 @@ galois::GNNLayer::GNNLayer(size_t layer_num, PointerWithSize(layer_weight_gradients_); p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); - p_backward_output_matrix_ = - PointerWithSize(backward_output_matrix_); + p_backward_output_matrix_ = *backward_output_matrix; #ifdef GALOIS_ENABLE_GPU } #endif @@ -221,7 +226,7 @@ void galois::GNNLayer::ReconstructDropoutMatrix( void 
galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); timer.start(); - assert(backward_output_matrix_.size() == dropout_mask_.size()); + assert(p_backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. - config_.dropout_rate); #ifdef GALOIS_ENABLE_GPU @@ -232,11 +237,12 @@ void galois::GNNLayer::DoDropoutDerivative() { #endif // use dropout mask to figure out derivative galois::do_all( - galois::iterate(static_cast(0), backward_output_matrix_.size()), + galois::iterate(static_cast(0), + p_backward_output_matrix_.size()), [&](size_t i) { - backward_output_matrix_[i] = backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * - scale; + p_backward_output_matrix_[i] = + p_backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * scale; }, galois::loopname("LayerDropoutDerivative")); #ifdef GALOIS_ENABLE_GPU @@ -249,13 +255,22 @@ void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); timer.start(); + if (activation_memo_.size() == 0) { + activation_memo_.resize(forward_output_matrix_.size()); + } + activation_memo_.reset(); + // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( galois::iterate(static_cast(0), forward_output_matrix_.size()), [&](size_t i) { - forward_output_matrix_[i] = - std::max(forward_output_matrix_.at(i), static_cast(0)); + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } }, galois::loopname("ReLU")); timer.stop(); @@ -268,14 +283,14 @@ void galois::GNNLayer::ActivationDerivative( // TODO only does relu at the moment; should check user specified activation // and act accordingly - // keep gradient if the original output is greater than 0 + // keep gradient if the original output was greater than 0 galois::do_all( galois::iterate(static_cast(0), gradient->size()), [&](size_t i) { - (*gradient)[i] = - (forward_output_matrix_.at(i) > static_cast(0)) - ? 
(*gradient)[i] - : static_cast(0); + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } }, galois::loopname("ReLU-Derivative")); timer.stop(); @@ -304,6 +319,28 @@ void galois::GNNLayer::WeightGradientSyncSum() { t.stop(); } +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { +#ifdef GALOIS_ENABLE_GPU + // TODO(hochan) mask away the **non** masters on gpu + GALOIS_LOG_FATAL("implement this"); +#else + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.size(); + size_t row_index = layer_dimensions_.input_columns; + assert((row_index * layer_dimensions_.input_rows) == input->size()); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#endif +} + void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 44d2df6529..6f86cf1395 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -4,18 +4,25 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, dimensions, config), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); - if (config_.disable_aggregate_after_update || + if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); + in_temp_1_.resize(num_input_elements, 0); + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); @@ -24,10 +31,17 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); + + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + // 
xform matrix first to work with a smaller output size + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", GCN output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + out_temp_.resize(num_output_elements, 0); + } + layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -59,24 +73,27 @@ galois::GraphConvolutionalLayer::ForwardPhase( GALOIS_LOG_VERBOSE("Calling forward phase"); assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_in_temp_1_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; // first, dropout if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); } // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, - p_in_temp_2_.data(), &input_column_intermediates_); - UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); } else { // update to aggregate // FW @@ -115,43 +132,34 @@ galois::GraphConvolutionalLayer::BackwardPhase( // AFW = O galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; if (!config_.disable_dropout) { // dropout result is currently stored in temp 1 // needs to be used before it gets overwritten input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; } else { // no dropout = use vanilla input input_data = prev_layer_input; + agg_data = p_in_temp_1_; } + // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; + // BEWARE OF DEPENDENCIES + // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(p_in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - // TODO if training A, then A' compute here if layer # is 0 - // dot product of edges that exist in A - } - // weight gradient calculation - // TODO(loc) put this in a function to put the ifdef in there - MaskGradientNonMasters(input_gradient); + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + MaskInputNonMasters(&agg_data); + #ifdef GALOIS_ENABLE_GPU if (device_personality 
== DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), + layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif @@ -159,11 +167,26 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), + agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif + + // gradient isn't masked here; only temp1, which has already been + // overwritten = fine + if (layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } } else { // TODO at this point, out_temp contains memoized FW // can use it to get A' = O' (FW)^T @@ -172,15 +195,19 @@ galois::GraphConvolutionalLayer::BackwardPhase( // this is (FW)' AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); + + // done after above because input_data = p_backward_output_matrix in some + // cases; use first before overwriting here if layer # doesn't = 0, it means + // I can mess with the input data itself instad of masking the gradients I + // can mask the input if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(&p_out_temp_); } - // W' = F^T (FW)' - MaskGradientNonMasters(&p_out_temp_); - // TODO put this in a function + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( @@ -197,6 +224,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( #ifdef GALOIS_ENABLE_GPU } #endif + + if (layer_number_ != 0) { + // can now overwrite p_backward without issue; since input gradient + // is untouched if layer number isn't 0 this will be correct + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); + } } // sync weight gradients; note aggregation sync occurs in the function call diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index a29fccab1d..d7c04c52e9 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -62,7 +62,8 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhase( galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { - backward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); const size_t 
feature_length = layer_dimensions_.input_columns; // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of @@ -108,7 +109,7 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( for (size_t row_index = row_offset; row_index < (row_offset + feature_length); row_index++) { - backward_output_matrix_[row_index] = + p_backward_output_matrix_[row_index] = denominator * (prev_layer_input[row_index] * mult_with_input + (*input_gradient)[row_index] * running_square_sum); @@ -117,5 +118,5 @@ galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( }, galois::loopname("L2Backward")); - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9f80000bdd..8fde856ac8 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -4,10 +4,12 @@ galois::SAGELayer::SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config) - : GNNLayer(layer_num, graph, dimensions, config), sage_config_(sage_config), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), + sage_config_(sage_config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { if (!sage_config_.disable_concat) { @@ -38,13 +40,20 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); - // only need to allocate if input <= output because not used otherwise - if (config_.disable_aggregate_after_update || + + // if in temp is smaller than out temp, or if dropout exists + if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE input temp var 1 ", num_input_elements, " (", + FloatElementsToGB(num_input_elements), " GB)"); + in_temp_1_.resize(num_input_elements, 0); + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); @@ -53,10 +62,16 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); + + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", SAGE output temp var ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " 
GB)"); + out_temp_.resize(num_output_elements, 0); + } + layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) GPU SAGE @@ -112,15 +127,18 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() == (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_in_temp_1_.size() == input_embeddings.size()); assert(p_forward_output_matrix_.size() == (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; // first, dropout if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); } // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part @@ -130,9 +148,9 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, - p_in_temp_2_.data(), &input_column_intermediates_); - UpdateEmbeddings(p_in_temp_2_.data(), p_forward_output_matrix_.data()); + AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); } else { // update to aggregate // FW @@ -176,15 +194,47 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // if dropout was used, use the dropout matrix for the input - galois::PointerWithSize input_to_use; + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; if (!config_.disable_dropout) { // dropout result is currently stored in temp 1 // needs to be used before it gets overwritten - input_to_use = p_in_temp_1_; + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; } else { // no dropout = use vanilla input - input_to_use = prev_layer_input; + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // aggregate this here before gradient starts to get overwritten + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), + p_out_temp_.data(), &output_column_intermediates_, true); + } + + if (!sage_config_.disable_concat) { + if (layer_number_ != 0) { + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(input_gradient); + } + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_data.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); } + WeightGradientSyncSum2(); // AFW = O @@ -192,32 +242,19 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= 
layer_dimensions_.output_columns) { - if (layer_number_ != 0) { - // ---unmasked--- - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - assert(p_in_temp_1_.size() == - layer_dimensions_.input_columns * layer_dimensions_.input_rows); - // pintemp1 contains (AF)' - // overwrites the dropout matrix that was in ptemp1 (needed for second - // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + if (layer_number_ != 0 || sage_config_.disable_concat) { + MaskInputNonMasters(&agg_data); } - // weight gradient calculation - // TODO(loc) put this in a function to put the ifdef in there - // ---masked--- - MaskGradientNonMasters(input_gradient); + // if concat is disabled, then input grad isn't masked; therefore, mask + // this to get the same effect + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, p_in_temp_2_.data(), + layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif @@ -225,91 +262,76 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, - p_in_temp_2_.data(), input_gradient->data(), + agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif + + // 0 means input gradient shouldn't get masked + if (layer_number_ != 0) { + // ---unmasked--- + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), + p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } } else { - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' // --unmasked-- - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + // disable concat part is here because otherwise it would get done elsewhere + if (layer_number_ != 0 && sage_config_.disable_concat) { + MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + MaskGradientNonMasters(&p_out_temp_); } - // TODO put this in a function + // W' = F^T (FW)' - // 
input to use is not overwritten in this branch so it's safe to use - // --- masked ---, uses ptemp1 - MaskGradientNonMasters(&p_out_temp_); + // TODO put this in a function #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_to_use.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_to_use.data(), + layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU } #endif - } - - if (!sage_config_.disable_concat) { - // Fw1 + AFW2 = O; self feature has own weight matrix and makes own - // contribution to gradients which is handled in this block - // second weight matrix: reconstruct the dropout matrix if it was - // overwritten into temp1 - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (!config_.disable_dropout) { - // input gradients have already been masked; need to reconstruct the - // dropout matrix which we can do since we saved the dropout mask - // save it into ptemp1 - ReconstructDropoutMatrix(prev_layer_input, &p_in_temp_1_); - // !!!NOTE!!! - // If you're using dropout in the distributed setting you've already - // thrown consistency out the window anyways because distributed RNG - // will make it so each host does something different - // Therefore, this op above is nothing more than a feeble attempt - // at getting *some* notion of consistency - } - } else { - // mask original input gradients since this path masks the aggregated - // gradients only - MaskGradientNonMasters(input_gradient); - // in dropout case, ptemp1 (contained in input to use) still contains the - // dropout matrix so no need to recompute - } - - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_to_use.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - WeightGradientSyncSum2(); if (layer_number_ != 0) { - // deal with feature gradients for the self feature here - // this function will sum directly into the backward matrix - SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), - p_backward_output_matrix_.data()); + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(p_out_temp_.data(), + p_backward_output_matrix_.data()); } } - WeightGradientSyncSum(); + // full gradient needed here; should occur after all updates + if (layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + // input gradient never gets masked if layer number != 0 + SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), + p_backward_output_matrix_.data()); + } + if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); } diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 317811b6df..f2a421bed1 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -68,7 +68,8 @@ galois::SigmoidLayer::ForwardPhase( galois::PointerWithSize galois::SigmoidLayer::BackwardPhaseCPU() { 
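  // Note on the zeroing below: the backward output is no longer a std::vector
  // owned by this layer; p_backward_output_matrix_ is a PointerWithSize view
  // over the buffer passed in at construction (the previous layer's forward
  // output), so .assign() cannot be used and the view is instead cleared in
  // place with a parallel loop.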
const size_t feature_length = layer_dimensions_.input_columns; - backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -86,7 +87,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { // sigmoid-cross-entropy derivative: turns out all it is is simple // subtraction for (unsigned index = 0; index < feature_length; index++) { - backward_output_matrix_[node_offset + index] = + p_backward_output_matrix_[node_offset + index] = forward_output_matrix_[node_offset + index] - ground_truth[index]; } @@ -94,7 +95,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SigmoidBackward")); - return backward_output_matrix_; + return p_backward_output_matrix_; } galois::PointerWithSize diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 10ed93c8ff..f0ded3ac49 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -79,8 +79,8 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; - // zero out output - backward_output_matrix_.assign(backward_output_matrix_.size(), 0); + galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), + [&](size_t i) { p_backward_output_matrix_[i] = 0; }); galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -101,11 +101,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { for (size_t idx = 0; idx < feature_length; idx++) { if (idx == correct) { // positive class - backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] = forward_output_matrix_[node * feature_length + idx] - 1; } else { // negative class - backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] = forward_output_matrix_[node * feature_length + idx]; } } @@ -113,7 +113,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SoftmaxBackward")); - return PointerWithSize(backward_output_matrix_); + return p_backward_output_matrix_; } galois::PointerWithSize diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index c3e3439a5e..eac1f89c84 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -34,13 +34,17 @@ int main() { dimension_0.input_columns = 3; dimension_0.output_columns = 2; galois::GNNLayerConfig l_config; - l_config.disable_aggregate_after_update = false; l_config.DebugConfig(); + l_config.disable_aggregate_after_update = true; + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(test_graph->size() * 3); + galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner galois::PointerWithSize layer_0_forward_output = @@ -122,8 +126,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique(1, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 1, *(test_graph.get()), &p_back, 
dimension_0, l_config); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph->GetLocalFeatures()); @@ -229,7 +233,7 @@ int main() { // create the layer, no norm factor layer_0 = std::make_unique( - 0, *(test_graph_2.get()), dimension_0, l_config); + 0, *(test_graph_2.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -293,8 +297,11 @@ int main() { } } + std::vector back_matrix_2(test_graph_2->size() * 3); + galois::PointerWithSize p_back_2(back_matrix_2); + layer_1 = std::make_unique( - 1, *(test_graph_2.get()), dimension_0, l_config); + 1, *(test_graph_2.get()), &p_back_2, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1_forward_output = layer_1->ForwardPhase(test_graph_2->GetLocalFeatures()); diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index b1c9c025c6..480058f6ae 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -60,14 +60,18 @@ int main() { galois::GNNLayerConfig dcon; dcon.DebugConfig(); + dcon.disable_aggregate_after_update = true; // dummy 1 matrix std::vector dummy_ones_v(test_graph.size() * 2, 1); galois::PointerWithSize dummy_ones(dummy_ones_v); + std::vector back_matrix(test_graph.size() * 3); + galois::PointerWithSize p_back(back_matrix); + // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 309433845b..5902d059fa 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -55,9 +55,13 @@ int main() { dcon.disable_aggregate_after_update = false; dcon.DebugConfig(); + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, test_graph, + std::make_unique(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -122,7 +126,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -199,7 +203,7 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr layer_2 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index a66c419a7f..ca30c99ac0 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -35,8 +35,11 @@ int main() { l2_input[12] = 4; l2_input[13] = 3; - auto l2_layer = - std::make_unique(2, test_graph, dimension_0); + std::vector back_matrix(14); + galois::PointerWithSize p_back(back_matrix); + + auto l2_layer = std::make_unique(2, test_graph, &p_back, + dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/sage-layer-test.cpp 
b/libgnn/test/sage-layer-test.cpp index 39a2cd2635..830e147a7c 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -28,9 +28,13 @@ int main() { galois::SAGELayerConfig scon; scon.disable_concat = false; + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr layer_0 = - std::make_unique(0, test_graph, dimension_0, dcon, - scon); + std::make_unique(0, test_graph, &p_null, dimension_0, + dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -109,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, dimension_0, - dcon, scon); + auto layer_1 = std::make_unique(1, test_graph, &p_back, + dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -201,8 +205,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(1, test_graph, dimension_0, - config, scon); + auto layer_2 = std::make_unique(1, test_graph, &p_back, + dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 063ff80ca5..3540582ade 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -40,8 +40,11 @@ int main() { ////////////////////////////////////////////////////////////////////////////// + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); @@ -139,8 +142,11 @@ int main() { softmax_input[40] = 1; softmax_input[48] = 1; - auto output_layer = - std::make_unique(3, test_graph, dimension_out); + std::vector back_matrix_2(49); + galois::PointerWithSize p_back_2(back_matrix_2); + + auto output_layer = std::make_unique( + 3, test_graph, &p_back_2, dimension_out); output_layer->EnableSampling(); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); @@ -183,8 +189,8 @@ int main() { galois::graphs::GNNGraph multi_graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, false); - auto sigmoid_layer = - std::make_unique(3, multi_graph, dimension_out); + auto sigmoid_layer = std::make_unique( + 3, multi_graph, &p_back_2, dimension_out); sigmoid_layer->EnableSampling(); // reuse softmax input; only thing interested in is checking for 0s prediction_distribution = sigmoid_layer->ForwardPhase(softmax_input); diff --git a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp index 333651bdf5..0bc2cd7252 100644 --- a/libgnn/test/sigmoidlayer-test.cpp +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -47,9 +47,12 @@ int main() { softmax_input[40] = 0; softmax_input[48] = 0; + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); output_layer->ForwardPhase(softmax_input); galois::PointerWithSize asdf = diff --git a/libgnn/test/softmaxlayer-test.cpp 
b/libgnn/test/softmaxlayer-test.cpp index 7a6de416dc..b85e0b4bb6 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -39,9 +39,12 @@ int main() { softmax_input[40] = 1; softmax_input[48] = 1; + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); From d95af12fbb373177ed500d587b85b7fea757a21c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 17:27:17 -0600 Subject: [PATCH 502/660] Backward pass call takes non-const forward input Allow Backward Pass to mess with the input from the previous layer. Reason for this is to let the output layers reuse it to save significant space. --- libgnn/include/galois/layers/DenseLayer.h | 2 +- libgnn/include/galois/layers/GNNLayer.h | 2 +- libgnn/include/galois/layers/GraphConvolutionalLayer.h | 2 +- libgnn/include/galois/layers/L2NormLayer.h | 4 ++-- libgnn/include/galois/layers/SAGELayer.h | 2 +- libgnn/include/galois/layers/SigmoidLayer.h | 2 +- libgnn/include/galois/layers/SoftmaxLayer.h | 2 +- libgnn/src/layers/L2NormLayer.cpp | 2 +- libgnn/src/layers/SigmoidLayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 10 files changed, 11 insertions(+), 11 deletions(-) diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index bb651ca30e..7b00d1987c 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -25,7 +25,7 @@ class DenseLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4e83cdc145..b5fb109ffe 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -154,7 +154,7 @@ class GNNLayer { //! @returns Output of the backward phase (i.e. input to previous layer); note //! it's a pointer because layer can mess with it virtual PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) = 0; //! 
Given an optimizer, update the weights in this layer based on gradients diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index d7a600096d..988276965d 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -32,7 +32,7 @@ class GraphConvolutionalLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 34ac3983e1..0ed1a0d0df 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -31,7 +31,7 @@ class L2NormLayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings); PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient); private: @@ -39,7 +39,7 @@ class L2NormLayer : public GNNLayer { ForwardPhaseCPU(const PointerWithSize input_embeddings); PointerWithSize - BackwardPhaseCPU(const PointerWithSize prev_layer_input, + BackwardPhaseCPU(PointerWithSize prev_layer_input, PointerWithSize* input_gradient); //! No op diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 056ea748c1..b5ee978067 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -61,7 +61,7 @@ class SAGELayer : public GNNLayer { ForwardPhase(const PointerWithSize input_embeddings) final; PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 5a2f9f6894..209929bf30 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -32,7 +32,7 @@ class SigmoidLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards //! multiclass ground truth. PointerWithSize - BackwardPhase(const PointerWithSize, + BackwardPhase(PointerWithSize, PointerWithSize*) final; private: diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 444a383386..5fae882531 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -43,7 +43,7 @@ class SoftmaxLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
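  //! Note: the previous layer's input is no longer taken as const here, so
  //! output layers may overwrite it in place and avoid allocating a separate
  //! matrix, as described in this commit's message.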
PointerWithSize - BackwardPhase(const PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index d7c04c52e9..bcf66eb2f9 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -51,7 +51,7 @@ galois::L2NormLayer::ForwardPhaseCPU( } galois::PointerWithSize galois::L2NormLayer::BackwardPhase( - const PointerWithSize prev_layer_input, + PointerWithSize prev_layer_input, PointerWithSize* input_gradient) { #ifdef GALOIS_ENABLE_GPU // TODO diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index f2a421bed1..1809decc8a 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -99,7 +99,7 @@ galois::SigmoidLayer::BackwardPhaseCPU() { } galois::PointerWithSize -galois::SigmoidLayer::BackwardPhase(const PointerWithSize, +galois::SigmoidLayer::BackwardPhase(PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU // TODO(loc) when GPU needs it diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f0ded3ac49..57eebd005e 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -117,7 +117,7 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { } galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(const PointerWithSize, +galois::SoftmaxLayer::BackwardPhase(PointerWithSize, PointerWithSize*) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { From 347018f1ee39b8e467da1e9baec608b1d092405d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 11 Mar 2021 20:12:57 -0600 Subject: [PATCH 503/660] GNN space saving (part 2): no softmax output The softmax layer now reuses the output layer from the layer before it in all of its operations, making it possible to completely avoid allocating another matrix (resulting in *huge* space savings if # of input classes is sufficiently large). The problem with this is now you cannot check the output matrix of the layer before it (because softmax will destroy it), and running backward phase will destroy the predictions (meaning you have to check accuracy before you do gradient descent). Some tests had to be changed as a result of the changes described above. --- libgnn/include/galois/layers/SoftmaxLayer.h | 7 ++- libgnn/src/GraphNeuralNetwork.cpp | 5 +- libgnn/src/layers/SoftmaxLayer.cpp | 25 ++++---- libgnn/test/gnnfb-test.cpp | 64 ++++++++++----------- libgnn/test/softmaxlayer-test.cpp | 37 ++++++------ 5 files changed, 73 insertions(+), 65 deletions(-) diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 5fae882531..433d055f83 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -15,8 +15,9 @@ class SoftmaxLayer : public GNNLayer { PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false}), + : GNNLayer( + layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false, .disable_output = true}), #ifdef GALOIS_ENABLE_GPU gpu_object_(graph.GetGPUGraph()), #endif @@ -43,7 +44,7 @@ class SoftmaxLayer : public GNNLayer { //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
PointerWithSize - BackwardPhase(PointerWithSize prev_layer_input, + BackwardPhase(PointerWithSize in_out, PointerWithSize* input_gradient) final; private: diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9e944d0568..4942076b23 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -171,11 +171,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); } const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the predictions + // matrix + train_accuracy = GetGlobalAccuracy(predictions); GradientPropagation(); epoch_timer.stop(); - train_accuracy = GetGlobalAccuracy(predictions); - if (this_host == 0) { const std::string t_name_acc = "TrainEpoch" + std::to_string(epoch) + "Accuracy"; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 57eebd005e..47d5f2ce0b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -5,8 +5,8 @@ const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; #ifndef NDEBUG //#ifdef NDEBUG @@ -20,14 +20,17 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) + if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + VectorZero(feature_length, + &p_backward_output_matrix_[i * feature_length]); return; + } } if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &forward_output_matrix_[feature_length * i]); + &p_backward_output_matrix_[feature_length * i]); // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -40,12 +43,15 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // calculate loss for this LID (note not all i will be filled) input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), - &forward_output_matrix_[feature_length * i]); + &p_backward_output_matrix_[feature_length * i]); #ifndef NDEBUG //#ifdef NDEBUG loss_accum += input_loss_[i]; handled += 1; #endif + } else { + VectorZero(feature_length, + &p_backward_output_matrix_[i * feature_length]); } }, // TODO chunk size? 
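// A minimal, self-contained sketch (plain C++, not this library's API) of why
// softmax + cross-entropy can share a single buffer as the hunks above do:
// the forward pass may overwrite the logits with probabilities, and the
// backward pass only needs those probabilities plus the label to form the
// gradient (probability - onehot) in the same storage. All names below are
// illustrative.
#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Overwrites `row` (logits) with softmax probabilities.
void SoftmaxInPlace(std::vector<float>& row) {
  float max_val = *std::max_element(row.begin(), row.end());
  float sum     = 0.0f;
  for (float& v : row) {
    v = std::exp(v - max_val); // subtract the max for numerical stability
    sum += v;
  }
  for (float& v : row) {
    v /= sum;
  }
}

// Cross-entropy loss for a single ground-truth class, given probabilities.
float CrossEntropy(const std::vector<float>& prob, size_t label) {
  return -std::log(std::max(prob[label], 1e-30f));
}

// Overwrites `row` (probabilities) with d(loss)/d(logits); for softmax +
// cross-entropy this is simply probability - onehot(label).
void SoftmaxCrossEntropyGradientInPlace(std::vector<float>& row, size_t label) {
  row[label] -= 1.0f;
}

int main() {
  std::vector<float> row = {1.0f, 2.0f, 0.5f}; // logits for one node
  SoftmaxInPlace(row);                         // forward: row now holds probabilities
  float loss = CrossEntropy(row, /*label=*/1); // read loss/accuracy here...
  SoftmaxCrossEntropyGradientInPlace(row, 1);  // ...because backward overwrites row
  return loss > 0.0f ? 0 : 1;
}
// As the commit message notes, the backward step destroys the predictions, so
// accuracy has to be read off the shared buffer before gradients are taken.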
@@ -58,7 +64,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::gPrint("Loss is ", reduced_loss / t, "\n"); #endif - return forward_output_matrix_; + return p_backward_output_matrix_; } const galois::PointerWithSize @@ -79,9 +85,6 @@ galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned node) { @@ -102,11 +105,11 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { if (idx == correct) { // positive class p_backward_output_matrix_[node * feature_length + idx] = - forward_output_matrix_[node * feature_length + idx] - 1; + p_backward_output_matrix_[node * feature_length + idx] - 1; } else { // negative class p_backward_output_matrix_[node * feature_length + idx] = - forward_output_matrix_[node * feature_length + idx]; + p_backward_output_matrix_[node * feature_length + idx]; } } } @@ -123,7 +126,7 @@ galois::SoftmaxLayer::BackwardPhase(PointerWithSize, if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.BackwardPhaseGPU( layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_forward_output_matrix_.data(), p_backward_output_matrix_.data()); + p_backward_output_matrix_.data(), p_backward_output_matrix_.data()); return p_backward_output_matrix_; } #endif diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index 091c6f01c8..b99c8aeb8d 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -44,7 +44,7 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // forward phase ////////////////////////////////////////////////////////////////////////////// - gnn->DoInference(); + const galois::PointerWithSize fo_out = gnn->DoInference(); // check output for layers to make sure it's as expected galois::PointerWithSize lf0_out = @@ -72,33 +72,36 @@ int main() { GALOIS_LOG_ASSERT(lf0_out[24 + i] == 15); } - const galois::PointerWithSize lf1_out = - gnn->GetIntermediateLayer(1)->GetForwardOutput(); - GALOIS_LOG_ASSERT(lf1_out.size() == 49); - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[0 + i] == 24); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); - } - for (size_t i = 0; i < 7; i++) { - GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); - } + // Disabled: this test worked in past because forward outputs were all + // separate matrices, but due to space saving measures this forward output + // gets messed with by the softmax call + + // const galois::PointerWithSize lf1_out = + // gnn->GetIntermediateLayer(1)->GetForwardOutput(); + // GALOIS_LOG_ASSERT(lf1_out.size() == 49); + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_VASSERT(lf1_out[0 + i] == 24, "{} vs {} (correct)", lf1_out[0 + + // i], 24); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[7 + i] == 60); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[14 + i] == 96); + //} + // for (size_t i = 0; i < 7; i++) { + // 
GALOIS_LOG_ASSERT(lf1_out[21 + i] == 144); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[28 + i] == 192); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[35 + i] == 156); + //} + // for (size_t i = 0; i < 7; i++) { + // GALOIS_LOG_ASSERT(lf1_out[42 + i] == 120); + //} - const galois::PointerWithSize fo_out = - gnn->GetOutputLayer()->GetForwardOutput(); GALOIS_LOG_ASSERT(fo_out.size() == 49); // since row all same, prob distribution across row should be same for (size_t c = 0; c < 49; c += 7) { @@ -127,9 +130,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// gnn->SetLayerPhases(galois::GNNPhase::kValidate); gnn->SetAllLayerWeightsTo1(); - gnn->DoInference(); const galois::PointerWithSize fo_out_val = - gnn->GetOutputLayer()->GetForwardOutput(); + gnn->DoInference(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(fo_out_val[c + i] == fo_out_val[c + i + 1]); @@ -150,9 +152,7 @@ int main() { // all but last should be 0s gnn->SetLayerPhases(galois::GNNPhase::kTest); gnn->SetAllLayerWeightsTo1(); - gnn->DoInference(); - galois::PointerWithSize fo_out_test = - gnn->GetOutputLayer()->GetForwardOutput(); + galois::PointerWithSize fo_out_test = gnn->DoInference(); for (size_t c = 0; c < 49; c += 7) { for (size_t i = 0; i < 6; i++) { GALOIS_LOG_ASSERT(fo_out_test[c + i] == fo_out_test[c + i + 1]); diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index b85e0b4bb6..66c4e557bc 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -48,13 +48,6 @@ int main() { galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); - galois::PointerWithSize asdf = - output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 1\n========\n"); - for (unsigned i = 0; i < asdf.size(); i++) { - printf("%f\n", asdf[i]); - } - // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(prediction_distribution[i * 7])) == @@ -71,15 +64,19 @@ int main() { GALOIS_LOG_ASSERT(prediction_distribution[i * 7 + 6] == 0.0); } + // NOTE: checked before backward because backward overwrites this matrix + + galois::PointerWithSize asdf = + output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 1\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } + // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); galois::PointerWithSize pd2 = output_layer->ForwardPhase(softmax_input); - asdf = output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 2\n========\n"); - for (unsigned i = 0; i < asdf.size(); i++) { - printf("%f\n", asdf[i]); - } // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); @@ -102,16 +99,16 @@ int main() { GALOIS_LOG_ASSERT(pd2[i * 7 + 6] == 0.0); } - // test mode - output_layer->SetLayerPhase(galois::GNNPhase::kTest); - galois::PointerWithSize pd3 = - output_layer->ForwardPhase(softmax_input); asdf = output_layer->BackwardPhase(softmax_input, nullptr); - printf("Output 3\n========\n"); + printf("Output 2\n========\n"); for (unsigned i = 0; i < asdf.size(); i++) { printf("%f\n", asdf[i]); } + // test mode + output_layer->SetLayerPhase(galois::GNNPhase::kTest); + galois::PointerWithSize pd3 = + output_layer->ForwardPhase(softmax_input); // validate vertex is index 6 
GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions @@ -124,4 +121,10 @@ int main() { GALOIS_LOG_ASSERT(pd3[i * 7 + 5] == 0.0); GALOIS_LOG_ASSERT(pd3[i * 7 + 6] == 0.0); } + + asdf = output_layer->BackwardPhase(softmax_input, nullptr); + printf("Output 3\n========\n"); + for (unsigned i = 0; i < asdf.size(); i++) { + printf("%f\n", asdf[i]); + } } From bc1b748ffb8ba82d1bea932ac21b4f5bcf8a644a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 17 Mar 2021 21:05:25 -0500 Subject: [PATCH 504/660] buffer overflow check Prevent send buffer from overflowing --- libgluon/include/galois/graphs/GluonSubstrate.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 2ef2e0b136..9e7a7738a4 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -2317,6 +2317,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { getSendBuffer(loopName, x, b, elem_size); + if (b.size() > static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("send buff limit limit reached: {}", b.size()); + } + if ((!async) || (b.size() > 0)) { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) From 81249b69370bbf4f11e3541e8614de6554e1fdb8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 15 Mar 2021 22:19:15 -0500 Subject: [PATCH 505/660] Remove host fence Unused from what I can tell. --- libdist/src/Barrier.cpp | 51 ----------------------------------------- 1 file changed, 51 deletions(-) diff --git a/libdist/src/Barrier.cpp b/libdist/src/Barrier.cpp index 455e22aaed..0558d8ebb4 100644 --- a/libdist/src/Barrier.cpp +++ b/libdist/src/Barrier.cpp @@ -41,52 +41,6 @@ #include "galois/runtime/BareMPI.h" namespace { -class HostFence : public galois::substrate::Barrier { -public: - virtual const char* name() const { return "HostFence"; } - - virtual void reinit(unsigned) {} - - //! control-flow barrier across distributed hosts - //! 
acts as a distributed-memory fence as well (flushes send and receives) - virtual void wait() { - auto& net = galois::runtime::getSystemNetworkInterface(); - - if (galois::runtime::evilPhase == 0) { - galois::gWarn("evilPhase is 0, implying loop-around or no use: fence " - "may not work correctly!"); - } - - for (unsigned h = 0; h < net.Num; ++h) { - if (h == net.ID) - continue; - galois::runtime::SendBuffer b; - galois::runtime::gSerialize(b, net.ID + 1); // non-zero message - net.sendTagged(h, galois::runtime::evilPhase, b); - } - net.flush(); // flush all sends - - unsigned received = 1; // self - while (received < net.Num) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - do { - net.handleReceives(); // flush all receives from net.sendMsg() or - // net.sendSimple() - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); - } while (!p); - assert(p->first != net.ID); - // ignore received data - ++received; - } - ++galois::runtime::evilPhase; - if (galois::runtime::evilPhase >= - static_cast( - std::numeric_limits::max())) { // limit defined by MPI or - // LCI - galois::runtime::evilPhase = 1; - } - } -}; class HostBarrier : public galois::substrate::Barrier { public: @@ -110,8 +64,3 @@ galois::substrate::Barrier& galois::runtime::getHostBarrier() { static HostBarrier b; return b; } - -galois::substrate::Barrier& galois::runtime::getHostFence() { - static HostFence b; - return b; -} From 39bec32762d7c58453330b4562a780d08fdb2c19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 31 Mar 2021 19:40:08 -0500 Subject: [PATCH 506/660] Adam optimizer fix: epsilon outside sqrt The epsilon value for the Adam optimizer is meant to prevent division by 0: it should not appear in the sqrt computation as it can greatly affect the gradient in some cases since it is in the denominator. --- libgnn/src/GNNOptimizers.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/src/GNNOptimizers.cpp b/libgnn/src/GNNOptimizers.cpp index 664de35e01..843e75a1a6 100644 --- a/libgnn/src/GNNOptimizers.cpp +++ b/libgnn/src/GNNOptimizers.cpp @@ -41,7 +41,7 @@ void galois::AdamOptimizer::GradientDescent( // weight update using bias corrected moments (matrix.data())[i] -= config_.alpha * bias_correct_first / - std::sqrt(bias_correct_second + config_.epsilon); + (std::sqrt(bias_correct_second) + config_.epsilon); }, galois::loopname("AdamOptimizerGradientDescent")); #ifdef GALOIS_ENABLE_GPU From fa1d597e4906351137df1b00afe6a5d676cf3e18 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 14 Apr 2021 16:56:43 -0500 Subject: [PATCH 507/660] GNNGraph now has edge sample bit and reverse edges After graph partitioning, reverse edges get constructed on each partition: this is to do correct aggregation later when sampling edges. A byte has been added to all edges as well to mark if the edge is "on" or "off" (i.e., sampled), with the corresponding in-edge sharing this data as necessary. This is all in preparation for distributed edge sampling. 
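As a rough sketch of the intended use (assuming only the GNNGraph accessors added in this
patch; the "every even destination" policy below is an arbitrary stand-in for a real
sampling decision): a flag written through an out-edge is visible through its mirrored
in-edge, so an aggregation over in-edges can skip edges that were left off.

    // Sketch only, not part of the diff: write the sample byte via out-edges,
    // then read it back through the mirrored in-edges while aggregating.
    #include "galois/graphs/GNNGraph.h"

    void SampleThenAggregate(galois::graphs::GNNGraph& graph) {
      // phase 1: decide which out-edges are "on"
      for (size_t src = 0; src < graph.size(); ++src) {
        for (auto ei : graph.edges(src)) {
          if (graph.GetEdgeDest(ei) % 2 == 0) { // illustrative policy only
            graph.MakeEdgeSampled(ei);          // shared per-edge byte set to 1
          }
        }
      }
      // phase 2: aggregate over in-edges, skipping edges left "off"; the
      // in-edge sees the same byte as the out-edge it mirrors
      for (size_t dst = 0; dst < graph.size(); ++dst) {
        for (auto ei : graph.in_edges(dst)) {
          if (!graph.IsInEdgeSampled(ei)) {
            continue;
          }
          // ... accumulate the feature of graph.GetInEdgeDest(ei) into dst ...
        }
      }
    }
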
--- .../include/galois/graphs/DistributedGraph.h | 45 +++++++++- .../include/galois/graphs/BufferedGraph.h | 66 ++++++++------ .../include/galois/graphs/LC_CSR_CSC_Graph.h | 46 +++++----- .../include/galois/graphs/LC_CSR_Graph.h | 61 ++++++------- libgnn/include/galois/graphs/GNNGraph.h | 53 +++++++++-- libgnn/src/graphs/GNNGraph.cpp | 18 ++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 8 +- libgnn/src/layers/SAGELayer.cpp | 8 +- libgnn/test/CMakeLists.txt | 4 + libgnn/test/aggregate-sync-test.cpp | 14 +-- libgnn/test/sample-bit-test.cpp | 88 +++++++++++++++++++ 11 files changed, 299 insertions(+), 112 deletions(-) create mode 100644 libgnn/test/sample-bit-test.cpp diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 42b659fa67..1c56302b93 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -31,6 +31,7 @@ #include #include "galois/graphs/LC_CSR_Graph.h" +#include "galois/graphs/LC_CSR_CSC_Graph.h" #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -68,8 +69,8 @@ class DistGraph { constexpr static const char* const GRNAME = "dGraph"; using GraphTy = - galois::graphs::LC_CSR_Graph; + galois::graphs::LC_CSR_CSC_Graph; // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -745,6 +746,46 @@ class DistGraph { return IDs; } + ////////////////////////////////////////////////////////////////////////////// + // for in edges + ////////////////////////////////////////////////////////////////////////////// + + //! Construct the transpose graph for the partitioned graph + void ConstructIncomingEdges() { graph.constructIncomingEdges(); } + + /** + * Get the edge data for a particular edge in the graph. 
+ * + * @param ni edge to get the data of + * @param mflag access flag for edge data + * @returns The edge data for the requested edge + */ + typename GraphTy::edge_data_reference + GetInEdgeData(edge_iterator ni, + galois::MethodFlag mflag = galois::MethodFlag::UNPROTECTED) { + return graph.getInEdgeData(ni, mflag); + } + + GraphNode GetInEdgeDest(edge_iterator ni) { return graph.getInEdgeDst(ni); } + + edge_iterator in_edge_begin(GraphNode N) { + return graph.in_edge_begin(N, galois::MethodFlag::UNPROTECTED); + } + + edge_iterator in_edge_end(GraphNode N) { + return graph.in_edge_end(N, galois::MethodFlag::UNPROTECTED); + } + + galois::runtime::iterable> + in_edges(GraphNode N) { + return galois::graphs::internal::make_no_deref_range(in_edge_begin(N), + in_edge_end(N)); + } + + ////////////////////////////////////////////////////////////////////////////// + // end in edges + ////////////////////////////////////////////////////////////////////////////// + protected: /** * Uses a pre-computed prefix sum to determine division of nodes among diff --git a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index e5e3fa4221..22cc10cc11 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -180,7 +180,7 @@ class BufferedGraph { typename std::enable_if::value>::type* = nullptr> void loadEdgeData(std::ifstream& graphFile, uint64_t edgeStart, uint64_t numEdgesToLoad, uint64_t numGlobalNodes, - uint64_t numGlobalEdges) { + uint64_t numGlobalEdges, uint64_t file_data_size) { if (numEdgesToLoad == 0) { return; } @@ -193,30 +193,39 @@ class BufferedGraph { GALOIS_DIE("Failed to allocate memory for edge data buffer."); } - // position after nodes + edges - uint64_t baseReadPosition = (4 + numGlobalNodes) * sizeof(uint64_t) + - (sizeof(uint32_t) * numGlobalEdges); - - // version 1 padding TODO make version agnostic - if (numGlobalEdges % 2) { - baseReadPosition += sizeof(uint32_t); - } - - // jump to first byte of edge data - uint64_t readPosition = - baseReadPosition + (sizeof(EdgeDataType) * edgeStart); - graphFile.seekg(readPosition); - uint64_t numBytesToLoad = numEdgesToLoad * sizeof(EdgeDataType); - uint64_t bytesRead = 0; - - while (numBytesToLoad > 0) { - graphFile.read(((char*)this->edgeDataBuffer) + bytesRead, numBytesToLoad); - size_t numRead = graphFile.gcount(); - numBytesToLoad -= numRead; - bytesRead += numRead; + if (file_data_size == sizeof(EdgeDataType)) { + // position after nodes + edges + uint64_t baseReadPosition = (4 + numGlobalNodes) * sizeof(uint64_t) + + (sizeof(uint32_t) * numGlobalEdges); + + // version 1 padding TODO make version agnostic + if (numGlobalEdges % 2) { + baseReadPosition += sizeof(uint32_t); + } + + // jump to first byte of edge data + uint64_t readPosition = + baseReadPosition + (sizeof(EdgeDataType) * edgeStart); + graphFile.seekg(readPosition); + uint64_t numBytesToLoad = numEdgesToLoad * sizeof(EdgeDataType); + uint64_t bytesRead = 0; + + while (numBytesToLoad > 0) { + graphFile.read(((char*)this->edgeDataBuffer) + bytesRead, + numBytesToLoad); + size_t numRead = graphFile.gcount(); + numBytesToLoad -= numRead; + bytesRead += numRead; + } + + assert(numBytesToLoad == 0); + } else { + // file on disk does not match edge data type: fill in the buffer + // with 0s instead + galois::gInfo("File on disk does not have appropriate edge data to read; " + "filling with 0s"); + memset(edgeDataBuffer, 0, sizeof(EdgeDataType) * numEdgesToLoad); } - - 
assert(numBytesToLoad == 0); } /** @@ -230,7 +239,8 @@ class BufferedGraph { template < typename EdgeType, typename std::enable_if::value>::type* = nullptr> - void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t) { + void loadEdgeData(std::ifstream&, uint64_t, uint64_t, uint64_t, uint64_t, + uint64_t) { // do nothing (edge data is void, i.e. no edge data) } @@ -322,7 +332,7 @@ class BufferedGraph { loadEdgeDest(graphFile, 0, globalEdgeSize, globalSize); // may or may not do something depending on EdgeDataType loadEdgeData(graphFile, 0, globalEdgeSize, globalSize, - globalEdgeSize); + globalEdgeSize, header[1]); graphLoaded = true; graphFile.close(); @@ -350,6 +360,8 @@ class BufferedGraph { } std::ifstream graphFile(filename.c_str()); + uint64_t header[4]; + graphFile.read(((char*)header), sizeof(uint64_t) * 4); globalSize = numGlobalNodes; globalEdgeSize = numGlobalEdges; @@ -364,7 +376,7 @@ class BufferedGraph { // may or may not do something depending on EdgeDataType loadEdgeData(graphFile, edgeStart, numLocalEdges, - numGlobalNodes, numGlobalEdges); + numGlobalNodes, numGlobalEdges, header[1]); graphLoaded = true; graphFile.close(); diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 2da77fb6cb..9509f73a8e 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -50,31 +50,35 @@ namespace graphs { */ template + bool HasOutOfLineLockable = false, typename FileEdgeTy = EdgeTy, + typename NodeIndexTy = uint32_t, typename EdgeIndexTy = uint64_t> class LC_CSR_CSC_Graph : public LC_CSR_Graph { + HasOutOfLineLockable, FileEdgeTy, NodeIndexTy, + EdgeIndexTy> { // typedef to make it easier to read //! Typedef referring to base LC_CSR_Graph - using BaseGraph = LC_CSR_Graph; + using BaseGraph = + LC_CSR_Graph; //! Typedef referring to this class itself using ThisGraph = LC_CSR_CSC_Graph; + UseNumaAlloc, HasOutOfLineLockable, FileEdgeTy, + NodeIndexTy, EdgeIndexTy>; public: //! Graph node typedef - using GraphNode = uint32_t; + using GraphNode = NodeIndexTy; protected: - // retypedefs of base class + // redefinitions of base class typedefs //! large array for edge data using EdgeData = LargeArray; //! large array for edge destinations - using EdgeDst = LargeArray; + using EdgeDst = LargeArray; //! large array for edge index data - using EdgeIndData = LargeArray; + using EdgeIndData = LargeArray; public: //! iterator for edges @@ -85,7 +89,7 @@ class LC_CSR_CSC_Graph protected: //! edge index data for the reverse edges - EdgeIndData inEdgeIndData; + EdgeIndData in_edge_ind_data_; //! edge destination data for the reverse edges EdgeDst inEdgeDst; //! Edge data of inedges can be a value copy of the outedges (i.e. 
in and @@ -162,9 +166,9 @@ class LC_CSR_CSC_Graph } // copy over the new tranposed edge index data - inEdgeIndData.allocateInterleaved(BaseGraph::numNodes); + in_edge_ind_data_.allocateInterleaved(BaseGraph::numNodes); galois::do_all(galois::iterate(UINT64_C(0), BaseGraph::numNodes), - [&](uint64_t n) { inEdgeIndData[n] = dataBuffer[n]; }); + [&](uint64_t n) { in_edge_ind_data_[n] = dataBuffer[n]; }); } /** @@ -179,8 +183,9 @@ class LC_CSR_CSC_Graph // saving an edge for a node if (BaseGraph::numNodes >= 1) { dataBuffer[0] = 0; - galois::do_all(galois::iterate(UINT64_C(1), BaseGraph::numNodes), - [&](uint64_t n) { dataBuffer[n] = inEdgeIndData[n - 1]; }); + galois::do_all( + galois::iterate(UINT64_C(1), BaseGraph::numNodes), + [&](uint64_t n) { dataBuffer[n] = in_edge_ind_data_[n - 1]; }); } // allocate edge dests and data @@ -212,13 +217,6 @@ class LC_CSR_CSC_Graph } public: - //! default constructor - LC_CSR_CSC_Graph() = default; - //! default move constructor - LC_CSR_CSC_Graph(LC_CSR_CSC_Graph&& rhs) = default; - //! default = operator - LC_CSR_CSC_Graph& operator=(LC_CSR_CSC_Graph&&) = default; - ///////////////////////////////////////////////////////////////////////////// // Construction functions ///////////////////////////////////////////////////////////////////////////// @@ -254,7 +252,7 @@ class LC_CSR_CSC_Graph * @returns Iterator to first in edge of node N */ edge_iterator in_raw_begin(GraphNode N) const { - return edge_iterator((N == 0) ? 0 : inEdgeIndData[N - 1]); + return edge_iterator((N == 0) ? 0 : in_edge_ind_data_[N - 1]); } /** @@ -265,7 +263,7 @@ class LC_CSR_CSC_Graph * node N+1) */ edge_iterator in_raw_end(GraphNode N) const { - return edge_iterator(inEdgeIndData[N]); + return edge_iterator(in_edge_ind_data_[N]); } /** @@ -389,7 +387,7 @@ class LC_CSR_CSC_Graph /** * @returns the prefix sum of in-edges */ - const EdgeIndData& getInEdgePrefixSum() const { return inEdgeIndData; } + const EdgeIndData& getInEdgePrefixSum() const { return in_edge_ind_data_; } ///////////////////////////////////////////////////////////////////////////// // Utility diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index afd0b52abd..9f849d0efc 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -344,8 +344,9 @@ class LC_CSR_Graph : uint64_t operator[](uint64_t n) { return *(edge_end(n)); } template - LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, EdgeNumFnTy edgeNum, - EdgeDstFnTy _edgeDst, EdgeDataFnTy _edgeData) + LC_CSR_Graph(NodeIndexTy _numNodes, EdgeIndexTy _numEdges, + EdgeNumFnTy edgeNum, EdgeDstFnTy _edgeDst, + EdgeDataFnTy _edgeData) : numNodes(_numNodes), numEdges(_numEdges) { if (UseNumaAlloc) { //! 
[numaallocex] @@ -717,8 +718,8 @@ class LC_CSR_Graph : } template - void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, EdgeIndexTy e_new, - EdgeIndexTy e, + void edgeDataCopy(EdgeData& edgeData_new, EdgeData& edgeData, + EdgeIndexTy e_new, EdgeIndexTy e, typename std::enable_if::type* = 0) { edgeData_new[e_new] = edgeData[e]; } @@ -815,7 +816,7 @@ class LC_CSR_Graph : std::vector& prefix_sum, std::vector>& edges_id, std::vector>& edges_data) { - //allocateFrom(numNodes, numEdges); + // allocateFrom(numNodes, numEdges); /* * Deallocate if reusing the graph */ @@ -823,24 +824,25 @@ class LC_CSR_Graph : constructNodes(); galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), - [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); + [&](NodeIndexTy n) { edgeIndData[n] = prefix_sum[n]; }); galois::do_all(galois::iterate((NodeIndexTy)0, numNodes), - [&](NodeIndexTy n) { - if (n == 0) { - if (edgeIndData[n] > 0) { - std::copy(edges_id[n].begin(), edges_id[n].end(), edgeDst.begin()); - std::copy(edges_data[n].begin(), edges_data[n].end(), - edgeData.begin()); - } - } else { - if (edgeIndData[n] - edgeIndData[n - 1] > 0) { - std::copy(edges_id[n].begin(), edges_id[n].end(), - edgeDst.begin() + edgeIndData[n - 1]); - std::copy(edges_data[n].begin(), edges_data[n].end(), - edgeData.begin() + edgeIndData[n - 1]); - } - } - }); + [&](NodeIndexTy n) { + if (n == 0) { + if (edgeIndData[n] > 0) { + std::copy(edges_id[n].begin(), edges_id[n].end(), + edgeDst.begin()); + std::copy(edges_data[n].begin(), edges_data[n].end(), + edgeData.begin()); + } + } else { + if (edgeIndData[n] - edgeIndData[n - 1] > 0) { + std::copy(edges_id[n].begin(), edges_id[n].end(), + edgeDst.begin() + edgeIndData[n - 1]); + std::copy(edges_data[n].begin(), edges_data[n].end(), + edgeData.begin() + edgeIndData[n - 1]); + } + } + }); initializeLocalRanges(); } @@ -874,10 +876,10 @@ class LC_CSR_Graph : initializeLocalRanges(); } -//////////////////////////////////////////////////////////////////////////////// -// Warning: the below code is NOT compatible with NodeIndexTy/EdgeIndexTy; -// do NOT use with them -//////////////////////////////////////////////////////////////////////////////// + //////////////////////////////////////////////////////////////////////////////// + // Warning: the below code is NOT compatible with NodeIndexTy/EdgeIndexTy; + // do NOT use with them + //////////////////////////////////////////////////////////////////////////////// /** * Reads the GR files directly into in-memory @@ -1030,10 +1032,9 @@ class LC_CSR_Graph : this->setLocalRange(*r.first, *r.second); }); } -//////////////////////////////////////////////////////////////////////////////// -// End warning section -//////////////////////////////////////////////////////////////////////////////// - + //////////////////////////////////////////////////////////////////////////////// + // End warning section + //////////////////////////////////////////////////////////////////////////////// }; } // namespace galois::graphs diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 071b33aeac..3b5a499a57 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -33,7 +33,7 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! 
XXX class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; + using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned @@ -86,20 +86,61 @@ class GNNGraph { return partitioned_graph_->masterNodesRange().end(); } + ////////////////////////////////////////////////////////////////////////////// + // out edges + ////////////////////////////////////////////////////////////////////////////// // All following functions take a local node id - EdgeIterator EdgeBegin(GraphNode n) const { + EdgeIterator edge_begin(GraphNode n) const { return partitioned_graph_->edge_begin(n); }; - EdgeIterator EdgeEnd(GraphNode n) const { + EdgeIterator edge_end(GraphNode n) const { return partitioned_graph_->edge_end(n); }; - GraphNode EdgeDestination(EdgeIterator ei) const { + GraphNode GetEdgeDest(EdgeIterator ei) const { return partitioned_graph_->getEdgeDst(ei); }; - GNNFloat NormFactor(GraphNode n) const { return norm_factors_[n]; } + char IsEdgeSampled(EdgeIterator ei) const { + return partitioned_graph_->getEdgeData(ei); + }; + //! Set the flag on the edge to 1; makes it sampled + void MakeEdgeSampled(EdgeIterator ei) { + partitioned_graph_->getEdgeData(ei) = 1; + }; + //! Set the flag on the edge to 0; makes it not sampled + void MakeEdgeUnsampled(EdgeIterator ei) { + partitioned_graph_->getEdgeData(ei) = 0; + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode N) { + return partitioned_graph_->edges(N); + } + ////////////////////////////////////////////////////////////////////////////// + // in edges + ////////////////////////////////////////////////////////////////////////////// + EdgeIterator in_edge_begin(GraphNode n) const { + return partitioned_graph_->in_edge_begin(n); + } + EdgeIterator in_edge_end(GraphNode n) const { + return partitioned_graph_->in_edge_end(n); + } + GraphNode GetInEdgeDest(EdgeIterator ei) const { + return partitioned_graph_->GetInEdgeDest(ei); + }; + char IsInEdgeSampled(EdgeIterator ei) const { + return partitioned_graph_->GetInEdgeData(ei); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode N) { + return partitioned_graph_->in_edges(N); + } + ////////////////////////////////////////////////////////////////////////////// + + GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! 
inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat DegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetDegreeNorm(GraphNode n) const { return degree_norm_[n]; } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 89cdca94e9..0c10f7a023 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -18,13 +18,13 @@ LoadPartition(const std::string& input_directory, // load partition switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); @@ -68,6 +68,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // load partition partitioned_graph_ = LoadPartition(input_directory_, dataset_name, partition_scheme); + // reverse edges + partitioned_graph_->ConstructIncomingEdges(); // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); @@ -253,8 +255,8 @@ void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, for (size_t current_depth = 0; current_depth < walk_depth; current_depth++) { // pick random edge, mark sampled, swap roots - EdgeIterator first_edge = EdgeBegin(root); - size_t num_edges = std::distance(first_edge, EdgeEnd(root)); + EdgeIterator first_edge = edge_begin(root); + size_t num_edges = std::distance(first_edge, edge_end(root)); if (num_edges == 0) { break; } @@ -267,7 +269,7 @@ void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, long int rand_num; lrand48_r(&seed_struct, &rand_num); EdgeIterator selected_edge = first_edge + (rand_num % num_edges); - size_t candidate_dest = EdgeDestination(selected_edge); + size_t candidate_dest = GetEdgeDest(selected_edge); // TODO(loc) another possibility is to just pick it anyways regardless // but don't mark it as sampled, though this would lead to disconnected @@ -609,9 +611,9 @@ void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, // TODO(loc) make this work in a distributed setting; assuming // whole graph is present on single host at the moment - for (EdgeIterator e = EdgeBegin(local_id); e != EdgeEnd(local_id); + for (EdgeIterator e = edge_begin(local_id); e != edge_end(local_id); e++) { - size_t dest = EdgeDestination(e); + size_t dest = GetEdgeDest(e); if (is_sampled && is_inductive) { if (!IsValidForPhase(dest, GNNPhase::kTrain) || !IsInSampledGraph(dest)) { diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 6f86cf1395..282042a805 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -317,7 +317,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.NormFactor(src); + source_norm = graph_.GetNormFactor(src); } // init 
to self @@ -334,8 +334,8 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { - size_t dst = graph_.EdgeDestination(e); + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { + size_t dst = graph_.GetEdgeDest(e); graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { @@ -356,7 +356,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.NormFactor(dst); + GNNFloat norm_scale = source_norm * graph_.GetNormFactor(dst); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8fde856ac8..bd6b84469f 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -412,13 +412,13 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.DegreeNorm(src); + source_norm = graph_.GetDegreeNorm(src); } // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.EdgeBegin(src); e != graph_.EdgeEnd(src); e++) { + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { graphs::bitset_graph_aggregate.set(src); - size_t dst = graph_.EdgeDestination(e); + size_t dst = graph_.GetEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { if (IsInductiveLayer()) { @@ -442,7 +442,7 @@ void galois::SAGELayer::AggregateAllCPU( if (!is_backward) { norm_scale = source_norm; } else { - norm_scale = graph_.DegreeNorm(dst); + norm_scale = graph_.GetDegreeNorm(dst); } galois::VectorMulAdd( diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b9ef634c53..91835cfc07 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -81,6 +81,10 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(sample-test sample-test.cpp) target_link_libraries(sample-test galois_gnn) add_test(NAME sample-test COMMAND sample-test) + + add_executable(sample-bit-test sample-bit-test.cpp) + target_link_libraries(sample-bit-test galois_gnn) + add_test(NAME sample-bit-test COMMAND sample-bit-test) else() add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index eac1f89c84..d95931a798 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -14,11 +14,11 @@ int main() { // print edges for sanity for (size_t node = 0; node < test_graph->size(); node++) { - for (auto e = test_graph->EdgeBegin(node); e != test_graph->EdgeEnd(node); + for (auto e = test_graph->edge_begin(node); e != test_graph->edge_end(node); e++) { galois::gPrint(test_graph->host_prefix(), "Edge ", test_graph->GetGID(node), " ", - test_graph->GetGID(test_graph->EdgeDestination(e)), "\n"); + test_graph->GetGID(test_graph->GetEdgeDest(e)), "\n"); } } for (auto own = test_graph->begin_owned(); own != test_graph->end_owned(); @@ -210,11 +210,11 @@ int main() { "tester", galois::graphs::GNNPartitionScheme::kCVC, true); // print edges for sanity for (size_t node = 0; node < test_graph_2->size(); node++) { - for (auto e = test_graph_2->EdgeBegin(node); - e != 
test_graph_2->EdgeEnd(node); e++) { - galois::gPrint( - test_graph_2->host_prefix(), "Edge ", test_graph_2->GetGID(node), " ", - test_graph_2->GetGID(test_graph_2->EdgeDestination(e)), "\n"); + for (auto e = test_graph_2->edge_begin(node); + e != test_graph_2->edge_end(node); e++) { + galois::gPrint(test_graph_2->host_prefix(), "Edge ", + test_graph_2->GetGID(node), " ", + test_graph_2->GetGID(test_graph_2->GetEdgeDest(e)), "\n"); } } for (auto own = test_graph_2->begin_owned(); own != test_graph_2->end_owned(); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp new file mode 100644 index 0000000000..cc1226e8bf --- /dev/null +++ b/libgnn/test/sample-bit-test.cpp @@ -0,0 +1,88 @@ +//! @file sample-bit-test.cpp +//! Checks to see if edge sample bit is set correctly. + +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + + galois::graphs::GNNGraph graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + + // first, assert all edges are not sampled (should come with all 0s) + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + } + for (auto ei : graph.in_edges(node)) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + } + } + + // make all edges sampled; it should set the in-edges as well + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei); + } + } + + // all edges (including ins) should be sampled + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + } + for (auto ei : graph.in_edges(node)) { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + } + } + + // clear sample bits for odd numbers + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeUnsampled(ei); + } + } + } + + // do another check + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + } else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + } + } + + // in edges for this node: if destination (i.e., source) is + // odd, then it should not be sampled + for (auto ei : graph.in_edges(node)) { + if ((graph.GetInEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + } + } + } + + // print edges for a quick lookover if run manually + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + galois::gPrint("Out edge ", node, " ", graph.GetEdgeDest(ei), "\n"); + } + for (auto ei : graph.in_edges(node)) { + galois::gPrint("In edge to ", node, " from ", graph.GetInEdgeDest(ei), + "\n"); + } + } + + return 0; +} From 386fbb83682be6232da9c980a459ad6fac0111f1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 14:14:57 -0500 Subject: [PATCH 508/660] CSR_CSC always creates in-to-out mapping Before, in to out mapping in CSR/CSC only created if edge data exists. Now it is always created in case user wants to create edge-data outside of the graph object (e.g., in GNNs). 
Adds the function to get access to this mapping as well. --- .../include/galois/graphs/DistributedGraph.h | 5 ++++ .../include/galois/graphs/LC_CSR_CSC_Graph.h | 27 ++++++++++--------- 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 1c56302b93..e13f71e4d2 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -782,6 +782,11 @@ class DistGraph { in_edge_end(N)); } + //! Return corresponding out-edge index for an in-edge + size_t InEdgeToOutEdge(edge_iterator ni) const { + return graph.InEdgeToOutEdge(ni); + } + ////////////////////////////////////////////////////////////////////////////// // end in edges ////////////////////////////////////////////////////////////////////////////// diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 9509f73a8e..2f0b9e88de 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -99,6 +99,7 @@ class LC_CSR_CSC_Graph typename std::conditional::type; //! The data for the reverse edges EdgeDataRep inEdgeData; + EdgeIndData in_edge_to_out_edge_; //! redefinition of the edge sort iterator in LC_CSR_Graph using edge_sort_iterator = @@ -127,19 +128,11 @@ class LC_CSR_CSC_Graph BaseGraph::edgeDataCopy(inEdgeData, BaseGraph::edgeData, e_new, e); } - /** - * Save a pointer to an outedge (i.e. map an in-edge to an out-edge). Done - * to share edge data. - * - * @param e_new position of out-edge to save - * @param e position of in-edge - */ + //! Do nothing; getting edge data will be done via pointer template ::type* = nullptr> - void createEdgeData(const uint64_t e_new, const uint64_t e) { - if (!std::is_void::value) { - inEdgeData[e_new] = e; - } + void createEdgeData(const uint64_t, const uint64_t) { + // do nothing } /** @@ -194,6 +187,7 @@ class LC_CSR_CSC_Graph if (!std::is_void::value) { inEdgeData.allocateInterleaved(BaseGraph::numEdges); } + in_edge_to_out_edge_.allocateInterleaved(BaseGraph::numEdges); galois::do_all( galois::iterate(UINT64_C(0), BaseGraph::numNodes), [&](uint64_t src) { @@ -211,6 +205,7 @@ class LC_CSR_CSC_Graph inEdgeDst[e_new] = src; // edge data to "new" array createEdgeData(e_new, e); + in_edge_to_out_edge_[e_new] = e; e++; } }); @@ -365,7 +360,7 @@ class LC_CSR_CSC_Graph typename std::enable_if::type* = nullptr> edge_data_reference getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) const { - return BaseGraph::edgeData[inEdgeData[*ni]]; + return BaseGraph::edgeData[in_edge_to_out_edge_[*ni]]; } /** @@ -381,7 +376,13 @@ class LC_CSR_CSC_Graph typename std::enable_if::type* = nullptr> edge_data_reference getInEdgeData(edge_iterator ni, MethodFlag = MethodFlag::UNPROTECTED) { - return BaseGraph::edgeData[inEdgeData[*ni]]; + return BaseGraph::edgeData[in_edge_to_out_edge_[*ni]]; + } + + //! Returns corresponding index for the out-edge corresponding to + //! an in-edge. + size_t InEdgeToOutEdge(edge_iterator ni) const { + return in_edge_to_out_edge_[*ni]; } /** From fbfe895ed217a4cbf894681a82f7c2fe549c5438 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 14:23:39 -0500 Subject: [PATCH 509/660] Separate edge sample data; data also vector now Edge sampling data separated from partitioned graph. Also made into a vector because each layer will have different sampling status. 
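A rough sketch of the per-layer behavior (assuming the accessors changed in this patch;
it mirrors what sample-bit-test.cpp checks): setting an edge's flag for one layer leaves
the flags of the other layers untouched.

    // Sketch only, not part of the diff: per-layer flags on one edge are independent.
    #include "galois/Logging.h"
    #include "galois/graphs/GNNGraph.h"

    int main() {
      galois::DistMemSys G; // runtime setup, as in the unit tests
      galois::graphs::GNNGraph graph(
          "tester", galois::graphs::GNNPartitionScheme::kOEC, true);
      graph.InitializeEdgeData(2); // one sample flag per edge per layer
      for (size_t node = 0; node < graph.size(); ++node) {
        for (auto ei : graph.edges(node)) {
          graph.MakeEdgeSampled(ei, 0);                   // layer 0 turns it on
          GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0));
          GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); // layer 1 unaffected
        }
      }
      return 0;
    }
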
--- libgnn/include/galois/graphs/GNNGraph.h | 36 ++++++++++++++++++------- libgnn/src/graphs/GNNGraph.cpp | 6 ++--- libgnn/test/sample-bit-test.cpp | 21 ++++++++------- 3 files changed, 40 insertions(+), 23 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3b5a499a57..72dce17185 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -33,7 +33,7 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; //! XXX class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; + using GNNDistGraph = galois::graphs::DistGraph; using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned @@ -87,8 +87,19 @@ class GNNGraph { } ////////////////////////////////////////////////////////////////////////////// - // out edges + // Edges ////////////////////////////////////////////////////////////////////////////// + + void InitializeEdgeData() { InitializeEdgeData(1); } + + void InitializeEdgeData(size_t num_layers) { + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); + } + + ////////////////////////////////////////////////////////////////////////////// + // Out Edges + ////////////////////////////////////////////////////////////////////////////// + // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { return partitioned_graph_->edge_begin(n); @@ -99,22 +110,23 @@ class GNNGraph { GraphNode GetEdgeDest(EdgeIterator ei) const { return partitioned_graph_->getEdgeDst(ei); }; - char IsEdgeSampled(EdgeIterator ei) const { - return partitioned_graph_->getEdgeData(ei); + bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { + return edge_sample_status_[*ei][layer_num]; }; //! Set the flag on the edge to 1; makes it sampled - void MakeEdgeSampled(EdgeIterator ei) { - partitioned_graph_->getEdgeData(ei) = 1; + void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[*ei][layer_num] = 1; }; //! Set the flag on the edge to 0; makes it not sampled - void MakeEdgeUnsampled(EdgeIterator ei) { - partitioned_graph_->getEdgeData(ei) = 0; + void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[*ei][layer_num] = 0; }; galois::runtime::iterable< galois::NoDerefIterator> edges(GraphNode N) { return partitioned_graph_->edges(N); } + ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// @@ -127,8 +139,9 @@ class GNNGraph { GraphNode GetInEdgeDest(EdgeIterator ei) const { return partitioned_graph_->GetInEdgeDest(ei); }; - char IsInEdgeSampled(EdgeIterator ei) const { - return partitioned_graph_->GetInEdgeData(ei); + bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { + return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] + [layer_num]; }; galois::runtime::iterable< galois::NoDerefIterator> @@ -332,6 +345,9 @@ class GNNGraph { std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph std::vector local_node_features_; + //! Sample data on edges: each edge gets a small bitset to mark + //! if it's been sampled for a particular layer + galois::LargeArray> edge_sample_status_; // TODO maybe revisit this and use an actual bitset //! 
Bitset indicating which nodes are training nodes diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0c10f7a023..073c616127 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -18,13 +18,13 @@ LoadPartition(const std::string& input_directory, // load partition switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( + return galois::cuspPartitionGraph( input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index cc1226e8bf..1ad2d50196 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,31 +15,32 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + graph.InitializeEdgeData(3); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); } for (auto ei : graph.in_edges(node)) { - GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); } } // make all edges sampled; it should set the in-edges as well for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - graph.MakeEdgeSampled(ei); + graph.MakeEdgeSampled(ei, 0); } } // all edges (including ins) should be sampled for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { - GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } for (auto ei : graph.in_edges(node)) { - GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } } @@ -47,7 +48,7 @@ int main() { for (size_t node = 0; node < graph.size(); node++) { if (node % 2 == 1) { for (auto ei : graph.edges(node)) { - graph.MakeEdgeUnsampled(ei); + graph.MakeEdgeUnsampled(ei, 0); } } } @@ -56,9 +57,9 @@ int main() { for (size_t node = 0; node < graph.size(); node++) { for (auto ei : graph.edges(node)) { if (node % 2 == 1) { - GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); } else { - GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } } @@ -66,9 +67,9 @@ int main() { // odd, then it should not be sampled for (auto ei : graph.in_edges(node)) { if ((graph.GetInEdgeDest(ei) % 2) == 1) { - GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); } else { - GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } } } From 7891816ad0516f4e399c0f97d3ce81cb534fe3cd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 15 Apr 2021 15:39:38 -0500 Subject: [PATCH 510/660] GNN set sample edge via in-edge; more tests Functions for setting an edge bit via in-edge + tests for it along with tests for the 
different layers. --- libgnn/include/galois/graphs/GNNGraph.h | 8 +++ libgnn/test/sample-bit-test.cpp | 76 +++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 72dce17185..d5b5ee05dd 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -143,6 +143,14 @@ class GNNGraph { return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] [layer_num]; }; + //! Set the flag on the edge to 1; makes it sampled + void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; + }; + //! Set the flag on the edge to 0; makes it not sampled + void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; + }; galois::runtime::iterable< galois::NoDerefIterator> in_edges(GraphNode N) { diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 1ad2d50196..66d739a6d7 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -61,6 +61,8 @@ int main() { } else { GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); } + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 2)); } // in edges for this node: if destination (i.e., source) is @@ -71,6 +73,80 @@ int main() { } else { GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); } + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 2)); + } + } + + // odd layer 1, even layer 2 + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei, 1); + } + } else { + for (auto ei : graph.edges(node)) { + graph.MakeEdgeSampled(ei, 2); + } + } + } + + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 2)); + } else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 2)); + } + } + + // in edges for this node: if destination (i.e., source) is + // odd, then it should not be sampled + for (auto ei : graph.in_edges(node)) { + if ((graph.GetInEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 2)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 0)); + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 2)); + } + } + } + + // odd layer 1, even layer 2; set in edge + for (size_t node = 0; node < graph.size(); node++) { + if (node % 2 == 1) { + for (auto ei : graph.in_edges(node)) { + graph.MakeInEdgeUnsampled(ei, 1); + } + } else { + for (auto ei : graph.in_edges(node)) { + graph.MakeInEdgeSampled(ei, 1); + } + } + } + + for (size_t node = 0; node < graph.size(); node++) { + for (auto ei : graph.in_edges(node)) { + if (node % 2 == 1) { + GALOIS_LOG_ASSERT(!graph.IsInEdgeSampled(ei, 1)); + } else { + GALOIS_LOG_ASSERT(graph.IsInEdgeSampled(ei, 1)); + } + } + + for (auto ei : graph.edges(node)) { + if ((graph.GetEdgeDest(ei) % 2) == 1) { + GALOIS_LOG_ASSERT(!graph.IsEdgeSampled(ei, 1)); + } 
else { + GALOIS_LOG_ASSERT(graph.IsEdgeSampled(ei, 1)); + } } } From 4065206d38bf3181a176a6466884520b748100e6 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Apr 2021 18:04:30 -0500 Subject: [PATCH 511/660] Basic edge sampling for distributed SAGE 1) Edges sampled probabilistically across hosts, but out-degree used when in-degree is supposed to be used. 2) GCN not supported yet. 3) Norm factors are wrong (requires another comm step). 4) Bitset has to be completely set because otherwise it segfaults for some reason. 5) Memory usage not reduced: still iterating over full graph. Next step is to actually create the subgraph. --- libgnn/include/galois/graphs/GNNGraph.h | 19 +++ .../graphs/GraphAggregationSyncStructures.h | 40 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 23 +++- libgnn/src/graphs/GNNGraph.cpp | 70 ++++++++++ libgnn/src/layers/GraphConvolutionalLayer.cpp | 3 + libgnn/src/layers/SAGELayer.cpp | 123 +++++++++++++----- 6 files changed, 240 insertions(+), 38 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index d5b5ee05dd..853a96dc0d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -156,6 +156,16 @@ class GNNGraph { in_edges(GraphNode N) { return partitioned_graph_->in_edges(N); } + + ////////////////////////////////////////////////////////////////////////////// + // neighborhood sampling + ////////////////////////////////////////////////////////////////////////////// + + //! Set seed nodes, i.e., nodes that are being predicted on + void SetupNeighborhoodSample(); + //! Sample neighbors of nodes that are marked as ready for sampling + void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } @@ -247,10 +257,12 @@ class GNNGraph { //! graph bool IsInSampledGraph(const NodeIterator& ni) const { // TODO(loc) GPU + assert(*ni < size()); return partitioned_graph_->getData(*ni); } bool IsInSampledGraph(size_t node_id) const { // TODO(loc) GPU + assert(node_id < size()); return partitioned_graph_->getData(node_id); } @@ -353,10 +365,17 @@ class GNNGraph { std::vector local_ground_truth_labels_; //! Feature vectors for nodes in partitioned graph std::vector local_node_features_; + + ////////////////////////////////////////////////////////////////////////////// + //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; + galois::DynamicBitSet new_sampled_nodes_; + + ////////////////////////////////////////////////////////////////////////////// + // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes std::vector local_training_mask_; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 8e3db38096..7759c26dca 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -17,6 +17,46 @@ extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; #endif +struct SampleFlagSync { + using ValTy = char; + + //! return a vector of floats to sync + static ValTy extract(uint32_t, char& i) { return i; } + + //! reduction is addition in this case; add received vector to + //! 
own vector + static bool reduce(uint32_t, char& i, ValTy y) { + if (y > i) { + i = y; + assert(i == 1); + return true; + } else { + return false; + } + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t, char&) {} + + //! element wise set + static void setVal(uint32_t, char& i, ValTy y) { i = y; } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + struct GNNSumAggregate { using ValTy = galois::gstl::Vector; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 4942076b23..42bcfc3b08 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -99,6 +99,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } + // output layer not included; it will never involve sampling + graph_->InitializeEdgeData(gnn_layers_.size()); // create the output layer GNNLayerDimensions output_dims = { @@ -158,6 +160,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (config_.inductive_training_) { graph_->CalculateSpecialNormFactor(false, true); } + galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); @@ -165,11 +168,25 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + if (config_.do_sampling()) { - // subgraph sample every epoch - graph_->GraphSAINTSample(); - graph_->CalculateSpecialNormFactor(true, config_.inductive_training_); + graph_->SetupNeighborhoodSample(); + size_t num_sampled_layers = 0; + + // work backwards on GCN/SAGE layers + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + graph_->SampleEdges((*back_iter)->layer_number(), 5); + num_sampled_layers++; + } + } + galois::gDebug("Number of sampled layers is ", num_sampled_layers); } + const PointerWithSize predictions = DoInference(); // have to get accuracy here because gradient prop destroys the predictions // matrix diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 073c616127..c12288c950 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -819,6 +819,76 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( return global_f1_micro_score; } +void galois::graphs::GNNGraph::SetupNeighborhoodSample() { + new_sampled_nodes_.resize(size()); + new_sampled_nodes_.reset(); + + // for now, if training node, it goes into seed node + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsValidForPhase(*x, GNNPhase::kTrain)) { + 
SetSampledNode(*x); + } else { + UnsetSampledNode(*x); + } + }); + // clear all sampled edges + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), + [&](size_t edge_id) { + std::fill(edge_sample_status_[edge_id].begin(), + edge_sample_status_[edge_id].end(), 0); + }); +} + +void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, + size_t num_to_sample) { + galois::GAccumulator sampled; + galois::GAccumulator total; + sampled.reset(); + total.reset(); + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + // only operate on if sampled + if (partitioned_graph_->getData(*x)) { + // chance of not uniformly choosing an edge of this node num_to_sample + // times (degree norm is 1 / degree) + // XXX in-degree prob, not out degree + double probability_of_reject = + std::pow(1 - GetDegreeNorm(*x), num_to_sample); + // loop through in-edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->in_edges(*x)) { + if (sample_rng_.DoBernoulli(probability_of_reject)) { + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeInEdgeSampled(edge_iter, sample_layer_num); + new_sampled_nodes_.set( + partitioned_graph_->GetInEdgeDest(edge_iter)); + sampled += 1; + } + total += 1; + } + } + }, + galois::steal(), galois::loopname("NeighborhoodSample")); + + galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", + total.reduce(), "\n"); + + std::vector new_nodes = new_sampled_nodes_.getOffsets(); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(new_nodes), + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::loopname("NeighborhoodSampleSet")); + + // XXX(loc) bitset; can readAny be weaker? 
+ sync_substrate_->sync("SampleSync"); +} + +//////////////////////////////////////////////////////////////////////////////// + #ifdef GALOIS_ENABLE_GPU void galois::graphs::GNNGraph::InitGPUMemory() { // create int casted CSR diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 282042a805..b5a538d314 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -308,6 +308,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } if (IsSampledLayer()) { + // XXX(loc) + GALOIS_LOG_WARN( + "Edge sampling not yet implemented for GCN; only SAGE"); // check if node is part of sampled graph; ignore after 0'ing if not // sampled if (!graph_.IsInSampledGraph(src)) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index bd6b84469f..8a01c8fe1d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -384,11 +384,16 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { + size_t num_nodes = graph_.size(); galois::do_all( galois::iterate(static_cast(0), num_nodes), [&](size_t src) { + // TODO(loc) this is currently a hack: the sync substrate blows + // up if not the entire bitset is set for sync call like in + // edge sampling + graphs::bitset_graph_aggregate.set(src); size_t index_to_src_feature = src * column_length; // zero out src feature first for (size_t i = 0; i < column_length; i++) { @@ -403,10 +408,10 @@ void galois::SAGELayer::AggregateAllCPU( } if (IsSampledLayer()) { - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (!graph_.IsInSampledGraph(src)) + // check if node is part of sampled graph + if (!graph_.IsInSampledGraph(src)) { return; + } } } @@ -415,46 +420,94 @@ void galois::SAGELayer::AggregateAllCPU( source_norm = graph_.GetDegreeNorm(src); } - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(src); - size_t dst = graph_.GetEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; + if (!is_backward) { + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); + e++) { + // graphs::bitset_graph_aggregate.set(src); + size_t dst = graph_.GetEdgeDest(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + if (!graph_.IsEdgeSampled(e, layer_number_)) { + continue; + } + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } } - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; + size_t index_to_dst_feature = dst * column_length; + + if (!config_.disable_normalization) { + GNNFloat norm_scale; + if (!is_backward) { + norm_scale = source_norm; + } else { + norm_scale = graph_.GetDegreeNorm(dst); + } + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + 
&aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); } } + } else { + // loop through all destinations to grab the feature to aggregate + for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); + e++) { + // graphs::bitset_graph_aggregate.set(src); + size_t dst = graph_.GetInEdgeDest(e); + + if (layer_phase_ == GNNPhase::kTrain) { + if (IsInductiveLayer()) { + // if inductive, all non-training nodes do not exist + if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + return; + } + + if (IsSampledLayer()) { + if (!graph_.IsInEdgeSampled(e, layer_number_)) { + continue; + } + // ignore non-sampled nodes + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraph(dst)) + continue; + } + } - size_t index_to_dst_feature = dst * column_length; + size_t index_to_dst_feature = dst * column_length; - if (!config_.disable_normalization) { - GNNFloat norm_scale; - if (!is_backward) { - norm_scale = source_norm; + if (!config_.disable_normalization) { + GNNFloat norm_scale = graph_.GetDegreeNorm(dst); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); } else { - norm_scale = graph_.GetDegreeNorm(dst); + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); } - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); } } }, From 51cab969c403f89671dbc4522cba6f0e7c62f056 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Apr 2021 18:42:12 -0500 Subject: [PATCH 512/660] CSR/CSC Graph: manual construction functions Allows a user to manually construct the the in-edges rather than calling into a function to do it. --- .../include/galois/graphs/LC_CSR_CSC_Graph.h | 42 +++++++++++++++---- 1 file changed, 35 insertions(+), 7 deletions(-) diff --git a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h index 2f0b9e88de..09224296a3 100644 --- a/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_CSC_Graph.h @@ -91,7 +91,7 @@ class LC_CSR_CSC_Graph //! edge index data for the reverse edges EdgeIndData in_edge_ind_data_; //! edge destination data for the reverse edges - EdgeDst inEdgeDst; + EdgeDst in_edge_dst_; //! Edge data of inedges can be a value copy of the outedges (i.e. in and //! out edges have separate edge values) or inedges can refer to the same //! data as its corresponding outedge; this is what this typedef is for @@ -108,12 +108,12 @@ class LC_CSR_CSC_Graph //! beginning iterator to an edge sorter for in-edges edge_sort_iterator in_edge_sort_begin(GraphNode N) { - return edge_sort_iterator(*in_raw_begin(N), &inEdgeDst, &inEdgeData); + return edge_sort_iterator(*in_raw_begin(N), &in_edge_dst_, &inEdgeData); } //! 
ending iterator to an edge sorter for in-edges edge_sort_iterator in_edge_sort_end(GraphNode N) { - return edge_sort_iterator(*in_raw_end(N), &inEdgeDst, &inEdgeData); + return edge_sort_iterator(*in_raw_end(N), &in_edge_dst_, &inEdgeData); } /** @@ -182,7 +182,7 @@ class LC_CSR_CSC_Graph } // allocate edge dests and data - inEdgeDst.allocateInterleaved(BaseGraph::numEdges); + in_edge_dst_.allocateInterleaved(BaseGraph::numEdges); if (!std::is_void::value) { inEdgeData.allocateInterleaved(BaseGraph::numEdges); @@ -202,7 +202,7 @@ class LC_CSR_CSC_Graph // location to save edge auto e_new = __sync_fetch_and_add(&(dataBuffer[dst]), 1); // save src as destination - inEdgeDst[e_new] = src; + in_edge_dst_[e_new] = src; // edge data to "new" array createEdgeData(e_new, e); in_edge_to_out_edge_[e_new] = e; @@ -212,6 +212,34 @@ class LC_CSR_CSC_Graph } public: + ///////////////////////////////////////////////////////////////////////////// + // Manual construction functions + ///////////////////////////////////////////////////////////////////////////// + + // no edge data support at the moment for these functions because not required + // for the current use case + + //! Reallocate memory for the CSC part of the graph + void CSCAllocate() { + // assumes nodes and edges set from CSR version of this call + in_edge_dst_.deallocate(); + in_edge_ind_data_.deallocate(); + + if (UseNumaAlloc) { + in_edge_ind_data_.allocateBlocked(BaseGraph::numNodes); + in_edge_dst_.allocateBlocked(BaseGraph::numEdges); + } else { + in_edge_ind_data_.allocateInterleaved(BaseGraph::numNodes); + in_edge_dst_.allocateInterleaved(BaseGraph::numEdges); + } + } + //! Construct the in edge for some edge index by setting the destination + void ConstructInEdge(EdgeIndexTy e, NodeIndexTy dst) { + in_edge_dst_[e] = dst; + } + //! In-edge index setting + void FixEndInEdge(NodeIndexTy n, EdgeIndexTy e) { in_edge_ind_data_[n] = e; } + ///////////////////////////////////////////////////////////////////////////// // Construction functions ///////////////////////////////////////////////////////////////////////////// @@ -274,7 +302,7 @@ class LC_CSR_CSC_Graph if (!HasNoLockable && galois::runtime::shouldLock(mflag)) { for (edge_iterator ii = in_raw_begin(N), ee = in_raw_end(N); ii != ee; ++ii) { - BaseGraph::acquireNode(inEdgeDst[*ii], mflag); + BaseGraph::acquireNode(in_edge_dst_[*ii], mflag); } } return in_raw_begin(N); @@ -313,7 +341,7 @@ class LC_CSR_CSC_Graph * @param ni edge id * @returns destination for that in edge */ - GraphNode getInEdgeDst(edge_iterator ni) const { return inEdgeDst[*ni]; } + GraphNode getInEdgeDst(edge_iterator ni) const { return in_edge_dst_[*ni]; } /** * Given an edge id for in edge, get the data associated with that edge. 
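The manual CSC construction API above is easiest to see on a toy graph. The following is a minimal sketch (not part of the patch): it assumes the usual LC_CSR_Graph construction calls (allocateFrom, constructNodes, fixEndEdge, constructEdge) and an edge-data-free instantiation of LC_CSR_CSC_Graph; the function name, graph type parameter, and sizes are illustrative only.

  #include "galois/graphs/LC_CSR_CSC_Graph.h"

  // GraphTy is assumed to be some edge-data-free LC_CSR_CSC_Graph instantiation.
  template <typename GraphTy>
  void BuildToyCSC(GraphTy& g) {
    // CSR (out-edge) side, standard construction:
    // 3 nodes, directed edges 0->1, 0->2, 1->2
    g.allocateFrom(3, 3);
    g.constructNodes();
    g.fixEndEdge(0, 2); g.constructEdge(0, 1); g.constructEdge(1, 2);
    g.fixEndEdge(1, 3); g.constructEdge(2, 2);
    g.fixEndEdge(2, 3);

    // CSC (in-edge) side built by hand instead of the automatic transpose
    g.CSCAllocate();          // sized from numNodes/numEdges set above
    g.FixEndInEdge(0, 0);     // node 0: no in-edges
    g.FixEndInEdge(1, 1);     // node 1: in-edges occupy indices [0, 1)
    g.FixEndInEdge(2, 3);     // node 2: in-edges occupy indices [1, 3)
    g.ConstructInEdge(0, 0);  // in-edge of node 1; stored value is the source (0)
    g.ConstructInEdge(1, 0);  // in-edge of node 2, from node 0
    g.ConstructInEdge(2, 1);  // in-edge of node 2, from node 1
    // getInEdgeDst() on node 2's in-edges now yields 0 and 1,
    // matching the out-edges constructed above.
  }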
From 455159e13e980796cc0e59ec712fe220e5ef7e16 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Apr 2021 18:43:47 -0500 Subject: [PATCH 513/660] AVX512 for vector add/mul function --- libgnn/src/GNNMath.cpp | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index afb3712981..9e8de18d5f 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -52,6 +52,24 @@ void galois::VectorAdd(size_t length, const GNNFloat* a, const GNNFloat* b, void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, const GNNFloat b_scale, GNNFloat* output) { +#ifdef __AVX512F__ + // 512 + constexpr size_t vectorization_length = 16; + const size_t aligned_end = length - length % vectorization_length; + __m512 scale_vec_main = _mm512_set_ps( + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + for (size_t i = 0; i < aligned_end; i += vectorization_length) { + _mm512_storeu_ps( + &output[i], + _mm512_add_ps(_mm512_loadu_ps(&a[i]), + _mm512_mul_ps(scale_vec_main, _mm512_loadu_ps(&b[i])))); + } + // handle the rest + for (size_t i = aligned_end; i < length; ++i) { + output[i] = a[i] + b[i] * b_scale; + } +#else #ifdef __AVX2__ constexpr size_t vectorization_length = 8; // for 32-bit floating point in AVX2; TODO AVX512 @@ -82,6 +100,7 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, output[i] = a[i] + b[i] * b_scale; } #endif +#endif } void galois::GNNSoftmax(const size_t vector_length, const GNNFloat* input, From 4d7bc090b048f0d0b32077edc5da6e8d19ed349e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 24 Apr 2021 13:05:31 -0500 Subject: [PATCH 514/660] MKL link update: 20.0 and parallel link On CDGC machines, link to MKL 20. Also, fix sequential MKL link because that makes the BLAS calls sequential and not parallel. Fixing parallel link ended up with a near 4x speedup for 3 layer 256 hidden layer SAGE on products. --- CMakeLists.txt | 2 +- libgnn/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index bb72f24c71..1eaa1e1e0a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,7 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/19.0) + SET(INTEL_ROOT /opt/apps/sysnet/intel/20.0) SET(MKL_ROOT ${INTEL_ROOT}/mkl) find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ed60ae032b..665cd14545 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -13,7 +13,7 @@ set(sources ) set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) -set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") +set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") add_library(galois_gnn STATIC ${sources}) From 0d31eff9e9d1e0edccad8702792b59faed92f950 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 26 Apr 2021 16:58:53 -0500 Subject: [PATCH 515/660] GNNSubgraph in Graph Subgraph object in graph that is used when flag is flipped. NOT CHECKED IN DETAIL DUE TO DEADLINE COMING UP AND NEED TO RUSH. 
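Roughly, the flag flip works as sketched below (not the exact training code; the `graph` variable name and loop bounds are illustrative, and the calls are the ones added to GNNGraph in this patch):

  // after marking sampled nodes/edges for every layer:
  size_t rows = graph.ConstructSampledSubgraph();  // builds the CSR/CSC copy,
                                                   // then sets use_subgraph_
  // from here on, accessors take subgraph ids (SIDs in [0, rows)) and
  // translate back to local ids internally where needed (labels, masks):
  for (size_t sid = 0; sid < rows; ++sid) {
    for (auto e : graph.edges(sid)) {              // sampled topology only
      size_t dst_sid = graph.GetEdgeDest(e);       // also a subgraph id
      (void)dst_sid;
    }
  }
  auto feats = graph.GetLocalFeatures();           // subgraph's feature copy
  graph.DisableSubgraph();                         // full graph for val/test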
--- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNTypes.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 234 +++++++++++++++++---- libgnn/include/galois/graphs/GNNSubgraph.h | 132 ++++++++++++ libgnn/src/GNNMath.cpp | 2 +- libgnn/src/graphs/GNNGraph.cpp | 139 ++++++++++-- libgnn/src/graphs/GNNSubgraph.cpp | 186 ++++++++++++++++ 7 files changed, 636 insertions(+), 60 deletions(-) create mode 100644 libgnn/include/galois/graphs/GNNSubgraph.h create mode 100644 libgnn/src/graphs/GNNSubgraph.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 665cd14545..2393ce043b 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -3,6 +3,7 @@ set(sources src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp src/graphs/GNNGraph.cpp + src/graphs/GNNSubgraph.cpp src/layers/DenseLayer.cpp src/layers/GNNLayer.cpp src/layers/GraphConvolutionalLayer.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 3603cb68d7..492bc841dc 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -25,7 +25,7 @@ using GPUNodeIndex = uint32_t; using GPUEdgeIndex = uint64_t; //! Phase of GNN computation -enum class GNNPhase { kTrain, kValidate, kTest }; +enum class GNNPhase { kTrain, kValidate, kTest, kOther }; //! Vector like wrapper over a pointer and size; exists solely to pass around //! raw pointers with size (because vectors are a no-go due to the code diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 853a96dc0d..3a538d9da5 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -30,7 +30,6 @@ namespace graphs { //! Possible partitioning schemes for the GNN graph enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; -//! XXX class GNNGraph { public: using GNNDistGraph = galois::graphs::DistGraph; @@ -66,6 +65,14 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + //! Returns # of nodes in the *graph that is currently active*. + size_t active_size() const { + if (!use_subgraph_) { + return partitioned_graph_->size(); + } else { + return subgraph_->size(); + } + } bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } @@ -73,28 +80,44 @@ class GNNGraph { //! Node begin for all local nodes NodeIterator begin() const { - return partitioned_graph_->allNodesRange().begin(); + if (!use_subgraph_) { + return partitioned_graph_->allNodesRange().begin(); + } else { + return subgraph_->begin(); + } } //! 
Node end for all local nodes - NodeIterator end() const { return partitioned_graph_->allNodesRange().end(); } + NodeIterator end() const { + if (!use_subgraph_) { + return partitioned_graph_->allNodesRange().end(); + } else { + return subgraph_->end(); + } + } NodeIterator begin_owned() const { - return partitioned_graph_->masterNodesRange().begin(); + if (!use_subgraph_) { + return partitioned_graph_->masterNodesRange().begin(); + } else { + return subgraph_->begin_owned(); + } } NodeIterator end_owned() const { - return partitioned_graph_->masterNodesRange().end(); + if (!use_subgraph_) { + return partitioned_graph_->masterNodesRange().end(); + } else { + return subgraph_->end_owned(); + } } ////////////////////////////////////////////////////////////////////////////// // Edges ////////////////////////////////////////////////////////////////////////////// - void InitializeEdgeData() { InitializeEdgeData(1); } - - void InitializeEdgeData(size_t num_layers) { - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); - } + void InitializeSamplingData() { InitializeSamplingData(1); } + //! Initialize data required to do graph sampling + void InitializeSamplingData(size_t num_layers); ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -102,17 +125,64 @@ class GNNGraph { // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { - return partitioned_graph_->edge_begin(n); + if (!use_subgraph_) { + return partitioned_graph_->edge_begin(n); + } else { + return subgraph_->edge_begin(n); + } }; + EdgeIterator edge_end(GraphNode n) const { - return partitioned_graph_->edge_end(n); + if (!use_subgraph_) { + return partitioned_graph_->edge_end(n); + } else { + return subgraph_->edge_end(n); + } }; GraphNode GetEdgeDest(EdgeIterator ei) const { - return partitioned_graph_->getEdgeDst(ei); + if (!use_subgraph_) { + return partitioned_graph_->getEdgeDst(ei); + } else { + return subgraph_->GetEdgeDest(ei); + } + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode N) const { + if (!use_subgraph_) { + return partitioned_graph_->edges(N); + } else { + return subgraph_->edges(N); + } + } + + bool IsEdgeSampledAny(EdgeIterator ei) const { + for (bool b : edge_sample_status_[*ei]) { + if (b) + return true; + } + return false; + } + bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { + if (!use_subgraph_) { + return edge_sample_status_[ei][layer_num]; + } else { + GALOIS_LOG_FATAL("This shouldn't be called with subgraph"); + return false; + } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { + if (!use_subgraph_) { + return edge_sample_status_[*ei][layer_num]; + } else { + return subgraph_->OutEdgeSampled(ei, layer_num, *this); + } + }; + //! Always use original graph's edge iterator here + bool IsEdgeSampledOriginalGraph(EdgeIterator ei, size_t layer_num) const { return edge_sample_status_[*ei][layer_num]; }; + //! 
Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[*ei][layer_num] = 1; @@ -121,28 +191,62 @@ class GNNGraph { void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[*ei][layer_num] = 0; }; - galois::runtime::iterable< - galois::NoDerefIterator> - edges(GraphNode N) { - return partitioned_graph_->edges(N); - } ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// EdgeIterator in_edge_begin(GraphNode n) const { - return partitioned_graph_->in_edge_begin(n); + if (!use_subgraph_) { + return partitioned_graph_->in_edge_begin(n); + } else { + return subgraph_->in_edge_begin(n); + } } EdgeIterator in_edge_end(GraphNode n) const { - return partitioned_graph_->in_edge_end(n); + if (!use_subgraph_) { + return partitioned_graph_->in_edge_end(n); + } else { + return subgraph_->in_edge_end(n); + } + } + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode N) const { + if (!use_subgraph_) { + return partitioned_graph_->in_edges(N); + } else { + return subgraph_->in_edges(N); + } } GraphNode GetInEdgeDest(EdgeIterator ei) const { - return partitioned_graph_->GetInEdgeDest(ei); + if (!use_subgraph_) { + return partitioned_graph_->GetInEdgeDest(ei); + } else { + return subgraph_->GetInEdgeDest(ei); + } + }; + + EdgeIterator InEdgeToOutEdge(EdgeIterator in_edge_iter) const { + return partitioned_graph_->InEdgeToOutEdge(in_edge_iter); + } + + bool IsInEdgeSampledAny(EdgeIterator ei) const { + for (bool b : + edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)]) { + if (b) + return true; + } + return false; }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { - return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] - [layer_num]; + if (!use_subgraph_) { + return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] + [layer_num]; + } else { + return subgraph_->InEdgeSampled(ei, layer_num, *this); + } }; + //! Set the flag on the edge to 1; makes it sampled void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; @@ -151,11 +255,6 @@ class GNNGraph { void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; }; - galois::runtime::iterable< - galois::NoDerefIterator> - in_edges(GraphNode N) { - return partitioned_graph_->in_edges(N); - } ////////////////////////////////////////////////////////////////////////////// // neighborhood sampling @@ -163,15 +262,42 @@ class GNNGraph { //! Set seed nodes, i.e., nodes that are being predicted on void SetupNeighborhoodSample(); + + //! Choose all edges from sampled nodes + void SampleAllEdges(size_t agg_layer_num); //! Sample neighbors of nodes that are marked as ready for sampling void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + //! 
Construct the subgraph from sampled edges and corresponding nodes + size_t ConstructSampledSubgraph() { + // false first so that the build process can use functions to access the + // real graph + use_subgraph_ = false; + size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + // after this, this graph is a subgraph + use_subgraph_ = true; + return num_subgraph_nodes; + } + + void EnableSubgraph() { use_subgraph_ = true; } + + void DisableSubgraph() { use_subgraph_ = false; } + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetDegreeNorm(GraphNode n) const { + if (!use_subgraph_) { + return degree_norm_[n]; + } else { + // XXX does not work in distributed case, fix there + // XXX also need to account for current layer number in sampling + // case because degrees in each layer differ + return 1.0 / subgraph_->GetLocalDegree(n); + } + } // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, @@ -183,11 +309,19 @@ class GNNGraph { //! class labels. GNNFloat GetSingleClassLabel(const unsigned lid) const { assert(using_single_class_labels_); - if (local_ground_truth_labels_[lid] != num_label_classes_) { - return local_ground_truth_labels_[lid]; + unsigned to_use = lid; + if (use_subgraph_) { + to_use = subgraph_->SIDToLID(lid); + } + + if (local_ground_truth_labels_[to_use] != num_label_classes_) { + // galois::gPrint(lid, " ", to_use, " ", + // (int)local_ground_truth_labels_[to_use], "\n"); + return local_ground_truth_labels_[to_use]; } else { GALOIS_LOG_FATAL( - "should not get the label of a node that has no ground truth"); + "should not get the label of a node that has no ground truth {}", + to_use); } } @@ -208,7 +342,12 @@ class GNNGraph { local_node_features_.size()); } #endif - return PointerWithSize(local_node_features_); + if (!use_subgraph_) { + return PointerWithSize(local_node_features_); + } else { + return PointerWithSize(subgraph_->GetLocalFeatures().data(), + subgraph_->GetLocalFeatures().size()); + } } //! Given an LID and the current phase of GNN computation, determine if the @@ -216,10 +355,16 @@ class GNNGraph { //! 
a training, validation, or test phase mask) bool IsValidForPhase(const unsigned lid, const galois::GNNPhase current_phase) const { - if (!incomplete_masks_) { - return IsValidForPhaseCompleteRange(lid, current_phase); + // XXX maybe just map this all over to subgraph, though in that case + // issue is that subgraph doesn't necessarily know about test/val + unsigned to_use = lid; + if (use_subgraph_) { + to_use = subgraph_->SIDToLID(lid); + } + if (!incomplete_masks_ && current_phase != GNNPhase::kOther) { + return IsValidForPhaseCompleteRange(to_use, current_phase); } else { - return IsValidForPhaseMasked(lid, current_phase); + return IsValidForPhaseMasked(to_use, current_phase); } } @@ -293,6 +438,10 @@ class GNNGraph { #endif private: +// included like this to avoid cyclic dependency issues + not used anywhere but +// in this class anyways +#include "galois/graphs/GNNSubgraph.h" + ////////////////////////////////////////////////////////////////////////////// // Initialization ////////////////////////////////////////////////////////////////////////////// @@ -307,6 +456,8 @@ class GNNGraph { size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, GNNRange* mask_range, char* masks); + //! Finds nodes that aren't part of the 3 main GNN phase classifications + size_t FindOtherMask(); //! Read masks of local nodes only for training, validation, and testing void ReadLocalMasks(const std::string& dataset_name); //! Reads the entire graph topology in (but nothing else) @@ -368,10 +519,15 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + std::unique_ptr subgraph_; + // Degrees for sampled subgraph + galois::LargeArray sampled_out_degrees_; + galois::LargeArray sampled_in_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; - + //! Indicates newly sampled nodes (for distributed synchronization of sampling + //! status galois::DynamicBitSet new_sampled_nodes_; ////////////////////////////////////////////////////////////////////////////// @@ -383,6 +539,9 @@ class GNNGraph { std::vector local_validation_mask_; //! Bitset indicating which nodes are testing nodes std::vector local_testing_mask_; + size_t valid_other_{0}; + //! Bitset indicating which nodes don't fall anywhere + std::vector other_mask_; //! Global mask range for training nodes; must convert to LIDs when using //! in this class @@ -408,6 +567,7 @@ class GNNGraph { galois::PerThreadRNG sample_rng_; // TODO vars for subgraphs as necessary + bool use_subgraph_{false}; ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h new file mode 100644 index 0000000000..c3c931f0da --- /dev/null +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -0,0 +1,132 @@ +// Note no header guard or anything like that; this file is meant to be +// included in the middle of GNNGraph class declaration as a class in a class +class GNNSubgraph { +public: + using GraphNode = LC_CSR_CSC_Graph::GraphNode; + using NodeIterator = boost::counting_iterator; + using EdgeIterator = LC_CSR_CSC_Graph::edge_iterator; + + //! 
Allocates space for the lid to sid map + GNNSubgraph(size_t main_graph_size) { + lid_to_subgraph_id_.create(main_graph_size, + std::numeric_limits::max()); + // the subgraph to original graph maps are allocated on demand in gstl + // vectors since those change every epoch + } + //! Given sampled bits set on gnn_graph, builds an explicit subgraph + //! for the sampled bits + size_t BuildSubgraph(GNNGraph& gnn_graph); + + galois::gstl::Vector& GetLocalFeatures() { + return subgraph_node_features_; + } + + ////////////////////////////////////////////////////////////////////////////// + // Nodes + ////////////////////////////////////////////////////////////////////////////// + + uint32_t size() { return num_subgraph_nodes_; } + NodeIterator begin() const { return NodeIterator(0); } + NodeIterator end() const { return NodeIterator(num_subgraph_nodes_); } + + NodeIterator begin_owned() const { return NodeIterator(0); } + NodeIterator end_owned() const { + return NodeIterator(subgraph_master_boundary_); + } + + uint32_t SIDToLID(uint32_t sid) const { return subgraph_id_to_lid_[sid]; } + + ////////////////////////////////////////////////////////////////////////////// + // Edge iteration and destination + ////////////////////////////////////////////////////////////////////////////// + + EdgeIterator edge_begin(GraphNode n) { + return underlying_graph_.edge_begin(n); + } + EdgeIterator edge_end(GraphNode n) { return underlying_graph_.edge_end(n); } + GraphNode GetEdgeDest(EdgeIterator out_edge_iterator) { + return underlying_graph_.getEdgeDst(out_edge_iterator); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + edges(GraphNode n) { + return internal::make_no_deref_range(edge_begin(n), edge_end(n)); + } + + EdgeIterator in_edge_begin(GraphNode n) { + return underlying_graph_.in_edge_begin(n); + } + EdgeIterator in_edge_end(GraphNode n) { + return underlying_graph_.in_edge_end(n); + } + GraphNode GetInEdgeDest(EdgeIterator in_edge_iterator) { + return underlying_graph_.getInEdgeDst(in_edge_iterator); + }; + galois::runtime::iterable< + galois::NoDerefIterator> + in_edges(GraphNode n) { + return internal::make_no_deref_range(in_edge_begin(n), in_edge_end(n)); + } + + size_t GetLocalDegree(GraphNode n) { + return std::distance(edge_begin(n), edge_end(n)); + } + + ////////////////////////////////////////////////////////////////////////////// + // Edge sampling status check + ////////////////////////////////////////////////////////////////////////////// + + bool OutEdgeSampled(EdgeIterator out_edge_iterator, size_t layer_num, + const GNNGraph& original_graph) { + return original_graph.IsEdgeSampledOriginalGraph( + subedge_to_original_edge_[*out_edge_iterator], layer_num); + } + bool InEdgeSampled(EdgeIterator in_edge_iterator, size_t layer_num, + const GNNGraph& original_graph) { + // note that original IsEdgeSampled is called because this object stores the + // original edge already + return original_graph.IsEdgeSampledOriginalGraph( + in_subedge_to_original_edge_[*in_edge_iterator], layer_num); + } + + ////////////////////////////////////////////////////////////////////////////// + +private: + //! Creates subgraph ID mapping from the number of sampled nodes from the + //! original graph. Should be done every epoch when sampled graph changes. + void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph); + //! Counts in and out degrees of all sampled nodes in the graph + void DegreeCounting(const GNNGraph& gnn_graph); + //! Creates edges + void EdgeCreation(const GNNGraph& gnn_graph); + //! 
Copies over relevant features of the nodes + void NodeFeatureCreation(GNNGraph& gnn_graph); + + static const constexpr char* kRegionName = "GNNSubgraph"; + + // name is self explanatory + LC_CSR_CSC_Graph underlying_graph_; + // size vars + uint32_t num_subgraph_nodes_; + uint32_t num_subgraph_edges_; + uint32_t subgraph_master_boundary_; + //! Features corresponding only to this subgraph; copied from main graph + //! (in other words, redundant; would be nice if there was a way to + //! fake contiguous memory + galois::gstl::Vector subgraph_node_features_; + //! Dense array mapping local ids to subgraph id (not space efficient) + galois::LargeArray lid_to_subgraph_id_; + //! Map subgraph ids back to local graph ids + //! gstl vector because this will get resized every epoch (LargeArray + //! is for static) + galois::gstl::Vector subgraph_id_to_lid_; + // intermediate degrees used for edge construction + galois::gstl::Vector subgraph_out_degrees_; + galois::gstl::Vector subgraph_in_degrees_; + //! Maps from subgraph out-edge id to original graph edge id (used to check if + //! edge exists in particular layer) + galois::gstl::Vector subedge_to_original_edge_; + //! Maps from subgraph in-edge id to original graph edge id (used to check if + //! edge exists in particular layer) + galois::gstl::Vector in_subedge_to_original_edge_; +}; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 9e8de18d5f..582fba95f6 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -56,7 +56,7 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, // 512 constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; - __m512 scale_vec_main = _mm512_set_ps( + __m512 scale_vec_main = _mm512_set_ps( b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c12288c950..56572ccb76 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -126,6 +126,9 @@ bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( case GNNPhase::kTest: range_to_use = &global_testing_mask_range_; break; + case GNNPhase::kOther: + GALOIS_LOG_FATAL("no range for other"); + break; default: GALOIS_LOG_FATAL("Invalid phase used"); range_to_use = nullptr; @@ -156,6 +159,12 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( case GNNPhase::kTest: mask_to_use = &local_testing_mask_; break; + case GNNPhase::kOther: + if (valid_other_ == 0) { + return false; + } + mask_to_use = &other_mask_; + break; default: GALOIS_LOG_FATAL("Invalid phase used"); mask_to_use = nullptr; @@ -486,6 +495,25 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( return valid_count; } +size_t galois::graphs::GNNGraph::FindOtherMask() { + galois::GAccumulator other_accum; + other_accum.reset(); + other_mask_.resize(partitioned_graph_->size()); + + galois::do_all( + galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t local_id) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) && + !IsValidForPhase(local_id, GNNPhase::kValidate) && + !IsValidForPhase(local_id, GNNPhase::kTest)) { + other_mask_[local_id] = 1; + other_accum += 1; + } + }, + galois::loopname("FindOtherMask")); + return other_accum.reduce(); +} + void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& 
dataset_name) { // allocate the memory for the local masks local_training_mask_.resize(partitioned_graph_->size()); @@ -535,10 +563,13 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, local_testing_mask_.data()); + valid_other_ = FindOtherMask(); + // the "other" set of nodes that don't fall into any classification if (galois::runtime::getSystemNetworkInterface().ID == 0) { galois::gInfo("Valid # training nodes is ", valid_train); galois::gInfo("Valid # validation nodes is ", valid_val); galois::gInfo("Valid # test nodes is ", valid_test); + galois::gInfo("Valid # other nodes is ", valid_other_); } } } @@ -665,26 +696,30 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase, bool sampling) { + PointerWithSize predictions, GNNPhase phase, bool) { // check owned nodes' accuracy assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); galois::do_all( + // will only loop over sampled nodes if sampling is on galois::iterate(begin_owned(), end_owned()), - [&](const unsigned lid) { - if (IsValidForPhase(lid, phase)) { - if (sampling) { - if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { - return; - } - } + // this is possibly the subgraph id + [&](const unsigned node_id) { + unsigned lid = node_id; + if (use_subgraph_) { + // convert SID over to LID + lid = subgraph_->SIDToLID(node_id); + } + if (IsValidForPhase(lid, phase)) { total_checked_ += 1; // get prediction by getting max + // note the use of node_id here: lid only used to check original + // labels size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[lid * num_label_classes_])); + num_label_classes_, &(predictions[node_id * num_label_classes_])); // check against ground truth and track accordingly // TODO static cast used here is dangerous if (predicted_label == @@ -699,7 +734,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - GALOIS_LOG_VERBOSE("Accuracy: {} / {}", global_correct, global_checked); + GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -819,6 +855,15 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( return global_f1_micro_score; } +//////////////////////////////////////////////////////////////////////////////// + +void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers) { + subgraph_ = std::make_unique(partitioned_graph_->size()); + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); + sampled_out_degrees_.create(partitioned_graph_->size(), 0); + sampled_in_degrees_.create(partitioned_graph_->size(), 0); +} + void galois::graphs::GNNGraph::SetupNeighborhoodSample() { new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); @@ -839,8 +884,54 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { }); } +void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { + use_subgraph_ = false; + + galois::GAccumulator sampled; + galois::GAccumulator total; + sampled.reset(); + total.reset(); + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + // only operate on if sampled + if 
(partitioned_graph_->getData(*x)) { + // marks ALL edges of nodes that connect to train/other nodes + for (auto edge_iter : partitioned_graph_->edges(*x)) { + if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) || + IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + MakeEdgeSampled(edge_iter, agg_layer_num); + new_sampled_nodes_.set(partitioned_graph_->getEdgeDst(edge_iter)); + sampled += 1; + } + total += 1; + } + } + }, + galois::steal(), galois::loopname("ChooseAllEdges")); + + galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", + total.reduce(), "\n"); + + std::vector new_nodes = new_sampled_nodes_.getOffsets(); + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(new_nodes), + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::loopname("NeighborhoodSampleSet")); + + // XXX(loc) bitset; can readAny be weaker? + sync_substrate_->sync("SampleSync"); +} + void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { + use_subgraph_ = false; + galois::GAccumulator sampled; galois::GAccumulator total; sampled.reset(); @@ -852,18 +943,24 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (partitioned_graph_->getData(*x)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) - // XXX in-degree prob, not out degree double probability_of_reject = std::pow(1 - GetDegreeNorm(*x), num_to_sample); - // loop through in-edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->in_edges(*x)) { + // loop through edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->edges(*x)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { - // if here, it means edge accepted; set sampled on, mark source - // as part of next set - MakeInEdgeSampled(edge_iter, sample_layer_num); - new_sampled_nodes_.set( - partitioned_graph_->GetInEdgeDest(edge_iter)); - sampled += 1; + // only take if node is training node or a node not classified + // into train/test/val + if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) || + IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + sampled += 1; + } } total += 1; } @@ -871,8 +968,8 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, }, galois::steal(), galois::loopname("NeighborhoodSample")); - galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", - total.reduce(), "\n"); + galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", + sampled.reduce(), " out of ", total.reduce()); std::vector new_nodes = new_sampled_nodes_.getOffsets(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp new file mode 100644 index 0000000000..e80dfffbc9 --- /dev/null +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -0,0 +1,186 @@ +#include "galois/graphs/GNNGraph.h" +#include + +size_t +galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { + galois::StatTimer timer("BuildSubgraph", kRegionName); + timer.start(); + CreateLocalToSubgraphMapping(gnn_graph); + 
DegreeCounting(gnn_graph); + EdgeCreation(gnn_graph); + NodeFeatureCreation(gnn_graph); + // loop over each node, grab out/in edges, construct them in LC_CSR_CSC + // no edge data, just topology + timer.stop(); + return num_subgraph_nodes_; +} + +void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("LIDToSIDMapping", kRegionName); + timer.start(); + + assert(gnn_graph.size() == lid_to_subgraph_id_.size()); + // clear all mappings + std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), + std::numeric_limits::max()); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum + // serial loop over LIDs to construct lid -> subgraph id mapping + uint32_t current_sid = 0; + + // split into 2 parts: masters, then mirrors + size_t last_owned_node = *(gnn_graph.end_owned()); + for (size_t local_node_id = 0; local_node_id < last_owned_node; + local_node_id++) { + if (gnn_graph.IsInSampledGraph(local_node_id)) { + // TODO should bound check the SID to max uint32_t + // note: if SID is max uint32t, then it's not valid + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + + // all nodes before this SID are master nodes + subgraph_master_boundary_ = current_sid; + + for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); + local_node_id++) { + if (gnn_graph.IsInSampledGraph(local_node_id)) { + // TODO should bound check the SID to max uint32_t + // note: if SID is max uint32t, then it's not valid + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + galois::gDebug("Numbered sampled nodes for subgraph construction is ", + current_sid); + + num_subgraph_nodes_ = current_sid; + + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("DegreeCounting", kRegionName); + timer.start(); + + subgraph_id_to_lid_.resize(num_subgraph_nodes_); + subgraph_out_degrees_.resize(num_subgraph_nodes_); + subgraph_in_degrees_.resize(num_subgraph_nodes_); + + galois::do_all( + galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; + subgraph_id_to_lid_[subgraph_id] = node_id; + + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; + } + } + subgraph_out_degrees_[subgraph_id] = out_degrees; + + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; + } + } + subgraph_in_degrees_[subgraph_id] = in_degrees; + // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", + // out_degrees, " in ", in_degrees); + } + }, + galois::steal()); + + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( + const GNNGraph& gnn_graph) { + galois::StatTimer timer("EdgeConstruction", kRegionName); + timer.start(); + + // prefix sum over subgraph degrees from previous phase to get starting points + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + subgraph_out_degrees_[i] += subgraph_out_degrees_[i - 1]; + subgraph_in_degrees_[i] += subgraph_in_degrees_[i - 1]; + } + + // allocate then set node endpoints + num_subgraph_edges_ = subgraph_out_degrees_.back(); + underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); + underlying_graph_.CSCAllocate(); + 
galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), + [&](uint32_t subgraph_id) { + underlying_graph_.fixEndEdge( + subgraph_id, subgraph_out_degrees_[subgraph_id]); + underlying_graph_.FixEndInEdge( + subgraph_id, subgraph_in_degrees_[subgraph_id]); + }); + subedge_to_original_edge_.resize(num_subgraph_edges_); + in_subedge_to_original_edge_.resize(num_subgraph_edges_); + + // save edges + save reference to layer sample status + galois::do_all( + galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; + + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = subgraph_out_degrees_[subgraph_id - 1]; + in_location = subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + subedge_to_original_edge_[out_location] = *out_edge_iter; + underlying_graph_.constructEdge( + out_location++, gnn_graph.GetEdgeDest(out_edge_iter)); + } + } + + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, gnn_graph.GetInEdgeDest(in_edge_iter)); + } + } + assert(out_location == subgraph_out_degrees_[subgraph_id]); + assert(in_location == subgraph_in_degrees_[subgraph_id]); + } + }, + galois::steal()); + timer.stop(); +} + +void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( + GNNGraph& gnn_graph) { + galois::StatTimer timer("NodeFeatureCreation", kRegionName); + timer.start(); + size_t feat_length = gnn_graph.node_feature_length(); + // assumes everything is already setup + subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); + + galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { + size_t local_id = subgraph_id_to_lid_[subgraph_node_id]; + std::memcpy( + &(subgraph_node_features_[subgraph_node_id * feat_length]), + &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), + feat_length * sizeof(GNNFeature)); + // for (unsigned i = 0; i < feat_length; i++) { + // galois::gPrint(feat_length * sizeof(GNNFeature) , " ", subgraph_node_id, + // " local id " , local_id, " feat at ", i, " is ", + // subgraph_node_features_[subgraph_node_id * feat_length + i], " ", + // gnn_graph.GetLocalFeatures()[local_id * feat_length + i], "\n"); + //} + }); + timer.stop(); +} From 5b48172ddd90311d65c006ada8fcadfe1712380c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 26 Apr 2021 17:00:16 -0500 Subject: [PATCH 516/660] WIP inductive training on GNN Works on single host seemingly, but code not checked in detail yet. 
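In rough outline, the two modes this patch wires into Train() behave as follows (a sketch only; `layers` stands in for gnn_layers_, and the fan-out of 30 and the is_aggregating() helper are illustrative):

  // Inductive: one subgraph over every edge from an already-sampled node to a
  // train/"other" node, built once before the epoch loop.
  graph.SetupNeighborhoodSample();
  for (auto it = layers.rbegin(); it != layers.rend(); ++it)
    if (is_aggregating(**it))                      // GCN or SAGE layers only
      graph.SampleAllEdges((*it)->layer_number());
  size_t rows = graph.ConstructSampledSubgraph();

  // Sampling: the neighborhood is re-drawn every epoch with a fixed fan-out.
  for (size_t epoch = 0; epoch < num_epochs; ++epoch) {
    graph.SetupNeighborhoodSample();
    for (auto it = layers.rbegin(); it != layers.rend(); ++it)
      if (is_aggregating(**it))
        graph.SampleEdges((*it)->layer_number(), 30);
    rows = graph.ConstructSampledSubgraph();
    for (auto& layer : layers)
      layer->ResizeRows(rows);                     // shrink logical row count only
    // ... DoInference()/GradientPropagation() over 'rows' rows ...
    graph.DisableSubgraph();                       // validation/test epochs go
    for (auto& layer : layers)                     // back to the full graph
      layer->ResizeRows(graph.size());
  }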
--- libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/include/galois/layers/SAGELayer.h | 9 ++- libgnn/src/GraphNeuralNetwork.cpp | 82 +++++++++++++++------- libgnn/src/layers/GNNLayer.cpp | 19 +++--- libgnn/src/layers/SAGELayer.cpp | 86 +++++++++++------------- libgnn/src/layers/SoftmaxLayer.cpp | 16 +++-- libgnn/test/sample-bit-test.cpp | 2 +- 7 files changed, 131 insertions(+), 89 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index b5fb109ffe..82b149ee5e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -94,6 +94,12 @@ class GNNLayer { : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig()) {} + virtual void ResizeRows(size_t new_row_count) { + layer_dimensions_.input_rows = new_row_count; + // TODO(loc) output matrix should be resized if space becomes an issue, + // else just use first S rows (S = subgraph size) + } + GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase void SetLayerPhase(GNNPhase new_phase) { layer_phase_ = new_phase; } diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index b5ee978067..dd9ceb6e7b 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -28,7 +28,6 @@ class SAGELayer : public GNNLayer { //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, const SAGELayerConfig& sage_config); @@ -45,6 +44,14 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} + void ResizeRows(size_t new_row_count) { + galois::gDebug("Resizing SAGE layer for sampled graph from ", + layer_dimensions_.input_rows); + GNNLayer::ResizeRows(new_row_count); + galois::gDebug("To ", layer_dimensions_.input_rows); + // TODO(loc) resize input matrices if space is reason for doing this + } + void InitSelfWeightsTo1() { if (layer_weights_2_.size()) { layer_weights_2_.assign(layer_weights_2_.size(), 1); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 42bcfc3b08..1b492c34ec 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -13,6 +13,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( galois::GraphNeuralNetworkConfig&& config) : graph_(std::move(graph)), optimizer_(std::move(optimizer)), config_(std::move(config)) { + if (config_.do_sampling_ && config_.inductive_training_) { + GALOIS_LOG_FATAL("Do not set inductive training and sampling at same time " + "(sampling is inductive already)"); + } // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer size_t max_rows = graph_->size(); @@ -99,8 +103,10 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } - // output layer not included; it will never involve sampling - graph_->InitializeEdgeData(gnn_layers_.size()); + if (config_.do_sampling() || config_.inductive_training_) { + // output layer not included; it will never involve sampling + graph_->InitializeSamplingData(gnn_layers_.size()); + } // create the output layer GNNLayerDimensions output_dims = { @@ -134,7 +140,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( 
assert(false); } - // flip sampling + // flip sampling on layers if (config_.do_sampling()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); @@ -145,20 +151,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - - /* - if (config_.do_sampling()) { - for (std::unique_ptr& ptr : gnn_layers_) { - assert(ptr->IsSampledLayer()); - } - } - */ - - bool altered_norm_factor = - config_.inductive_training_ || config_.do_sampling(); - + size_t inductive_nodes = 0; if (config_.inductive_training_) { - graph_->CalculateSpecialNormFactor(false, true); + // Setup the subgraph to only be the training graph + graph_->SetupNeighborhoodSample(); + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + graph_->SampleAllEdges((*back_iter)->layer_number()); + } + } + // resize layer matrices + inductive_nodes = graph_->ConstructSampledSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(inductive_nodes); + } } galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); @@ -168,6 +178,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + if (config_.inductive_training_) { + graph_->EnableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(inductive_nodes); + } + } if (config_.do_sampling()) { graph_->SetupNeighborhoodSample(); @@ -180,11 +197,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->layer_number(), 5); + graph_->SampleEdges((*back_iter)->layer_number(), 30); num_sampled_layers++; } } galois::gDebug("Number of sampled layers is ", num_sampled_layers); + + // resize layer matrices + size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(num_subgraph_nodes); + } } const PointerWithSize predictions = DoInference(); @@ -210,8 +234,14 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; // get real norm factor back if altered by sampling or inductive training - if ((do_validate || do_test) && altered_norm_factor) { - graph_->CalculateFullNormFactor(); + if (do_validate || do_test) { + // disable subgraph + graph_->DisableSubgraph(); + // TODO only do this when necessary + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(graph_->size()); + } } if (do_validate) { @@ -256,18 +286,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { SetLayerPhases(galois::GNNPhase::kTrain); // get back inductive norm factor as necessary; sampling norm is handled // at beginning of every iteration - if (config_.inductive_training_ && !config_.do_sampling()) { - graph_->CalculateSpecialNormFactor(false, true); - } } } uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", average_epoch_time); - - if (altered_norm_factor) { - graph_->CalculateFullNormFactor(); + // disable subgraph + graph_->DisableSubgraph(); + // TODO only do this when necessary + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + (*layer)->ResizeRows(graph_->size()); } // check test accuracy @@ -284,7 +313,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "FinalTestAccuracy", global_accuracy); } - return global_accuracy; + // return global_accuracy; + return 0; } const galois::PointerWithSize diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index b88f91b631..14d8bd8759 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -154,9 +154,8 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - size_t num_elements = output_matrix->size(); - assert(num_elements == dropout_mask_.size()); - assert(num_elements == input_to_dropout.size()); + size_t num_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; // determine which parts to drop galois::do_all( @@ -263,7 +262,9 @@ void galois::GNNLayer::Activation() { // TODO only does relu at the moment; should check user specified activation // and act accordingly galois::do_all( - galois::iterate(static_cast(0), forward_output_matrix_.size()), + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), [&](size_t i) { if (forward_output_matrix_[i] > 0.0) { // do nothing, keep value; set the memo though @@ -285,7 +286,9 @@ void galois::GNNLayer::ActivationDerivative( // and act accordingly // keep gradient if the original output was greater than 0 galois::do_all( - galois::iterate(static_cast(0), gradient->size()), + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), [&](size_t i) { // it was <= 0 before; set back to 0 if (!activation_memo_.test(i)) { @@ -326,9 +329,9 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { #else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.size(); + size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; - assert((row_index * layer_dimensions_.input_rows) == input->size()); + assert((row_index * layer_dimensions_.input_rows) <= input->size()); galois::do_all( galois::iterate(start_node, end_node), 
[&](size_t non_master) { @@ -349,7 +352,7 @@ void galois::GNNLayer::MaskGradientNonMasters( #else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.size(); + size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; galois::do_all( galois::iterate(start_node, end_node), diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8a01c8fe1d..dfae86cbd2 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -125,10 +125,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( galois::StatTimer timer("ForwardPhase", kRegionName); timer.start(); - assert(input_embeddings.size() == + assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() >= (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); GNNFloat* agg_data; @@ -172,7 +173,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( Activation(); } - assert(p_forward_output_matrix_.size() == + assert(p_forward_output_matrix_.size() >= (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); timer.stop(); @@ -272,7 +273,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (layer_number_ != 0) { // ---unmasked--- // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == + assert(input_gradient->size() >= layer_dimensions_.input_rows * layer_dimensions_.output_columns); // pintemp1 contains (AF)' // overwrites the dropout matrix that was in ptemp1 (needed for second @@ -365,9 +366,14 @@ void galois::SAGELayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); + if (!IsSampledLayer()) { + gpu_object_.AggregateAllGPU( + graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, + aggregate_output, !config_.disable_normalization); + } else { + // TODO(hochan) + GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); + } graph_.AggregateSync(aggregate_output, column_length, layer_number_); } else { #endif @@ -385,10 +391,8 @@ void galois::SAGELayer::AggregateAllCPU( galois::substrate::PerThreadStorage>*, bool is_backward) { - size_t num_nodes = graph_.size(); - galois::do_all( - galois::iterate(static_cast(0), num_nodes), + galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { // TODO(loc) this is currently a hack: the sync substrate blows // up if not the entire bitset is set for sync call like in @@ -400,20 +404,14 @@ void galois::SAGELayer::AggregateAllCPU( aggregate_output[index_to_src_feature + i] = 0; } - if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - return; - } - - if (IsSampledLayer()) { - // check if node is part of sampled graph - if (!graph_.IsInSampledGraph(src)) { - return; - } - } - } + // if (layer_phase_ == GNNPhase::kTrain) { + // // XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) + // return; + // } + //} GNNFloat source_norm = 0.0; if 
(!config_.disable_normalization) { @@ -426,22 +424,19 @@ void galois::SAGELayer::AggregateAllCPU( e++) { // graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetEdgeDest(e); + // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - + //// XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + // return; + //} if (IsSampledLayer()) { if (!graph_.IsEdgeSampled(e, layer_number_)) { continue; } - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; } } @@ -475,20 +470,16 @@ void galois::SAGELayer::AggregateAllCPU( size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - + // XXX + // if (IsInductiveLayer()) { + // // if inductive, all non-training nodes do not exist + // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) + // return; + //} if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { continue; } - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; } } @@ -530,6 +521,9 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif + galois::gPrint(layer_dimensions_.input_rows, " ", + layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns, "\n"); // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -564,7 +558,7 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, galois::StatTimer timer("BackwardXForm", kRegionName); timer.start(); - assert(p_layer_weights_.size() == + assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -591,7 +585,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( galois::StatTimer timer("SelfBackwardXForm", kRegionName); timer.start(); - assert(p_layer_weights_.size() == + assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU // TODO gpu self diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 47d5f2ce0b..94523ce327 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,13 +8,13 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); handled.reset(); -#endif + //#endif galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -44,11 +44,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = GNNCrossEntropy(feature_length, ground_truth_vec->data(), &p_backward_output_matrix_[feature_length * i]); -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG loss_accum += 
input_loss_[i]; handled += 1; -#endif + //#endif } else { VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -57,12 +57,14 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); -#ifndef NDEBUG + //#ifndef NDEBUG //#ifdef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); - galois::gPrint("Loss is ", reduced_loss / t, "\n"); -#endif + galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); + + //#endif return p_backward_output_matrix_; } diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 66d739a6d7..89ed60d0ad 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,7 +15,7 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - graph.InitializeEdgeData(3); + graph.InitializeSamplingData(3); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { From 8674b9059fb43cb9ca828660a786f8da1bf1f3d8 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 26 Apr 2021 22:10:31 -0500 Subject: [PATCH 517/660] Implement split/combine communications based on Katana's implementation (#3) --- .../include/galois/graphs/DistributedGraph.h | 18 +- .../include/galois/graphs/MiningPartitioner.h | 31 +- libcusp/include/galois/graphs/NewGeneric.h | 66 ++--- libdist/include/galois/runtime/Network.h | 13 +- libdist/include/galois/runtime/Serialize.h | 192 +++++------- libdist/src/DistStats.cpp | 32 +- libdist/src/Network.cpp | 22 +- libdist/src/NetworkBuffered.cpp | 279 ++++++++---------- libdist/src/NetworkLCI.cpp | 6 +- .../galois/graphs/GluonEdgeSubstrate.h | 48 ++- .../include/galois/graphs/GluonSubstrate.h | 50 ++-- 11 files changed, 326 insertions(+), 431 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index e13f71e4d2..bf88a17acf 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -260,14 +260,14 @@ class DistGraph { for (unsigned d = 0; d < DecomposeFactor; ++d) { galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); } - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -330,14 +330,14 @@ class DistGraph { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; @@ -447,14 +447,14 @@ class DistGraph { continue; 
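Every point-to-point exchange in this patch moves to the new network API: sendTagged() now consumes its buffer (it takes a SendBuffer&&), and recieveTagged() takes only a tag, with no lock-guard argument. A minimal self-contained sketch of the resulting exchange pattern, assuming a NetworkInterface `net`, host count `numHosts`, local id `id`, and placeholder containers `payload`/`received` (these names are invented for the sketch, not taken from the patch):

    #include "galois/runtime/Network.h"
    #include "galois/runtime/Serialize.h"
    #include <cstdint>
    #include <vector>

    void ExchangeWithAllHosts(galois::runtime::NetworkInterface& net,
                              unsigned numHosts, unsigned id,
                              const std::vector<uint64_t>& payload,
                              std::vector<std::vector<uint64_t>>& received) {
      // `received` must have one slot per host; received[id] stays empty
      for (unsigned h = 0; h < numHosts; ++h) {
        if (h == id)
          continue;
        galois::runtime::SendBuffer b;
        galois::runtime::gSerialize(b, payload);
        // the buffer is moved into the network layer and must not be reused
        net.sendTagged(h, galois::runtime::evilPhase, std::move(b));
      }
      net.flush();
      // expect exactly one message from every other host
      for (unsigned got = 1; got < numHosts; ++got) {
        decltype(net.recieveTagged(galois::runtime::evilPhase)) p;
        do {
          p = net.recieveTagged(galois::runtime::evilPhase); // poll until a message arrives
        } while (!p);
        galois::runtime::gDeserialize(p->second, received[p->first]);
      }
      ++galois::runtime::evilPhase; // advance the phase so later rounds do not mix with this one
    }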
galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, gid2host[id]); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } net.flush(); unsigned received = 1; while (received < numHosts) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); assert(p->first != id); auto& b = p->second; diff --git a/libcusp/include/galois/graphs/MiningPartitioner.h b/libcusp/include/galois/graphs/MiningPartitioner.h index e49d16023e..c809c24dd0 100644 --- a/libcusp/include/galois/graphs/MiningPartitioner.h +++ b/libcusp/include/galois/graphs/MiningPartitioner.h @@ -540,15 +540,15 @@ class MiningGraph : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::SendBuffer bitsetBuffer; galois::runtime::gSerialize(bitsetBuffer, presentProxies); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // receive loop for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize proxiesOnOtherHosts @@ -653,8 +653,7 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( GRNAME, std::string("EdgeInspectionBytesSent"), bytesSent.reduce()); @@ -675,9 +674,9 @@ class MiningGraph : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1057,15 +1056,15 @@ class MiningGraph : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -1085,8 +1084,8 @@ class MiningGraph : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -1108,7 +1107,7 @@ class MiningGraph : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while 
(rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -1134,8 +1133,8 @@ class MiningGraph : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 710ba82996..6ece9e2c51 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -625,16 +625,16 @@ class NewDistGraphGeneric : public DistGraph { if (h != base_DistGraph::id) { galois::runtime::gSerialize(bitsetBuffer, syncNodes[h]); bytesSent += bitsetBuffer.size(); - net.sendTagged(h, galois::runtime::evilPhase, bitsetBuffer); + net.sendTagged(h, galois::runtime::evilPhase, std::move(bitsetBuffer)); } } // Step 5: recv bitset to other hosts; this indicates which local nodes each // other host needs to be informed of updates of for (unsigned h = 0; h < net.Num - 1; h++) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; // deserialize into neighbor bitsets @@ -724,7 +724,7 @@ class NewDistGraphGeneric : public DistGraph { // note the +1 on evil phase; load messages send using a different // phase to avoid conflicts - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } } sendTimer.stop(); @@ -744,13 +744,13 @@ class NewDistGraphGeneric : public DistGraph { std::vector& edgeLoads, galois::DynamicBitSet& loadsClear) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr)) p; + decltype(net.recieveTagged(base_DistGraph::evilPhasePlus1())) p; galois::StatTimer recvTimer("Phase0AsyncRecvLoadTime", GRNAME); recvTimer.start(); do { // note the +1 - p = net.recieveTagged(base_DistGraph::evilPhasePlus1(), nullptr); + p = net.recieveTagged(base_DistGraph::evilPhasePlus1()); if (p) { unsigned messageType = (unsigned)-1; @@ -945,13 +945,13 @@ class NewDistGraphGeneric : public DistGraph { galois::runtime::gSerialize(b, mastersToSend); } bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } else { // send empty no-op message, tag 0 galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, 0u); bytesSent += b.size(); - net.sendTagged(targetHost, galois::runtime::evilPhase, b); + net.sendTagged(targetHost, galois::runtime::evilPhase, std::move(b)); } sendOffsetsTimer.stop(); @@ -1020,9 +1020,9 @@ class NewDistGraphGeneric : public DistGraph { bytesSent += b.size(); // assumes phase is 0 or 1 if (phase == 1) { - net.sendTagged(h, base_DistGraph::evilPhasePlus1(), b); + net.sendTagged(h, base_DistGraph::evilPhasePlus1(), std::move(b)); } else if (phase == 0) { - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } else { GALOIS_DIE("unexpected phase: ", phase); } @@ 
-1067,9 +1067,9 @@ class NewDistGraphGeneric : public DistGraph { std::vector& receivedMasters) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -1109,11 +1109,11 @@ class NewDistGraphGeneric : public DistGraph { std::unordered_map& gid2offsets, galois::DynamicBitSet& hostFinished) { auto& net = galois::runtime::getSystemNetworkInterface(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; // repeat loop until no message do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); if (p) { uint32_t sendingHost = p->first; unsigned messageType = (unsigned)-1; @@ -2131,8 +2131,7 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); // send buffer and free memory - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } galois::runtime::reportStat_Tsum( @@ -2156,9 +2155,9 @@ class NewDistGraphGeneric : public DistGraph { for (unsigned h = 0; h < net.Num - 1; h++) { // expect data from comm partner back - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint32_t sendingHost = p->first; @@ -2739,16 +2738,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2771,8 +2769,8 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); + sendBuffer = galois::runtime::SerializeBuffer(); } } } @@ -2885,16 +2883,15 @@ class NewDistGraphGeneric : public DistGraph { bytesSent.update(b.size()); maxBytesSent.update(b.size()); - net.sendTagged(h, galois::runtime::evilPhase, b); - b.getVec().clear(); - b.getVec().reserve(edgePartitionSendBufSize * 1.25); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + b = galois::runtime::SerializeBuffer(); + b.reserve(edgePartitionSendBufSize * 1.25); } } } // overlap receives - auto buffer = - net.recieveTagged(galois::runtime::evilPhase, nullptr); + auto buffer = net.recieveTagged(galois::runtime::evilPhase); this->processReceivedEdgeBuffer(buffer, graph, receivedNodes); }, #if MORE_DIST_STATS @@ -2917,8 +2914,7 @@ class NewDistGraphGeneric : public DistGraph { 
bytesSent.update(sendBuffer.size()); maxBytesSent.update(sendBuffer.size()); - net.sendTagged(h, galois::runtime::evilPhase, sendBuffer); - sendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, std::move(sendBuffer)); } } } @@ -2940,7 +2936,7 @@ class NewDistGraphGeneric : public DistGraph { GraphTy& graph, std::atomic& receivedNodes) { if (buffer) { auto& rb = buffer->second; - while (rb.r_size() > 0) { + while (rb.size() > 0) { uint64_t n; std::vector gdst_vec; galois::runtime::gDeserialize(rb, n); @@ -2966,8 +2962,8 @@ class NewDistGraphGeneric : public DistGraph { // receive edges for all mirror nodes while (receivedNodes < nodesToReceive) { - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + p = net.recieveTagged(galois::runtime::evilPhase); processReceivedEdgeBuffer(p, graph, receivedNodes); } } diff --git a/libdist/include/galois/runtime/Network.h b/libdist/include/galois/runtime/Network.h index e4695c0c2b..1560b20914 100644 --- a/libdist/include/galois/runtime/Network.h +++ b/libdist/include/galois/runtime/Network.h @@ -109,7 +109,7 @@ class NetworkInterface { //! tag (tag) and some data (buf) //! on the receiver, buf will be returned on a receiveTagged(tag) //! buf is invalidated by this operation - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int type = 0) = 0; //! Send a message to all hosts. A message is simply a @@ -123,9 +123,6 @@ class NetworkInterface { template void broadcastSimple(void (*recv)(uint32_t, Args...), Args... param); - //! Receive and dispatch messages - void handleReceives(); - //! Wrapper to reset the mem usage tracker's stats inline void resetMemUsage() { memUsageTracker.resetMemUsage(); } @@ -134,8 +131,7 @@ class NetworkInterface { //! Receive a tagged message virtual std::optional> - recieveTagged(uint32_t tag, std::unique_lock* rlg, - int type = 0) = 0; + recieveTagged(uint32_t tag, int type = 0) = 0; //! move send buffers out to network virtual void flush() = 0; @@ -195,9 +191,6 @@ NetworkInterface& makeNetworkLCI(); //! @warning Should not be called within a parallel region; assumes only one //! thread is calling it substrate::Barrier& getHostBarrier(); -//! Returns a fence that ensures all pending messages are delivered, acting -//! like a memory-barrier -substrate::Barrier& getHostFence(); //////////////////////////////////////////////////////////////////////////////// // Implementations @@ -220,7 +213,7 @@ void NetworkInterface::sendSimple(uint32_t dest, SendBuffer buf; gSerialize(buf, (uintptr_t)recv, param..., (uintptr_t)genericLandingPad); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } template diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 94517e34ca..8110c954e9 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -49,30 +49,48 @@ namespace galois { namespace runtime { +struct BufferHeader { + enum class BufferType { kSingleMessage, kMultipleMessages, kPartialMessage }; + BufferType type{BufferType::kSingleMessage}; + uint8_t num_segments{1}; + uint8_t segment_id{0}; + uint8_t segment_tag{0}; +}; + class DeSerializeBuffer; // forward declaration for friend declaration /** * Buffer for serialization of data. Mainly used during network communication. 
*/ class SerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + //! Access to a deserialize buffer friend DeSerializeBuffer; //! type of data buffer // using vTy = std::vector; - using vTy = galois::PODResizeableArray; + using vTy = galois::PODResizeableArray; + using size_type = vTy::size_type; + //! the actual data stored in this buffer vTy bufdata; public: //! default constructor - SerializeBuffer() = default; + SerializeBuffer() { + BufferHeader header; + insert(reinterpret_cast(&header), kHeaderSize); + } + //! disabled copy constructor SerializeBuffer(SerializeBuffer&& rhs) = default; - //! Creates a buffer from another buffer - //! @param d buffer to create from - //! @param len amount of copy from buffer d - SerializeBuffer(const char* d, unsigned len) : bufdata(d, d + len) {} + + SerializeBuffer& operator=(SerializeBuffer&& rhs) { + auto buf = std::move(rhs); + bufdata = std::move(buf.get()); + return *this; + } //! Push a character onto the serialize buffer inline void push(const char c) { bufdata.push_back(c); } @@ -87,25 +105,19 @@ class SerializeBuffer { //! Insert characters from a buffer into the serialize buffer at a particular //! offset void insertAt(const uint8_t* c, size_t bytes, size_t offset) { + offset += kHeaderSize; + assert((offset + bytes) <= bufdata.size()); if (bytes > 0) { std::copy_n(c, bytes, bufdata.begin() + offset); } } - /** - * Reserve space at the end for inserting new data into the serialize - * buffer - * - * @param bytes number of bytes to reserve at the end - * @returns offset to the end of the buffer before new space was reserved - */ - size_t encomber(size_t bytes) { - size_t retval = bufdata.size(); - bufdata.resize(retval + bytes); - return retval; - } + //! Returns an iterator to the beginning of the data in this serialize buffer + vTy::const_iterator begin() const { return bufdata.cbegin(); } + //! Returns an iterator to the end of the data in this serialize buffer + vTy::const_iterator end() const { return bufdata.cend(); } - void resize(size_t bytes) { bufdata.resize(bytes); } + void resize(size_t bytes) { bufdata.resize(kHeaderSize + bytes); } /** * Reserve more space in the serialize buffer. @@ -115,34 +127,17 @@ class SerializeBuffer { void reserve(size_t s) { bufdata.reserve(bufdata.size() + s); } //! Returns a pointer to the data stored in this serialize buffer - const uint8_t* linearData() const { return bufdata.data(); } + const uint8_t* linearData() const { return bufdata.data() + kHeaderSize; } //! Returns vector of data stored in this serialize buffer - vTy& getVec() { return bufdata; } - - //! Returns an iterator to the beginning of the data in this serialize buffer - vTy::const_iterator begin() const { return bufdata.cbegin(); } - //! Returns an iterator to the end of the data in this serialize buffer - vTy::const_iterator end() const { return bufdata.cend(); } + vTy& get() { return bufdata; } - using size_type = vTy::size_type; + //! Get a pointer to the remaining data of the deserialize buffer + //! (as determined by offset) + const uint8_t* data() const { return bufdata.data() + kHeaderSize; } + uint8_t* data() { return bufdata.data() + kHeaderSize; } //! Returns the size of the serialize buffer - size_type size() const { return bufdata.size(); } - - //! Utility print function for the serialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{" << std::hex; - for (auto& i : bufdata) - o << (unsigned int)i << " "; - o << std::dec << "}>"; - } - - //! 
Operator that calls the print function of the serialize buffer - friend std::ostream& operator<<(std::ostream& os, const SerializeBuffer& b) { - b.print(os); - return os; - } + size_type size() const { return bufdata.size() - kHeaderSize; } }; /** @@ -150,50 +145,54 @@ class SerializeBuffer { * communication. */ class DeSerializeBuffer { + static constexpr size_t kHeaderSize = sizeof(BufferHeader); //! Access to serialize buffer friend SerializeBuffer; //! type of data buffer // using vTy = std::vector; using vTy = galois::PODResizeableArray; //! the actual data stored in this buffer - vTy bufdata; - int offset; + vTy bufdata{kHeaderSize}; + size_t offset{kHeaderSize}; public: //! Constructor initializes offset into buffer to 0 - DeSerializeBuffer() : offset(0) {} + DeSerializeBuffer() : offset(kHeaderSize) {} //! Disable copy constructor DeSerializeBuffer(DeSerializeBuffer&&) = default; //! Move constructor //! @param v vector to act as deserialize buffer //! @param start offset to start saving data into DeSerializeBuffer(vTy&& v, uint32_t start = 0) - : bufdata(std::move(v)), offset(start) {} + : bufdata(std::move(v)), offset(start + kHeaderSize) { + assert(bufdata.size() >= offset); + } //! Constructor that takes an existing vector to use as the deserialize //! buffer explicit DeSerializeBuffer(vTy& data) { bufdata.swap(data); - offset = 0; + offset = kHeaderSize; } /** * Initializes the deserialize buffer with a certain size * @param [in] count size to initialize buffer to */ - explicit DeSerializeBuffer(int count) : bufdata(count), offset(0) {} + explicit DeSerializeBuffer(int count) + : bufdata(count + kHeaderSize), offset(kHeaderSize) {} /** * Initializes the deserialize buffer using vector initialization from * 2 iterators. */ template - DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{0} {} + DeSerializeBuffer(Iter b, Iter e) : bufdata(b, e), offset{kHeaderSize} {} /** * Initialize a deserialize buffer from a serialize buffer */ - explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(0) { + explicit DeSerializeBuffer(SerializeBuffer&& buf) : offset(kHeaderSize) { bufdata.swap(buf.bufdata); } @@ -207,31 +206,15 @@ class DeSerializeBuffer { * @param count new size of buffer */ void reset(int count) { - offset = 0; - bufdata.resize(count); - } - - //! Gets the current offset into the deserialize buffer - unsigned getOffset() const { return offset; } - //! Sets the offset into the deserialize buffer - void setOffset(unsigned off) { - assert(off <= size()); - offset = off; + offset = kHeaderSize; + bufdata.resize(count + kHeaderSize); } - //! Gets the size of the deserialize buffer - unsigned size() const { return bufdata.size(); } - - //! Returns true if the deserialize buffer is empty - //! @returns true if the deserialize buffer is empty - bool empty() const { return bufdata.empty(); } - //! Get the next character in the deserialize buffer unsigned char pop() { return bufdata.at(offset++); } - //! Clears the last x bytes of the deserialize buffer, resizing it as well - //! @param x How many bytes from the end to clear - void pop_back(unsigned x) { bufdata.resize(bufdata.size() - x); } + //! 
Gets the size of the deserialize buffer + unsigned size() const { return bufdata.size() - offset; } /** * Extracts a certain amount of data from the deserialize buffer @@ -240,6 +223,8 @@ class DeSerializeBuffer { * @param num Amount of data to get from deserialize buffer */ void extract(uint8_t* dst, size_t num) { + assert(offset >= kHeaderSize); + assert((offset + num) <= bufdata.size()); if (num > 0) { std::copy_n(&bufdata[offset], num, dst); offset += num; @@ -248,37 +233,13 @@ class DeSerializeBuffer { //! Get the underlying vector storing the data of the deserialize //! buffer - vTy& getVec() { return bufdata; } + vTy& get() { return bufdata; } //! Get a pointer to the underlying data of the deserialize buffer - void* linearData() { return &bufdata[0]; } + void* linearData() { return &bufdata[offset]; } - //! Get a pointer to the remaining data of the deserialize buffer - //! (as determined by offset) - const uint8_t* r_linearData() const { return &bufdata[offset]; } - //! Get the remaining size of the deserialize buffer (as determined - //! by offset) - size_t r_size() const { return bufdata.size() - offset; } - - //! Checks if the current location in the deserialize buffer is aligned - //! to some size a - bool atAlignment(size_t a) { return (uintptr_t)r_linearData() % a == 0; } - - //! Utility print of deserialize buffer - //! @param o stream to print to - void print(std::ostream& o) const { - o << "<{(" << offset << ") " << std::hex; - for (auto ii = bufdata.begin(), ee = bufdata.end(); ii != ee; ++ii) - o << (unsigned int)*ii << " "; - o << std::dec << "}>"; - } - - //! Operator for printing deserialize buffer - friend std::ostream& operator<<(std::ostream& os, - const DeSerializeBuffer& buf) { - buf.print(os); - return os; - } + const uint8_t* data() const { return &bufdata[offset]; } + uint8_t* data() { return &bufdata[offset]; } }; namespace internal { @@ -411,7 +372,7 @@ inline size_t gSizedObj(const SerializeBuffer& data) { return data.size(); } * * @returns size of the deserialize buffer passed into it */ -inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.r_size(); } +inline size_t gSizedObj(const DeSerializeBuffer& rbuf) { return rbuf.size(); } /** * Returns the size of the passed in insert bag. 
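Taken together, the SerializeBuffer and DeSerializeBuffer changes above establish one invariant: the first sizeof(BufferHeader) bytes of every buffer belong to the network layer, and size()/data() report only the user payload that follows it. A small illustrative sketch of that invariant, relying only on the members shown in this hunk:

    galois::runtime::SerializeBuffer b;   // constructor writes a default header up front
    assert(b.size() == 0);                // size() excludes the header bytes
    uint32_t value = 42;
    galois::runtime::gSerialize(b, value);
    assert(b.size() == sizeof(uint32_t)); // payload only...
    assert(b.get().size() ==              // ...while the raw vector still holds the header
           sizeof(galois::runtime::BufferHeader) + sizeof(uint32_t));

    // A DeSerializeBuffer built from it starts reading just past the header:
    galois::runtime::DeSerializeBuffer r(std::move(b));
    uint32_t out = 0;
    galois::runtime::gDeserialize(r, out);
    assert(out == 42);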
@@ -682,7 +643,7 @@ inline void gSerializeObj(SerializeBuffer& buf, * @param [in] data serialize buffer to get data from */ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { - buf.insert(data.linearData(), data.size()); + buf.insert(data.data(), data.size()); } /** @@ -693,7 +654,7 @@ inline void gSerializeObj(SerializeBuffer& buf, const SerializeBuffer& data) { */ inline void gSerializeObj(SerializeBuffer& buf, const DeSerializeBuffer& rbuf) { // buf.reserve(rbuf.r_size()); - buf.insert(rbuf.r_linearData(), rbuf.r_size()); + buf.insert(rbuf.data(), rbuf.size()); } /** @@ -757,8 +718,10 @@ gSerializeLazySeq(SerializeBuffer& buf, unsigned num, Seq*) { "Not POD Sequence"); typename Seq::size_type size = num; internal::gSerializeObj(buf, size); - size_t tsize = sizeof(typename Seq::value_type); - return LazyRef{buf.encomber(tsize * num)}; + size_t tsize = sizeof(typename Seq::value_type); + size_t cur_size = buf.size(); + buf.resize(cur_size + (tsize * num)); + return LazyRef{cur_size}; } /** @@ -980,18 +943,10 @@ void gDeserializeSeq(DeSerializeBuffer& buf, Seq& seq) { template void gDeserializeLinearSeq(DeSerializeBuffer& buf, Seq& seq) { typedef typename Seq::value_type T; - // seq.clear(); typename Seq::size_type size; gDeserializeObj(buf, size); - // If the alignment is right, cast to a T array and insert - if (buf.atAlignment(alignof(T))) { - T* src = (T*)buf.r_linearData(); - seq.assign(src, &src[size]); - buf.setOffset(buf.getOffset() + size * sizeof(T)); - } else { - seq.resize(size); - buf.extract((uint8_t*)seq.data(), size * sizeof(T)); - } + seq.resize(size); + buf.extract((uint8_t*)seq.data(), size * sizeof(T)); } /** @@ -1025,7 +980,7 @@ template void gDeserializeObj(DeSerializeBuffer& buf, galois::BufferWrapper& bf) { if (is_memory_copyable::value) { // manual deserialization here - size_t buffer_size; + size_t buffer_size{0}; gDeserializeObj(buf, buffer_size); bf.resize(buffer_size); buf.extract((uint8_t*)bf.get_vec_data(), buffer_size * sizeof(T)); @@ -1097,9 +1052,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index 8faf4cee5a..e8399451f3 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -105,8 +105,8 @@ void DistStatManager::combineAtHost_0_helper(void) { SendBuffer b; gSerialize(b, hTotalMap.region(i), hTotalMap.category(i), hTotalMap.stat(i).totalTy()); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -126,8 +126,8 @@ void DistStatManager::combineAtHost_0_helper(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -151,8 +151,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - 
getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } @@ -172,8 +172,8 @@ void DistStatManager::combineAtHost_0_helper2(void) { } else { SendBuffer b; gSerialize(b, ln, cat, thrdTotal, totalTy, thrdVals); - getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, b, - syncTypePhase); + getSystemNetworkInterface().sendTagged(0, galois::runtime::evilPhase, + std::move(b), syncTypePhase); } } } @@ -182,10 +182,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { RecvBuffer& b = p->second; @@ -203,10 +203,10 @@ void DistStatManager::receiveAtHost_0_helper(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -230,10 +230,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { size_t syncTypePhase = 0; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; @@ -255,10 +255,10 @@ void DistStatManager::receiveAtHost_0_helper2(void) { ++syncTypePhase; { decltype(getSystemNetworkInterface().recieveTagged( - galois::runtime::evilPhase, nullptr, syncTypePhase)) p; + galois::runtime::evilPhase, syncTypePhase)) p; do { p = getSystemNetworkInterface().recieveTagged(galois::runtime::evilPhase, - nullptr, syncTypePhase); + syncTypePhase); if (p) { uint32_t hostID = p->first; diff --git a/libdist/src/Network.cpp b/libdist/src/Network.cpp index 44a189f7ad..7bf499a00b 100644 --- a/libdist/src/Network.cpp +++ b/libdist/src/Network.cpp @@ -93,7 +93,7 @@ void NetworkInterface::sendMsg(uint32_t dest, void (*recv)(uint32_t, RecvBuffer&), SendBuffer& buf) { gSerialize(buf, recv); - sendTagged(dest, 0, buf); + sendTagged(dest, 0, std::move(buf)); } void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), @@ -104,30 +104,14 @@ void NetworkInterface::broadcast(void (*recv)(uint32_t, RecvBuffer&), if (x != ID) { SendBuffer b; gSerialize(b, fp, buf, (uintptr_t)&bcastLandingPad); - sendTagged(x, 0, b); + sendTagged(x, 0, std::move(b)); } else if (self) { - RecvBuffer rb(buf.begin(), buf.end()); + RecvBuffer rb = RecvBuffer(std::move(buf.get())); recv(ID, rb); } } } -void NetworkInterface::handleReceives() { - std::unique_lock lg; - auto opt = recieveTagged(0, &lg); - while (opt) { - uint32_t src = std::get<0>(*opt); - RecvBuffer& buf = std::get<1>(*opt); - uintptr_t fp = 0; - gDeserializeRaw(buf.r_linearData() + buf.r_size() - sizeof(uintptr_t), fp); - buf.pop_back(sizeof(uintptr_t)); - assert(fp); - auto f = (void (*)(uint32_t, RecvBuffer&))fp; - f(src, buf); - opt = recieveTagged(0, &lg); - } -} - NetworkInterface& galois::runtime::getSystemNetworkInterface() { #ifndef 
GALOIS_USE_LCI return makeNetworkBuffered(); diff --git a/libdist/src/NetworkBuffered.cpp b/libdist/src/NetworkBuffered.cpp index 7b6d6c6ce1..a58f16c3ab 100644 --- a/libdist/src/NetworkBuffered.cpp +++ b/libdist/src/NetworkBuffered.cpp @@ -67,6 +67,12 @@ class NetworkInterfaceBuffered : public NetworkInterface { // using vTy = std::vector; using vTy = galois::PODResizeableArray; + static constexpr size_t kHeaderSize = sizeof(BufferHeader); + static constexpr uint8_t kMaxSegmentTag = std::numeric_limits::max(); + static constexpr size_t kMaxBufferSize = + static_cast(std::numeric_limits::max()); + static constexpr size_t kMaxDataSize = kMaxBufferSize - kHeaderSize; + /** * Receive buffers for the buffered network interface */ @@ -77,6 +83,38 @@ class NetworkInterfaceBuffered : public NetworkInterface { // tag of head of queue std::atomic dataPresent; + struct PartialMessages { + uint8_t num_segments{0}; + std::vector segments; + }; + std::unordered_map partial_messages_map_; + + std::optional CombinePartialMessages(const BufferHeader& header, + vTy&& vec) { + auto& partial_messages = partial_messages_map_[header.segment_tag]; + if (partial_messages.num_segments == 0) { + partial_messages.segments.resize(header.num_segments); + } + + partial_messages.segments[header.segment_id] = std::move(vec); + ++partial_messages.num_segments; + + if (partial_messages.num_segments != header.num_segments) { + assert(partial_messages.num_segments < header.num_segments); + assert(partial_messages.segments.size() == header.num_segments); + return std::nullopt; + } + + std::vector& segments = partial_messages.segments; + vTy message = std::move(segments[0]); + for (size_t i = 1, end = segments.size(); i < end; ++i) { + message.insert(message.end(), segments[i].begin() + kHeaderSize, + segments[i].end()); + } + partial_messages_map_.erase(header.segment_tag); + return std::make_optional(std::move(message)); + } + bool sizeAtLeast(size_t n, uint32_t tag) { size_t tot = -frontOffset; for (auto& v : data) { @@ -163,30 +201,6 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::optional popMsg(uint32_t tag, std::atomic& inflightRecvs) { std::lock_guard lg(qlock); -#ifndef NO_AGG - uint32_t len = getLenFromFront(tag); - // assert(len); - if (len == ~0U || len == 0) - return std::optional(); - if (!sizeAtLeast(sizeof(uint32_t) + len, tag)) - return std::optional(); - erase(4, inflightRecvs); - - // Try just using the buffer - if (auto r = popVec(len, inflightRecvs)) { - auto start = r->size() - len; - // std::cerr << "FP " << r->size() << " " << len << " " << start - // << "\n"; - return std::optional(RecvBuffer(std::move(*r), start)); - } - - RecvBuffer buf(len); - // FIXME: This is slows things down 25% - copyOut((char*)buf.linearData(), len); - erase(len, inflightRecvs); - // std::cerr << "p " << tag << " " << len << "\n"; - return std::optional(std::move(buf)); -#else if (data.empty() || data.front().tag != tag) return std::optional(); @@ -201,31 +215,28 @@ class NetworkInterfaceBuffered : public NetworkInterface { } return std::optional(RecvBuffer(std::move(vec), 0)); -#endif } // Worker thread interface - void add(NetworkIO::message m) { + bool add(NetworkIO::message m) { + BufferHeader* header = reinterpret_cast(m.data.data()); + if (header->type == BufferHeader::BufferType::kPartialMessage) { + std::optional segment = + CombinePartialMessages(*header, std::move(m.data)); + if (!segment) { + return false; + } + + m.data = std::move(*segment); + } std::lock_guard lg(qlock); if (data.empty()) { 
galois::runtime::trace("ADD LATEST ", m.tag); dataPresent = m.tag; } - // std::cerr << m.data.size() << " " << - // std::count(m.data.begin(), m.data.end(), 0) << "\n"; - // for (auto x : m.data) { - // std::cerr << (int) x << " "; - // } - // std::cerr << "\n"; - // std::cerr << "A " << m.host << " " << m.tag << " " << m.data.size() << - // "\n"; - data.push_back(std::move(m)); - - assert(data.back().data.size() != - (unsigned int)std::count(data.back().data.begin(), - data.back().data.end(), 0)); + return true; } bool hasData(uint32_t tag) { return dataPresent == tag; } @@ -245,7 +256,7 @@ class NetworkInterfaceBuffered : public NetworkInterface { struct msg { uint32_t tag; vTy data; - msg(uint32_t t, vTy& _data) : tag(t), data(std::move(_data)) {} + msg(uint32_t t, vTy&& _data) : tag(t), data(std::move(_data)) {} }; std::deque messages; @@ -254,6 +265,43 @@ class NetworkInterfaceBuffered : public NetworkInterface { //! @todo FIXME track time since some epoch in an atomic. std::chrono::high_resolution_clock::time_point time; SimpleLock lock, timelock; + uint8_t segment_tag_{0}; + + void IncrementSegmentTag() { + if (segment_tag_ == kMaxSegmentTag) { + segment_tag_ = 0; + } else { + ++segment_tag_; + } + } + + std::vector Split(uint32_t host, uint32_t tag, + vTy&& vec) { + std::vector segments; + segments.emplace_back(std::move(vec)); + auto begin = segments[0].begin(); + for (size_t i = kMaxBufferSize, end = segments[0].size(); i < end; + i += kMaxDataSize) { + vTy segment(kHeaderSize); + size_t segment_end = std::min(end, i + kMaxDataSize); + segment.insert(segment.end(), begin + i, begin + segment_end); + segments.emplace_back(std::move(segment)); + } + segments[0].resize(kMaxBufferSize); + + std::vector msg; + for (size_t i = 0; i < segments.size(); ++i) { + auto& segment = segments[i]; + BufferHeader* header = reinterpret_cast(segment.data()); + header->type = BufferHeader::BufferType::kPartialMessage; + header->num_segments = segments.size(); + header->segment_id = i; + header->segment_tag = segment_tag_; + msg.emplace_back(host, tag, std::move(segment)); + } + IncrementSegmentTag(); + return msg; + } public: unsigned long statSendTimeout; @@ -269,103 +317,35 @@ class NetworkInterfaceBuffered : public NetworkInterface { } } - bool ready() { -#ifndef NO_AGG - if (numBytes == 0) - return false; - if (urgent) { - ++statSendUrgent; - return true; - } - if (numBytes > COMM_MIN) { - ++statSendOverflow; - return true; - } - auto n = std::chrono::high_resolution_clock::now(); - decltype(n) mytime; - { - std::lock_guard lg(timelock); - mytime = time; - } - auto elapsed = - std::chrono::duration_cast(n - mytime); - if (elapsed.count() > COMM_DELAY) { - ++statSendTimeout; - return true; - } - return false; -#else - return messages.size() > 0; -#endif - } + bool ready() { return messages.size() > 0; } - std::pair - assemble(std::atomic& GALOIS_UNUSED(inflightSends)) { + std::vector assemble(uint32_t host) { std::unique_lock lg(lock); - if (messages.empty()) - return std::make_pair(~0, vTy()); -#ifndef NO_AGG - // compute message size - uint32_t len = 0; - int num = 0; - uint32_t tag = messages.front().tag; - for (auto& m : messages) { - if (m.tag != tag) { - break; - } else { - // do not let it go over the integer limit because MPI_Isend cannot - // deal with it - if ((m.data.size() + sizeof(uint32_t) + len + num) > - static_cast(std::numeric_limits::max())) { - break; - } - len += m.data.size(); - num += sizeof(uint32_t); - } - } - lg.unlock(); - // construct message - vTy vec; - 
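Split() exists because a single wire message is capped at kMaxBufferSize bytes (the same 32-bit limit the old aggregation path worked around), so an oversized serialized buffer is cut into segments that CombinePartialMessages() reassembles on the receiver. The arithmetic is easier to see in isolation; a minimal sketch, assuming the kHeaderSize/kMaxBufferSize/kMaxDataSize constants defined earlier in this file (NumSegments is a name made up for the sketch):

    // Number of wire messages Split() produces for a serialized buffer of
    // `total` bytes (header included). Segment 0 keeps the original header and
    // the first kMaxBufferSize bytes; every later segment carries a fresh
    // kHeaderSize header plus at most kMaxDataSize payload bytes.
    size_t NumSegments(size_t total) {
      if (total <= kMaxBufferSize)
        return 1;
      size_t overflow = total - kMaxBufferSize;
      return 1 + (overflow + kMaxDataSize - 1) / kMaxDataSize; // ceiling division
    }

    // On the receive side, CombinePartialMessages() appends segments 1..n-1
    // with their headers stripped, so the reassembled size is
    //   kMaxBufferSize + sum_i (segment_i.size() - kHeaderSize) == total.
    // Since BufferHeader::num_segments is a uint8_t, one logical message can
    // span at most 255 segments under this scheme.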
vec.reserve(len + num); - // go out of our way to avoid locking out senders when making messages - lg.lock(); - do { - auto& m = messages.front(); - lg.unlock(); - union { - uint32_t a; - uint8_t b[sizeof(uint32_t)]; - } foo; - foo.a = m.data.size(); - vec.insert(vec.end(), &foo.b[0], &foo.b[sizeof(uint32_t)]); - vec.insert(vec.end(), m.data.begin(), m.data.end()); - if (urgent) - --urgent; - lg.lock(); - messages.pop_front(); - --inflightSends; - } while (vec.size() < len + num); - ++inflightSends; - numBytes -= len; -#else + assert(!messages.empty()); uint32_t tag = messages.front().tag; vTy vec(std::move(messages.front().data)); messages.pop_front(); -#endif - return std::make_pair(tag, std::move(vec)); + + if (vec.size() > kMaxBufferSize) { + return Split(host, tag, std::move(vec)); + } + + BufferHeader* header = reinterpret_cast(vec.data()); + header->type = BufferHeader::BufferType::kSingleMessage; + std::vector msgs; + msgs.emplace_back(host, tag, std::move(vec)); + return msgs; } - void add(uint32_t tag, vTy& b) { + void add(uint32_t tag, vTy&& b) { std::lock_guard lg(lock); if (messages.empty()) { std::lock_guard lg(timelock); time = std::chrono::high_resolution_clock::now(); } - unsigned oldNumBytes = numBytes; + assert(b.size() >= kHeaderSize); numBytes += b.size(); - galois::runtime::trace("BufferedAdd", oldNumBytes, numBytes, tag, - galois::runtime::printVec(b)); - messages.emplace_back(tag, b); + messages.emplace_back(tag, std::move(b)); } }; // end send buffer class @@ -402,24 +382,26 @@ class NetworkInterfaceBuffered : public NetworkInterface { // handle send queue i auto& sd = sendData[i]; if (sd.ready()) { - NetworkIO::message msg; - msg.host = i; - std::tie(msg.tag, msg.data) = sd.assemble(inflightSends); - galois::runtime::trace("BufferedSending", msg.host, msg.tag, - galois::runtime::printVec(msg.data)); - ++statSendEnqueued; - netio->enqueue(std::move(msg)); + std::vector msgs = sd.assemble(i); + if (msgs.size() > 1) { + inflightSends += msgs.size() - 1; + } + + for (auto& msg : msgs) { + ++statSendEnqueued; + netio->enqueue(std::move(msg)); + } } + // handle receive NetworkIO::message rdata = netio->dequeue(); if (rdata.data.size()) { ++statRecvDequeued; - assert(rdata.data.size() != - (unsigned int)std::count(rdata.data.begin(), rdata.data.end(), - 0)); - galois::runtime::trace("BufferedRecieving", rdata.host, rdata.tag, - galois::runtime::printVec(rdata.data)); - recvData[rdata.host].add(std::move(rdata)); + uint32_t h = rdata.host; + bool not_partial_segment = recvData[h].add(std::move(rdata)); + if (!not_partial_segment) { + --inflightRecvs; + } } } } @@ -454,22 +436,19 @@ class NetworkInterfaceBuffered : public NetworkInterface { std::unique_ptr netio; - virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer& buf, + virtual void sendTagged(uint32_t dest, uint32_t tag, SendBuffer&& buf, int phase) { - ++inflightSends; tag += phase; statSendNum += 1; - statSendBytes += buf.size(); - galois::runtime::trace("sendTagged", dest, tag, - galois::runtime::printVec(buf.getVec())); + statSendBytes += buf.size() + kHeaderSize; + memUsageTracker.incrementMemUsage(buf.size() + kHeaderSize); + ++inflightSends; auto& sd = sendData[dest]; - sd.add(tag, buf.getVec()); + sd.add(tag, std::move(buf.get())); } virtual std::optional> - recieveTagged(uint32_t tag, - std::unique_lock* rlg, - int phase) { + recieveTagged(uint32_t tag, int phase) { tag += phase; for (unsigned h = 0; h < recvData.size(); ++h) { auto& rq = recvData[h]; @@ -480,12 +459,8 @@ class 
NetworkInterfaceBuffered : public NetworkInterface { auto buf = rq.popMsg(tag, inflightRecvs); if (buf) { ++statRecvNum; - statRecvBytes += buf->size(); - memUsageTracker.decrementMemUsage(buf->size()); - if (rlg) - *rlg = std::move(lg); - galois::runtime::trace("recvTagged", h, tag, - galois::runtime::printVec(buf->getVec())); + statRecvBytes += buf->size() + kHeaderSize; + memUsageTracker.decrementMemUsage(buf->size() + kHeaderSize); anyReceivedMessages = true; return std::optional>( std::make_pair(h, std::move(*buf))); diff --git a/libdist/src/NetworkLCI.cpp b/libdist/src/NetworkLCI.cpp index 59b17a1d35..3770356c8c 100644 --- a/libdist/src/NetworkLCI.cpp +++ b/libdist/src/NetworkLCI.cpp @@ -182,8 +182,8 @@ class NetworkInterfaceLCI : public NetworkInterface { statSendBytes += buf.size(); // int count = 0; #ifndef GALOIS_SUPPORT_ASYNC - if (buf.getVec().size() < 8192) { - while (lc_sendm(buf.getVec().data(), buf.getVec().size(), dest, tag, + if (buf.get().size() < 8192) { + while (lc_sendm(buf.get().data(), buf.get().size(), dest, tag, lc_p2p_ep[phase]) != LC_OK) { sched_yield(); } @@ -191,7 +191,7 @@ class NetworkInterfaceLCI : public NetworkInterface { #endif { pendingReq* msg = - new pendingReq(dest, tag, phase, buf.getVec(), inflightSends); + new pendingReq(dest, tag, phase, buf.get(), inflightSends); while (lc_sendl(msg->buf.data(), msg->buf.size(), dest, tag, lc_p2p_ep[phase], free_req, msg) != LC_OK) { sched_yield(); diff --git a/libgluon/include/galois/graphs/GluonEdgeSubstrate.h b/libgluon/include/galois/graphs/GluonEdgeSubstrate.h index 7e39a5b7c0..7342c9a57e 100644 --- a/libgluon/include/galois/graphs/GluonEdgeSubstrate.h +++ b/libgluon/include/galois/graphs/GluonEdgeSubstrate.h @@ -133,7 +133,7 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, mirrorEdges[x]); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive the mirror edges @@ -141,9 +141,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); galois::runtime::gDeserialize(p->second, masterEdges[p->first]); @@ -169,7 +169,7 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, totalMirrorEdges, totalOwnedEdges); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive @@ -177,9 +177,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint64_t totalMirrorFromOther; @@ -1097,9 +1097,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { template inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data()); + return FnTy::extract_reset_batch(x, b.data()); } else { - return FnTy::extract_batch(x, b.getVec().data()); + return 
FnTy::extract_batch(x, b.data()); } } @@ -1125,9 +1125,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b, size_t& s, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_reset_batch(x, b.data(), &s, &data_mode); } else { - return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_batch(x, b.data(), &s, &data_mode); } } @@ -1243,12 +1243,12 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { template inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_batch(x, b.data()); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_mirror_batch(x, b.data()); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::setVal_batch(x, b.data()); } } } @@ -1273,15 +1273,12 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_batch(x, b.data(), data_mode); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_mirror_batch(x, b.data(), data_mode); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::setVal_batch(x, b.data(), data_mode); } } } @@ -1723,7 +1720,8 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) syncTypePhase = 1; - net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b), + syncTypePhase); ++numMessages; } } @@ -1958,11 +1956,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase, syncTypePhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase); + p = net.recieveTagged(galois::runtime::evilPhase, syncTypePhase); if (p) { syncRecvApply( @@ -1977,9 +1973,9 @@ class GluonEdgeSubstrate : public galois::runtime::GlobalObject { continue; Twait.start(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); Twait.stop(); diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 9e7a7738a4..7a1e5b6665 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -190,7 +190,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, mirrorNodes[x]); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // 
receive the mirror nodes @@ -198,9 +198,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); galois::runtime::gDeserialize(p->second, masterNodes[p->first]); @@ -226,7 +226,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::SendBuffer b; gSerialize(b, global_total_mirror_nodes, global_total_owned_nodes); - net.sendTagged(x, galois::runtime::evilPhase, b); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } // receive @@ -234,9 +234,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (x == id) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); uint64_t total_mirror_nodes_from_others; @@ -1348,9 +1348,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data()); + return FnTy::extract_reset_batch(x, b.data()); } else { - return FnTy::extract_batch(x, b.getVec().data()); + return FnTy::extract_batch(x, b.data()); } } @@ -1376,9 +1376,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { inline bool extractBatchWrapper(unsigned x, galois::runtime::SendBuffer& b, size_t& s, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::extract_reset_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_reset_batch(x, b.data(), &s, &data_mode); } else { - return FnTy::extract_batch(x, b.getVec().data(), &s, &data_mode); + return FnTy::extract_batch(x, b.data(), &s, &data_mode); } } @@ -1602,12 +1602,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_batch(x, b.data()); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::reduce_mirror_batch(x, b.data()); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset()); + return FnTy::setVal_batch(x, b.data()); } } } @@ -1632,15 +1632,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { inline bool setBatchWrapper(unsigned x, galois::runtime::RecvBuffer& b, DataCommMode& data_mode) { if (syncType == syncReduce) { - return FnTy::reduce_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_batch(x, b.data(), data_mode); } else { if (async) { - return FnTy::reduce_mirror_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::reduce_mirror_batch(x, b.data(), data_mode); } else { - return FnTy::setVal_batch(x, b.getVec().data() + b.getOffset(), - data_mode); + return FnTy::setVal_batch(x, b.data(), data_mode); } } } @@ -2223,7 +2220,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { MPI_Wait(&request[x], MPI_STATUS_IGNORE); } if (b[x].size() > 0) { - b[x].getVec().clear(); + b[x].get().clear(); } 
getSendBuffer(loopName, x, @@ -2325,7 +2322,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (async && (syncType == syncBroadcast)) syncTypePhase = 1; - net.sendTagged(x, galois::runtime::evilPhase, b, syncTypePhase); + net.sendTagged(x, galois::runtime::evilPhase, std::move(b), + syncTypePhase); ++numMessages; } } @@ -2806,11 +2804,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t syncTypePhase = 0; if (syncType == syncBroadcast) syncTypePhase = 1; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase, syncTypePhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr, - syncTypePhase); + p = net.recieveTagged(galois::runtime::evilPhase, syncTypePhase); if (p) { syncRecvApply( @@ -2825,9 +2821,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; Twait.start(); - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) p; + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; do { - p = net.recieveTagged(galois::runtime::evilPhase, nullptr); + p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); Twait.stop(); From 79701576db20348e01827901a25d7a5ac638ebf1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 12:34:15 -0500 Subject: [PATCH 518/660] Inductive/sampling for training now co-exist Can use inductive and sampling (separately) on single host without any issues. There are some remaining issues that need to be resolved like using the correct degree for sampling (requires dist comms for degrees which will be implemented next). --- .../include/galois/graphs/LC_CSR_Graph.h | 8 +++ libgnn/include/galois/GraphNeuralNetwork.h | 2 + libgnn/include/galois/graphs/GNNGraph.h | 29 +++++--- libgnn/include/galois/graphs/GNNSubgraph.h | 2 + libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/src/GraphNeuralNetwork.cpp | 10 ++- libgnn/src/graphs/GNNGraph.cpp | 69 +++++++++++++++---- libgnn/src/graphs/GNNSubgraph.cpp | 3 + libgnn/src/layers/SAGELayer.cpp | 8 ++- 9 files changed, 106 insertions(+), 31 deletions(-) diff --git a/libgalois/include/galois/graphs/LC_CSR_Graph.h b/libgalois/include/galois/graphs/LC_CSR_Graph.h index 9f849d0efc..45d39fafaa 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Graph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Graph.h @@ -606,6 +606,14 @@ class LC_CSR_Graph : edgeData.destroy(); } + //! No destroy, only deallocate + void DeallocateOnly() { + nodeData.deallocate(); + edgeIndData.deallocate(); + edgeDst.deallocate(); + edgeData.deallocate(); + } + void constructEdge(EdgeIndexTy e, NodeIndexTy dst, const typename EdgeData::value_type& val) { edgeData.set(e, val); diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 3df6fbe94e..953e925d9a 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -198,6 +198,8 @@ class GraphNeuralNetwork { std::vector> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; + //! Number of layers that use the graph (e.g. SAGE, GCN) + size_t num_graph_user_layers_; #ifdef GALOIS_ENABLE_GPU //! 
Holds all GPU functions diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 3a538d9da5..7e0d016e06 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -115,9 +115,9 @@ class GNNGraph { // Edges ////////////////////////////////////////////////////////////////////////////// - void InitializeSamplingData() { InitializeSamplingData(1); } + void InitializeSamplingData() { InitializeSamplingData(1, false); } //! Initialize data required to do graph sampling - void InitializeSamplingData(size_t num_layers); + void InitializeSamplingData(size_t num_layers, bool is_inductive); ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -286,16 +286,23 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } + //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetDegreeNorm(GraphNode n) const { - if (!use_subgraph_) { - return degree_norm_[n]; + GNNFloat GetGlobalDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + + //! Get degree of subgraph for particular layer + GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + if (use_subgraph_) { + if (!subgraph_is_inductive_) { + // case because degrees in each layer differ + return 1.0 / sampled_out_degrees_[graph_user_layer_num] + [subgraph_->SIDToLID(n)]; + } else { + return 1.0 / sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + } } else { - // XXX does not work in distributed case, fix there - // XXX also need to account for current layer number in sampling - // case because degrees in each layer differ - return 1.0 / subgraph_->GetLocalDegree(n); + return degree_norm_[n]; } } @@ -521,8 +528,7 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - galois::LargeArray sampled_out_degrees_; - galois::LargeArray sampled_in_degrees_; + std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; @@ -568,6 +574,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; + bool subgraph_is_inductive_{false}; ////////////////////////////////////////////////////////////////////////////// // GPU things diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index c3c931f0da..4ac7c739eb 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -104,6 +104,8 @@ class GNNSubgraph { static const constexpr char* kRegionName = "GNNSubgraph"; + bool inductive_subgraph_{false}; + // name is self explanatory LC_CSR_CSC_Graph underlying_graph_; // size vars diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 82b149ee5e..5cfe69b83e 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -142,6 +142,7 @@ class GNNLayer { return output_layer_type_; } size_t layer_number() const { return layer_number_; } + size_t graph_user_layer_number() const { return graph_user_layer_number_; } //! Conducts the forward phase given the input to this layer which //! 
ultimately leads to an output (classfication of node labels) at the end @@ -175,6 +176,9 @@ class GNNLayer { void EnableSampling() { config_.do_sampling = true; } bool IsSampledLayer() const { return config_.do_sampling; } bool IsInductiveLayer() const { return config_.inductive_training_; } + //! Sets the graph user layer number; important for sampling as this index + //! determines which index to use when checking for sampled edges + void SetGraphUserLayerNumber(size_t num) { graph_user_layer_number_ = num; } #ifdef GALOIS_ENABLE_GPU //! Utility function for allocating @@ -207,6 +211,8 @@ class GNNLayer { //! 0 does not need to do some things that other layers need to do // XXX be more specific size_t layer_number_; + //! Graph layer number: only layers that use the graph are numbered + size_t graph_user_layer_number_; //! Pointer to the graph being trained by this layer. //! This is owned by the creator of this layer, so no need to free it when //! this layer is destroyed. diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 1b492c34ec..46b8a6bcdd 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -28,6 +28,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( #endif // used for chaining layers together; begins as nullptr PointerWithSize prev_output_layer(nullptr, 0); + num_graph_user_layers_ = 0; // create the intermediate layers for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { @@ -52,6 +53,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { graph_->InitLayerVectorMetaObjects( @@ -64,6 +66,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); #ifdef GALOIS_ENABLE_GPU // TODO(loc/hochan) sage layer gpu #endif @@ -105,7 +108,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } if (config_.do_sampling() || config_.inductive_training_) { // output layer not included; it will never involve sampling - graph_->InitializeSamplingData(gnn_layers_.size()); + graph_->InitializeSamplingData(num_graph_user_layers_, + config_.inductive_training_); } // create the output layer @@ -160,7 +164,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); } } // resize layer matrices @@ -197,7 +201,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->layer_number(), 30); + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 5); num_sampled_layers++; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 56572ccb76..80ea988166 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -857,11 +857,26 @@ float 
galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( //////////////////////////////////////////////////////////////////////////////// -void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers) { +void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, + bool is_inductive) { subgraph_ = std::make_unique(partitioned_graph_->size()); - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers); - sampled_out_degrees_.create(partitioned_graph_->size(), 0); - sampled_in_degrees_.create(partitioned_graph_->size(), 0); + edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + // this is to hold the *global* degree of a sampled graph; yes, memory wise + // this is slightly problematic possibly, but each layer is its own + // subgraph + if (!is_inductive) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } + } else { + // TODO(loc) optimize possible: inductive setting means # nodes always + // only training/other nodes, so can allocate only what is required + // Allocating full size is inefficient + sampled_out_degrees_.resize(1); + sampled_out_degrees_[0].create(partitioned_graph_->size()); + subgraph_is_inductive_ = true; + } } void galois::graphs::GNNGraph::SetupNeighborhoodSample() { @@ -882,9 +897,17 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { std::fill(edge_sample_status_[edge_id].begin(), edge_sample_status_[edge_id].end(), 0); }); + // reset all degrees + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { + assert(subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -894,18 +917,27 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::do_all( galois::iterate(begin(), end()), - [&](const NodeIterator& x) { + [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*x)) { + if (partitioned_graph_->getData(*src_iter)) { // marks ALL edges of nodes that connect to train/other nodes - for (auto edge_iter : partitioned_graph_->edges(*x)) { + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kTrain) || IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kOther)) { MakeEdgeSampled(edge_iter, agg_layer_num); - new_sampled_nodes_.set(partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } sampled += 1; + // only count once for last layer (last layer is where all + // relevant nodes will be included) + if (agg_layer_num == 0) { + sampled_out_degrees_[0][*src_iter]++; + } } total += 1; } @@ -930,6 +962,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { + assert(!subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -938,15 +971,16 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, total.reset(); galois::do_all( galois::iterate(begin(), end()), - [&](const NodeIterator& x) { + [&](const NodeIterator& src_iter) { // only operate on if sampled - 
if (partitioned_graph_->getData(*x)) { + if (partitioned_graph_->getData(*src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) + // XXX training degree + other norm, not global double probability_of_reject = - std::pow(1 - GetDegreeNorm(*x), num_to_sample); + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); // loop through edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->edges(*x)) { + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { // only take if node is training node or a node not classified // into train/test/val @@ -957,13 +991,20 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // if here, it means edge accepted; set sampled on, mark source // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); - new_sampled_nodes_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + new_sampled_nodes_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; } } total += 1; } + // galois::gDebug(*src_iter, " with degree ", + // sampled_out_degrees_[sample_layer_num][*src_iter]); } }, galois::steal(), galois::loopname("NeighborhoodSample")); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index e80dfffbc9..cfacf02f4f 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -58,6 +58,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( timer.stop(); } +// TODO optimize further? void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( const GNNGraph& gnn_graph) { galois::StatTimer timer("DegreeCounting", kRegionName); @@ -98,6 +99,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( timer.stop(); } +// TODO optimize further? 
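
The NeighborhoodSample loop above keeps an edge based on a Bernoulli draw derived from probability_of_reject = (1 - 1/degree)^num_to_sample, i.e. the chance that one fixed edge is never chosen in num_to_sample independent uniform picks from that node's edges. The standalone sketch below reproduces only that arithmetic; std::mt19937 and std::bernoulli_distribution stand in for galois::PerThreadRNG, whose DoBernoulli semantics are not shown in this patch.

// Illustrative sketch of the sampling arithmetic only.
#include <cmath>
#include <iostream>
#include <random>

int main() {
  const double degree        = 20.0; // hypothetical out-degree of one node
  const int    num_to_sample = 5;    // same count the training loop passes

  double degree_norm = 1.0 / degree;
  // Chance one fixed edge is never picked in num_to_sample uniform draws.
  double probability_of_reject = std::pow(1.0 - degree_norm, num_to_sample);
  // Chance it is picked at least once.
  double probability_of_keep = 1.0 - probability_of_reject;
  std::cout << "reject " << probability_of_reject << ", keep "
            << probability_of_keep << "\n";

  // One keep/reject decision per edge of this node.
  std::mt19937 rng(42);
  std::bernoulli_distribution keep_edge(probability_of_keep);
  int kept = 0;
  for (int e = 0; e < static_cast<int>(degree); ++e) {
    if (keep_edge(rng)) {
      ++kept; // the real code would mark the edge sampled for this layer
    }
  }
  std::cout << "kept " << kept << " of " << static_cast<int>(degree)
            << " edges\n";
  return 0;
}
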
void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); @@ -111,6 +113,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // allocate then set node endpoints num_subgraph_edges_ = subgraph_out_degrees_.back(); + underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index dfae86cbd2..1240280b48 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -415,7 +415,7 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.GetDegreeNorm(src); + source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); } if (!is_backward) { @@ -447,7 +447,8 @@ void galois::SAGELayer::AggregateAllCPU( if (!is_backward) { norm_scale = source_norm; } else { - norm_scale = graph_.GetDegreeNorm(dst); + norm_scale = + graph_.GetDegreeNorm(dst, graph_user_layer_number_); } galois::VectorMulAdd( @@ -486,7 +487,8 @@ void galois::SAGELayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = graph_.GetDegreeNorm(dst); + GNNFloat norm_scale = + graph_.GetDegreeNorm(dst, graph_user_layer_number_); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], From d001e004f3c77a4b5d68156b6c0c234d01dc5bd1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 17:29:06 -0500 Subject: [PATCH 519/660] SAGE sync fix + bitset fix for GNNs 1) SAGE layer iterates over in-edges for backward phase, meaning you write destination. Sync call, therefore, needs to be write destination for the backward phase. 2) Non-full bitsets were not compatible with GNN applications because the manyvec to single vec hack did not account for size changes from the bitset. This commit fixes that. --- .../include/galois/graphs/GluonSubstrate.h | 72 +++++++++++++++---- libgnn/include/galois/graphs/GNNGraph.h | 11 ++- libgnn/src/graphs/GNNGraph.cpp | 16 +++-- libgnn/src/layers/SAGELayer.cpp | 33 ++------- 4 files changed, 85 insertions(+), 47 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 7a1e5b6665..f102e3a4a1 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -841,6 +841,48 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // only difference is val_vec doesn't get resized ever (it's the single array + // from the hack call + template + void + serializeMessageVecHack(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, VecType& val_vec, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + if (!async) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, val_vec); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, val_vec); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm, val_vec); + Tserialize.stop(); + } else { // onlyData + Tserialize.start(); + gSerialize(b, data_mode, val_vec); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -2030,20 +2072,24 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Vector of vectors is in val_vec // val vec over to contiguous array of #s - size_t num_nodes = val_vec.size(); - size_t feature_size = val_vec[0].size(); - single_array.resize(num_nodes * feature_size); - galois::do_all( - galois::iterate(size_t{0}, num_nodes), - [&](size_t node) { - std::memcpy(&(single_array.data()[node * feature_size]), - val_vec[node].data(), feature_size * sizeof(float)); - }, - galois::loopname("GluonSerializeManyVecToOne")); + size_t num_nodes = bit_set_count; + size_t feature_size = 0; + if (bit_set_count != 0) { + feature_size = val_vec[0].size(); + single_array.resize(num_nodes * feature_size); + galois::do_all( + galois::iterate(size_t{0}, num_nodes), + [&](size_t index) { + std::memcpy(&(single_array.data()[index * feature_size]), + val_vec[index].data(), + feature_size * sizeof(float)); + }, + galois::loopname("GluonSerializeManyVecToOne")); + } - serializeMessage(loopName, data_mode, bit_set_count, - indices, offsets, bit_set_comm, - single_array, b); + serializeMessageVecHack( + loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, + single_array, b); gSerialize(b, feature_size); } else { // TODO(loc/hochan) vector gpu hack for gnns diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7e0d016e06..01840c39fc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -377,13 +377,20 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + // TODO(loc) Should not be a default version of this to avoid potential + // issues later + void AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const { + AggregateSync(matrix_to_sync, matrix_column_size, false); + }; + //! Given a matrix and the column size, do an aggregate sync where each row //! is considered a node's data and sync using the graph's Gluon //! substrate //! Note that it's const because the only thing being used is the graph //! 
topology of this object; the thing modified is the passed in matrix - void AggregateSync(GNNFloat* matrix_to_sync, - const size_t matrix_column_size) const; + void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, + bool is_backward) const; ////////////////////////////////////////////////////////////////////////////// // Sampling related diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 80ea988166..861a982a98 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -173,14 +173,20 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( return (*mask_to_use)[lid]; } -void galois::graphs::GNNGraph::AggregateSync( - GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { +void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size, + bool is_backward) const { // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (!is_backward) { + sync_substrate_ + ->sync( + "GraphAggregateSync"); + } else { + sync_substrate_->sync("BackwardGraphAggregateSync"); + } } #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 1240280b48..48ab1e0b4e 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -394,25 +394,12 @@ void galois::SAGELayer::AggregateAllCPU( galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { - // TODO(loc) this is currently a hack: the sync substrate blows - // up if not the entire bitset is set for sync call like in - // edge sampling - graphs::bitset_graph_aggregate.set(src); size_t index_to_src_feature = src * column_length; // zero out src feature first for (size_t i = 0; i < column_length; i++) { aggregate_output[index_to_src_feature + i] = 0; } - // if (layer_phase_ == GNNPhase::kTrain) { - // // XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - // return; - // } - //} - GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); @@ -422,17 +409,13 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - // graphs::bitset_graph_aggregate.set(src); + // XXX set LID + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetEdgeDest(e); // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { - //// XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - // return; - //} + // XXX if (IsSampledLayer()) { if (!graph_.IsEdgeSampled(e, layer_number_)) { continue; @@ -467,16 +450,12 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - // graphs::bitset_graph_aggregate.set(src); + // XXX LID not SID + graphs::bitset_graph_aggregate.set(src); size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { // XXX - // if (IsInductiveLayer()) { - // // if inductive, all non-training nodes do not exist - // if (!graph_.IsValidForPhase(dst, 
GNNPhase::kTrain)) - // return; - //} if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { continue; @@ -507,7 +486,7 @@ void galois::SAGELayer::AggregateAllCPU( galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync - graph_.AggregateSync(aggregate_output, column_length); + graph_.AggregateSync(aggregate_output, column_length, is_backward); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, From d7d2e3dd7cd446c5fa559cc7c8094bfd091b9b88 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Apr 2021 17:40:47 -0500 Subject: [PATCH 520/660] sample-bit-test fix: call update --- libgnn/test/sample-bit-test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index 89ed60d0ad..f603578c13 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -15,7 +15,7 @@ int main() { galois::graphs::GNNGraph graph( "tester", galois::graphs::GNNPartitionScheme::kOEC, true); - graph.InitializeSamplingData(3); + graph.InitializeSamplingData(3, false); // first, assert all edges are not sampled (should come with all 0s) for (size_t node = 0; node < graph.size(); node++) { From 6a7e09d63fc180332d92bd6be719f52031beb8e9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 29 Apr 2021 15:29:31 -0500 Subject: [PATCH 521/660] Distributed calculation of gnn node degrees Before this commit, degrees were calculated on every host by loading in full original graph and using it. That is not efficient: this commit makes every host count degrees and send it to all other hosts. The reading of the whole graph topology has been removed. Removes a bunch of older unused functions like SpecialNormFactor and such as well. --- .../galois/graphs/DegreeSyncStructures.h | 58 +++++++++ libgnn/include/galois/graphs/GNNGraph.h | 49 +++++--- .../graphs/GraphAggregationSyncStructures.h | 1 - libgnn/src/graphs/GNNGraph.cpp | 112 ++++-------------- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 +- 5 files changed, 114 insertions(+), 110 deletions(-) create mode 100644 libgnn/include/galois/graphs/DegreeSyncStructures.h diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h new file mode 100644 index 0000000000..0141805df0 --- /dev/null +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -0,0 +1,58 @@ +#include "galois/GNNTypes.h" + +namespace galois { +namespace graphs { + +extern uint32_t* gnn_degree_vec_1_; +extern uint32_t* gnn_degree_vec_2_; + +struct InitialDegreeSync { + using ValTy = std::pair; + + //! return a vector of floats to sync + static ValTy extract(uint32_t lid, char&) { + return std::make_pair(gnn_degree_vec_1_[lid], gnn_degree_vec_2_[lid]); + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t lid, char&, ValTy y) { + gnn_degree_vec_1_[lid] += y.first; + gnn_degree_vec_2_[lid] += y.second; + if (y.first || y.second) { + return true; + } else { + return false; + } + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t lid, char&) { + gnn_degree_vec_1_[lid] = 0; + gnn_degree_vec_2_[lid] = 0; + } + + //! 
element wise set + static void setVal(uint32_t lid, char&, ValTy y) { + gnn_degree_vec_1_[lid] = y.first; + gnn_degree_vec_2_[lid] = y.second; + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + +} // namespace graphs +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 01840c39fc..c5817a9b07 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -285,24 +285,42 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// - GNNFloat GetNormFactor(GraphNode n) const { return norm_factors_[n]; } + GNNFloat GetGCNNormFactor(GraphNode lid) const { + if (global_degrees_[lid]) { + return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); + } else { + return 0.0; + } + } //! Degree norm (1 / degree) of current functional graph (e.g., sampled, //! inductive graph, etc); calculated whenever norm factor is calculated - GNNFloat GetGlobalDegreeNorm(GraphNode n) const { return degree_norm_[n]; } + GNNFloat GetGlobalDegreeNorm(GraphNode n) const { + if (global_degrees_[n]) { + return 1.0 / global_degrees_[n]; + } else { + return 0.0; + } + } - //! Get degree of subgraph for particular layer + //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { + size_t degree; if (!subgraph_is_inductive_) { // case because degrees in each layer differ - return 1.0 / sampled_out_degrees_[graph_user_layer_num] - [subgraph_->SIDToLID(n)]; + degree = + sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + } else { + degree = sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + } + if (degree) { + return 1.0 / degree; } else { - return 1.0 / sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + return 0; } } else { - return degree_norm_[n]; + return GetGlobalDegreeNorm(n); } } @@ -427,9 +445,6 @@ class GNNGraph { //! Calculate norm factor considering the entire graph void CalculateFullNormFactor(); - //! Calculate norm factor considering sampled nodes and/or training nodes - //! only (inductive) - void CalculateSpecialNormFactor(bool is_sampled, bool is_inductive); #ifdef GALOIS_ENABLE_GPU void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, @@ -518,9 +533,6 @@ class GNNGraph { size_t node_feature_length_{0}; //! Partitioned graph std::unique_ptr partitioned_graph_; - //! The entire topology of the dataset: used for things like norm factor - //! calculation or sampling - WholeGraph whole_graph_; //! Sync substrate for the partitioned graph std::unique_ptr> sync_substrate_; //! True if labels are single class @@ -570,15 +582,14 @@ class GNNGraph { //! falling in range != part of that set) bool incomplete_masks_{false}; - //! Normalization constant based on structure of the graph (degrees) - std::vector norm_factors_; - //! 
Normalization constant based on degrees (unlike nomral norm factors - //! it's only division without a square root) - std::vector degree_norm_; - //! RNG for subgraph sampling galois::PerThreadRNG sample_rng_; + // TODO LargeArray instead of vector? + //! Degrees: needed since graph is distributed + std::vector global_degrees_; + std::vector global_train_degrees_; + // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool subgraph_is_inductive_{false}; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 7759c26dca..073cde32c3 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -1,7 +1,6 @@ // defined in GNNGraph.cpp; set in order to control which matrix // gets synchronized #include "galois/GNNTypes.h" -#include "galois/BufferWrapper.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GNNCudaContextHostDecls.h" #endif diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 861a982a98..265066b361 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -3,6 +3,7 @@ #include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" #include "galois/GNNMath.h" +#include "galois/graphs/DegreeSyncStructures.h" #include namespace { @@ -34,11 +35,15 @@ LoadPartition(const std::string& input_directory, } // end namespace +// Sync structure variables; global to get around sync structure +// limitations at the moment namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; galois::DynamicBitSet bitset_graph_aggregate; +uint32_t* gnn_degree_vec_1_; +uint32_t* gnn_degree_vec_2_; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -84,9 +89,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, partitioned_graph_->cartesianGrid()); bitset_graph_aggregate.resize(partitioned_graph_->size()); - // read in entire graph topology - ReadWholeGraph(dataset_name); - // init norm factors using the whole graph topology + // init norm factors (involves a sync call) InitNormFactor(); #ifdef GALOIS_ENABLE_GPU @@ -580,104 +583,37 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } -void galois::graphs::GNNGraph::ReadWholeGraph(const std::string& dataset_name) { - std::string input_file = input_directory_ + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("[{}] Reading entire graph: file to read is {}", host_id_, - input_file); - galois::graphs::readGraph(whole_graph_, input_file); -} - void galois::graphs::GNNGraph::InitNormFactor() { GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); - norm_factors_.resize(partitioned_graph_->size(), 0.0); - degree_norm_.resize(partitioned_graph_->size(), 0.0); + global_degrees_.resize(partitioned_graph_->size(), 0.0); + global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); } void galois::graphs::GNNGraph::CalculateFullNormFactor() { - norm_factors_.assign(partitioned_graph_->size(), 0.0); - - // get the norm factor contribution for each node based on the GLOBAL graph - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t local_id) { - // translate lid into gid to get global degree - size_t global_id = partitioned_graph_->getGID(local_id); - // +1 because simulated self edge - 
size_t global_degree = whole_graph_.edge_end(global_id) - - whole_graph_.edge_begin(global_id) + 1; - // only set if non-zero - if (global_degree != 0) { - norm_factors_[local_id] = - 1.0 / std::sqrt(static_cast(global_degree)); - degree_norm_[local_id] = 1.0 / static_cast(global_degree); - } - }, - galois::loopname("CalculateFullNormFactor")); -} - -void galois::graphs::GNNGraph::CalculateSpecialNormFactor(bool is_sampled, - bool is_inductive) { - if (galois::runtime::getSystemNetworkInterface().Num > 1) { - GALOIS_LOG_FATAL("cannot run special norm factor in dist setting yet"); - } - - norm_factors_.assign(partitioned_graph_->size(), 0.0); + // TODO(loc) reset all degrees if this is called multiple times? // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t local_id) { - // ignore node if not valid - if (is_sampled && is_inductive) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain) || - !IsInSampledGraph(local_id)) { - return; - } - } else if (is_sampled) { - if (!IsInSampledGraph(local_id)) { - return; - } - } else if (is_inductive) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain)) { - return; - } - } - - size_t degree = 0; - - // TODO(loc) make this work in a distributed setting; assuming - // whole graph is present on single host at the moment - for (EdgeIterator e = edge_begin(local_id); e != edge_end(local_id); - e++) { - size_t dest = GetEdgeDest(e); - if (is_sampled && is_inductive) { - if (!IsValidForPhase(dest, GNNPhase::kTrain) || - !IsInSampledGraph(dest)) { - continue; - } - } else if (is_sampled) { - if (!IsInSampledGraph(dest)) { - continue; - } - } else if (is_inductive) { - if (!IsValidForPhase(dest, GNNPhase::kTrain)) { - continue; - } - } else { - GALOIS_LOG_WARN( - "Why is special norm factor called if not sampled/inductive?"); + [&](size_t src) { + for (auto edge_iter = partitioned_graph_->edge_begin(src); + edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { + // count degrees for all + train/other + size_t dest = GetEdgeDest(edge_iter); + if (IsValidForPhase(dest, GNNPhase::kTrain) || + IsValidForPhase(dest, GNNPhase::kOther)) { + global_train_degrees_[src] += 1; } - degree += 1; - } - - // only set if non-zero - if (degree != 0) { - norm_factors_[local_id] = 1.0 / std::sqrt(static_cast(degree)); - degree_norm_[local_id] = 1.0 / static_cast(degree); + global_degrees_[src] += 1; } }, - galois::loopname("CalculateSpecialNormFactor")); + galois::loopname("CalculateLocalDegrees")); + // degree sync + gnn_degree_vec_1_ = global_train_degrees_.data(); + gnn_degree_vec_2_ = global_degrees_.data(); + sync_substrate_->sync( + "InitialDegreeSync"); } float galois::graphs::GNNGraph::GetGlobalAccuracy( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index b5a538d314..3bca821078 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -320,7 +320,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( GNNFloat source_norm = 0.0; if (!config_.disable_normalization) { - source_norm = graph_.GetNormFactor(src); + source_norm = graph_.GetGCNNormFactor(src); } // init to self @@ -359,7 +359,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.GetNormFactor(dst); + GNNFloat norm_scale = source_norm * 
graph_.GetGCNNormFactor(dst); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, From bcffe89b1a504cea94ffa569966c75b9de70dae8 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 29 Apr 2021 16:21:35 -0500 Subject: [PATCH 522/660] Inductive graph uses train degrees rather Before this commit, inductive subgraph creation recounted train degrees: this is unnecessary now with the train degrees array. What this means is that this should theoretically work in distributed setting. This is the next step/commit. --- libgnn/include/galois/graphs/GNNGraph.h | 12 +++++++++--- libgnn/src/graphs/GNNGraph.cpp | 26 ++++++++----------------- 2 files changed, 17 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c5817a9b07..cab41370c8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -293,8 +293,6 @@ class GNNGraph { } } - //! Degree norm (1 / degree) of current functional graph (e.g., sampled, - //! inductive graph, etc); calculated whenever norm factor is calculated GNNFloat GetGlobalDegreeNorm(GraphNode n) const { if (global_degrees_[n]) { return 1.0 / global_degrees_[n]; @@ -303,6 +301,14 @@ class GNNGraph { } } + GNNFloat GetGlobalTrainDegreeNorm(GraphNode n) const { + if (global_train_degrees_[n]) { + return 1.0 / global_train_degrees_[n]; + } else { + return 0.0; + } + } + //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { @@ -312,7 +318,7 @@ class GNNGraph { degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; } else { - degree = sampled_out_degrees_[0][subgraph_->SIDToLID(n)]; + degree = global_train_degrees_[subgraph_->SIDToLID(n)]; } if (degree) { return 1.0 / degree; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 265066b361..47f78b2173 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -812,11 +812,6 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, array.create(partitioned_graph_->size()); } } else { - // TODO(loc) optimize possible: inductive setting means # nodes always - // only training/other nodes, so can allocate only what is required - // Allocating full size is inefficient - sampled_out_degrees_.resize(1); - sampled_out_degrees_[0].create(partitioned_graph_->size()); subgraph_is_inductive_ = true; } } @@ -840,12 +835,14 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); - }, - galois::chunk_size<1>()); + if (!subgraph_is_inductive_) { + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); + } } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { @@ -875,11 +872,6 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { partitioned_graph_->getEdgeDst(edge_iter)); } sampled += 1; - // only count once for last layer (last layer is where all - // relevant nodes will be included) - if (agg_layer_num == 0) { - sampled_out_degrees_[0][*src_iter]++; - } } total += 1; } @@ -945,8 +937,6 @@ void 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } total += 1; } - // galois::gDebug(*src_iter, " with degree ", - // sampled_out_degrees_[sample_layer_num][*src_iter]); } }, galois::steal(), galois::loopname("NeighborhoodSample")); From 42448f0cb7ae8221887e5e9af88f443f214cf740 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 12:55:37 -0500 Subject: [PATCH 523/660] GNNSubgraph fix edge dest construction Subgraph edge dest construction was using the original ID from the graph and not the subgraph ID. This was causing issues when original ID != SID which happens a lot more in distributed setting. This commit fixes it by mapping the ID correctly. --- libgnn/src/graphs/GNNSubgraph.cpp | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index cfacf02f4f..9bd467e8e3 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -35,6 +35,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.IsInSampledGraph(local_node_id)) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -47,6 +48,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.IsInSampledGraph(local_node_id)) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -144,7 +146,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { subedge_to_original_edge_[out_location] = *out_edge_iter; underlying_graph_.constructEdge( - out_location++, gnn_graph.GetEdgeDest(out_edge_iter)); + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); } } @@ -153,7 +156,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( in_subedge_to_original_edge_[in_location] = *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); underlying_graph_.ConstructInEdge( - in_location++, gnn_graph.GetInEdgeDest(in_edge_iter)); + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } } assert(out_location == subgraph_out_degrees_[subgraph_id]); From 7464acfa47304bc1b85434ab428a9d440d6a508e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 12:57:50 -0500 Subject: [PATCH 524/660] GNN graph (S)ID conversion helper functions SID conversion functions and a function exposing the LID to SID map that will be used in distributed sync (substrate needs to map LID to correct SID to get the necessary info). --- libgnn/include/galois/graphs/GNNGraph.h | 36 +++++++++++++++++++++- libgnn/include/galois/graphs/GNNSubgraph.h | 5 +++ 2 files changed, 40 insertions(+), 1 deletion(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index cab41370c8..25bc8f8f4f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -280,8 +280,42 @@ class GNNGraph { } void EnableSubgraph() { use_subgraph_ = true; } - void DisableSubgraph() { use_subgraph_ = false; } + bool IsSubgraphOn() const { return use_subgraph_; } + + //! 
Converts an id to an lid for the graph if subgraphs are in use + uint32_t ConvertToLID(GraphNode sid) const { + if (use_subgraph_) { + return subgraph_->SIDToLID(sid); + } else { + return sid; + } + } + //! Converts an LID to an SID if subgraphs are in use + uint32_t ConvertToSID(GraphNode lid) const { + if (use_subgraph_) { + return subgraph_->LIDToSID(lid); + } else { + return lid; + } + } + //! Converts SID to GID if subgraphs in use (else just return GID) + uint32_t SIDToGID(GraphNode sid) const { + if (use_subgraph_) { + return GetGID(subgraph_->SIDToLID(sid)); + } else { + return GetGID(sid); + } + } + //! Returns a pointer to the LID to SID map from the subgraph if subgraphs + //! are in use + galois::LargeArray* GetLIDToSIDPointer() { + if (use_subgraph_) { + return subgraph_->GetLIDToSIDPointer(); + } else { + return nullptr; + } + } ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 4ac7c739eb..21642b189b 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -35,6 +35,7 @@ class GNNSubgraph { } uint32_t SIDToLID(uint32_t sid) const { return subgraph_id_to_lid_[sid]; } + uint32_t LIDToSID(uint32_t lid) const { return lid_to_subgraph_id_[lid]; } ////////////////////////////////////////////////////////////////////////////// // Edge iteration and destination @@ -91,6 +92,10 @@ class GNNSubgraph { ////////////////////////////////////////////////////////////////////////////// + galois::LargeArray* GetLIDToSIDPointer() { + return &lid_to_subgraph_id_; + } + private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. From 1217c4bdc1b156f68091eee3130d6e14ace4e034 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Apr 2021 13:00:16 -0500 Subject: [PATCH 525/660] SAGE layer: distributed subgraph compatibility Below only applies to SAGE layer. GCN layer is behind. 1) Bitset set maps from SID to LID as necessary. 2) New sync struct for subgraphs that accounts for LID/SID mapping. 3) Aggregate sync uses correct sync struct as necessary. 
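
The "LID/SID mapping" in point 2 above means that feature rows are stored in a matrix indexed by subgraph ID, while the sync substrate addresses nodes by local ID, so the new sync structure translates every access through an LID-to-SID array and skips nodes whose entry is the max-uint32 sentinel. A reduced, self-contained sketch of that indirection follows; ReduceRow and the literals are illustrative only, and the real structure is GNNSampleSumAggregate in the GraphAggregationSyncStructures.h hunk below.

// Illustrative sketch of the LID -> SID indirection used during sync.
#include <cstdint>
#include <iostream>
#include <limits>
#include <vector>

constexpr uint32_t kNotInSubgraph = std::numeric_limits<uint32_t>::max();

// Add a received row into the local matrix row for this LID, if present.
bool ReduceRow(const std::vector<uint32_t>& lid_to_sid,
               std::vector<float>& matrix, size_t column_length, uint32_t lid,
               const std::vector<float>& received) {
  uint32_t sid = lid_to_sid[lid];
  if (sid == kNotInSubgraph) {
    return false; // this host did not sample the node; nothing to update
  }
  for (size_t i = 0; i < column_length; ++i) {
    matrix[sid * column_length + i] += received[i];
  }
  return true;
}

int main() {
  const size_t column_length = 4;
  // LIDs 0 and 2 are in the subgraph (SIDs 0 and 1); LID 1 is not.
  std::vector<uint32_t> lid_to_sid = {0, kNotInSubgraph, 1};
  std::vector<float> matrix(2 * column_length, 1.0f);

  std::vector<float> incoming = {0.5f, 0.5f, 0.5f, 0.5f};
  ReduceRow(lid_to_sid, matrix, column_length, 2, incoming); // updates SID 1
  ReduceRow(lid_to_sid, matrix, column_length, 1, incoming); // skipped

  std::cout << "row for SID 1 starts with " << matrix[column_length] << "\n";
  return 0;
}
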
--- .../graphs/GraphAggregationSyncStructures.h | 81 +++++++++++++++++++ libgnn/src/graphs/GNNGraph.cpp | 30 +++++-- libgnn/src/layers/SAGELayer.cpp | 9 +-- 3 files changed, 107 insertions(+), 13 deletions(-) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 073cde32c3..bcf7ed5078 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -11,6 +11,7 @@ namespace graphs { extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; +extern galois::LargeArray* gnn_lid_to_sid_pointer_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -66,6 +67,7 @@ struct GNNSumAggregate { // assert(device_personality == DevicePersonality::CPU); ValTy extracted_vec(gnn_matrix_to_sync_column_length_); for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX memcpy extracted_vec[i] = gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; } @@ -79,6 +81,7 @@ struct GNNSumAggregate { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX vectorized add gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += y[i]; } @@ -121,6 +124,84 @@ struct GNNSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct GNNSampleSumAggregate { + using ValTy = galois::gstl::Vector; + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, char&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. + // assert(device_personality == DevicePersonality::CPU); + ValTy extracted_vec(gnn_matrix_to_sync_column_length_, 0.0); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return extracted_vec; + } + + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX memcpy + extracted_vec[i] = + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i]; + } + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return false; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] += y[i]; + } + return true; + } + + //! No-op: readAny = overwritten anyways + static void reset(uint32_t, char&) {} + + //! 
element wise set + static void setVal(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + #ifdef GALOIS_ENABLE_GPU GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 47f78b2173..e1b0bcab67 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -42,6 +42,7 @@ namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; galois::DynamicBitSet bitset_graph_aggregate; +galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; #ifdef GALOIS_ENABLE_GPU @@ -179,16 +180,31 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward) const { - // set globals for the sync substrate gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - if (!is_backward) { - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (!use_subgraph_) { + // set globals for the sync substrate + if (!is_backward) { + sync_substrate_ + ->sync( + "GraphAggregateSync"); + } else { + sync_substrate_->sync( + "BackwardGraphAggregateSync"); + } } else { - sync_substrate_->sync("BackwardGraphAggregateSync"); + // setup the SID to LID map for the sync substrate to use (SID != LID) + gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); + + if (!is_backward) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync( + "BackwardGraphAggregateSync"); + } } } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 48ab1e0b4e..134f24f3f2 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -390,7 +390,6 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { - galois::do_all( galois::iterate(graph_.begin(), graph_.end()), [&](size_t src) { @@ -409,10 +408,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - // XXX set LID - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t dst = graph_.GetEdgeDest(e); - // galois::gPrint("(", src, " ", dst, ")\n"); if (layer_phase_ == GNNPhase::kTrain) { // XXX @@ -450,8 +447,7 @@ void 
galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - // XXX LID not SID - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t dst = graph_.GetInEdgeDest(e); if (layer_phase_ == GNNPhase::kTrain) { @@ -485,6 +481,7 @@ void galois::SAGELayer::AggregateAllCPU( }, galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync graph_.AggregateSync(aggregate_output, column_length, is_backward); } From 4f03bdfe77f7f0f8e2bc99e66582616bab33dd09 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 1 May 2021 14:08:32 -0500 Subject: [PATCH 526/660] Sampled degree sync for subgraphs, bug fix Sync subgraph degrees for all layers using a single sync call by serializing degrees into a vector. Fix bug with subgraph mode being on when sampling new subgraphs which would cause nodes from previous samples to be included unintentionally. Cleanup of some code/prints. --- .../galois/graphs/DegreeSyncStructures.h | 67 +++++++++++++++++++ libgnn/include/galois/graphs/GNNGraph.h | 12 +--- libgnn/include/galois/graphs/GNNSubgraph.h | 4 +- libgnn/src/graphs/GNNGraph.cpp | 45 +++++++++++-- libgnn/src/graphs/GNNSubgraph.cpp | 37 +++++----- libgnn/src/layers/SAGELayer.cpp | 5 +- libgnn/src/layers/SoftmaxLayer.cpp | 17 ++--- 7 files changed, 135 insertions(+), 52 deletions(-) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 0141805df0..04c696f6ab 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -6,6 +6,9 @@ namespace graphs { extern uint32_t* gnn_degree_vec_1_; extern uint32_t* gnn_degree_vec_2_; +extern galois::DynamicBitSet bitset_sampled_degrees_; +extern std::vector>* gnn_sampled_out_degrees_; + struct InitialDegreeSync { using ValTy = std::pair; @@ -54,5 +57,69 @@ struct InitialDegreeSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct SubgraphDegreeSync { + using ValTy = galois::gstl::Vector; + + //! return a vector of floats to sync + static ValTy extract(uint32_t lid, char&) { + ValTy vec_to_send(gnn_sampled_out_degrees_->size()); + size_t count = 0; + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + vec_to_send[count++] = layer_degrees[lid]; + } + assert(count == vec_to_send.size()); + return vec_to_send; + } + + static bool reduce(uint32_t lid, char&, ValTy y) { + assert(y.size() == gnn_sampled_out_degrees_->size()); + for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; + } + return true; + } + + //! No-op: readAny = overwritten anyways; can probably get away with no-op + static void reset(uint32_t lid, char&) { + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + layer_degrees[lid] = 0; + } + } + + //! 
element wise set + static void setVal(uint32_t lid, char&, ValTy y) { + assert(y.size() == gnn_sampled_out_degrees_->size()); + for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + +struct SubgraphDegreeBitset { + static constexpr bool is_vector_bitset() { return false; } + static constexpr bool is_valid() { return true; } + static galois::DynamicBitSet& get() { return bitset_sampled_degrees_; } + static void reset_range(size_t begin, size_t end) { + bitset_sampled_degrees_.reset(begin, end); + } +}; + } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 25bc8f8f4f..2812cd4210 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -269,15 +269,7 @@ class GNNGraph { void SampleEdges(size_t sample_layer_num, size_t num_to_sample); //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph() { - // false first so that the build process can use functions to access the - // real graph - use_subgraph_ = false; - size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); - // after this, this graph is a subgraph - use_subgraph_ = true; - return num_subgraph_nodes; - } + size_t ConstructSampledSubgraph(); void EnableSubgraph() { use_subgraph_ = true; } void DisableSubgraph() { use_subgraph_ = false; } @@ -380,8 +372,6 @@ class GNNGraph { } if (local_ground_truth_labels_[to_use] != num_label_classes_) { - // galois::gPrint(lid, " ", to_use, " ", - // (int)local_ground_truth_labels_[to_use], "\n"); return local_ground_truth_labels_[to_use]; } else { GALOIS_LOG_FATAL( diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 21642b189b..976303be84 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -128,8 +128,8 @@ class GNNSubgraph { //! is for static) galois::gstl::Vector subgraph_id_to_lid_; // intermediate degrees used for edge construction - galois::gstl::Vector subgraph_out_degrees_; - galois::gstl::Vector subgraph_in_degrees_; + galois::gstl::Vector local_subgraph_out_degrees_; + galois::gstl::Vector local_subgraph_in_degrees_; //! Maps from subgraph out-edge id to original graph edge id (used to check if //! edge exists in particular layer) galois::gstl::Vector subedge_to_original_edge_; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index e1b0bcab67..1a288365bd 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -41,10 +41,16 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +//! 
For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; + +//! For synchronization of sampled degrees +galois::DynamicBitSet bitset_sampled_degrees_; +std::vector>* gnn_sampled_out_degrees_; + #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; unsigned layer_number_to_sync; @@ -692,8 +698,9 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); + // GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, + // global_correct, + // global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -833,6 +840,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } void galois::graphs::GNNGraph::SetupNeighborhoodSample() { + use_subgraph_ = false; new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); @@ -859,6 +867,8 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample() { }, galois::chunk_size<1>()); } + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + bitset_sampled_degrees_.reset(); } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { @@ -874,7 +884,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*src_iter)) { + if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), @@ -895,8 +905,8 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { }, galois::steal(), galois::loopname("ChooseAllEdges")); - galois::gPrint("Num sampled edges is ", sampled.reduce(), " out of ", - total.reduce(), "\n"); + galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), + " out of ", total.reduce(), "\n"); std::vector new_nodes = new_sampled_nodes_.getOffsets(); // update nodes, then communicate update to all hosts so that they can @@ -917,13 +927,16 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::GAccumulator sampled; galois::GAccumulator total; + // galois::GAccumulator total_nodes; sampled.reset(); total.reset(); + // total_nodes.reset(); + galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { // only operate on if sampled - if (partitioned_graph_->getData(*src_iter)) { + if (IsInSampledGraph(src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) // XXX training degree + other norm, not global @@ -946,6 +959,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, new_sampled_nodes_.set( partitioned_graph_->getEdgeDst(edge_iter)); } + bitset_sampled_degrees_.set(*src_iter); // degree increment sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; @@ -953,10 +967,13 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } total += 1; } + // total_nodes += 1; } }, galois::steal(), galois::loopname("NeighborhoodSample")); + // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, + // " is ", total_nodes.reduce()); 
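  // One illustrative way to read the edge-sampling probability used in the
  // loop above (an assumption for intuition only, not text from this patch):
  // if num_to_sample uniform, independent draws with replacement are made over
  // a node of out-degree d, a given edge is missed in one draw with
  // probability (1 - 1/d), so it is kept with probability roughly
  // 1 - (1 - 1/d)^num_to_sample. The expected sampled degree is therefore
  // close to num_to_sample when d is much larger than num_to_sample, and
  // approaches d otherwise.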
galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", sampled.reduce(), " out of ", total.reduce()); @@ -973,6 +990,22 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, sync_substrate_->sync("SampleSync"); } +//! Construct the subgraph from sampled edges and corresponding nodes +size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { + // false first so that the build process can use functions to access the + // real graph + use_subgraph_ = false; + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + // first, sync the degres of the sampled edges across all hosts + sync_substrate_ + ->sync( + "SubgraphDegree"); + size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + // after this, this graph is a subgraph + use_subgraph_ = true; + return num_subgraph_nodes; +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 9bd467e8e3..387e3fc250 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -6,6 +6,9 @@ galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); CreateLocalToSubgraphMapping(gnn_graph); + if (num_subgraph_nodes_ == 0) { + return 0; + } DegreeCounting(gnn_graph); EdgeCreation(gnn_graph); NodeFeatureCreation(gnn_graph); @@ -52,7 +55,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( lid_to_subgraph_id_[local_node_id] = current_sid++; } } - galois::gDebug("Numbered sampled nodes for subgraph construction is ", + galois::gDebug("Number of sampled nodes for subgraph construction is ", current_sid); num_subgraph_nodes_ = current_sid; @@ -67,8 +70,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( timer.start(); subgraph_id_to_lid_.resize(num_subgraph_nodes_); - subgraph_out_degrees_.resize(num_subgraph_nodes_); - subgraph_in_degrees_.resize(num_subgraph_nodes_); + local_subgraph_out_degrees_.resize(num_subgraph_nodes_); + local_subgraph_in_degrees_.resize(num_subgraph_nodes_); galois::do_all( galois::iterate(gnn_graph.begin(), gnn_graph.end()), @@ -83,7 +86,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( out_degrees++; } } - subgraph_out_degrees_[subgraph_id] = out_degrees; + local_subgraph_out_degrees_[subgraph_id] = out_degrees; uint32_t in_degrees = 0; for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { @@ -91,7 +94,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( in_degrees++; } } - subgraph_in_degrees_[subgraph_id] = in_degrees; + local_subgraph_in_degrees_[subgraph_id] = in_degrees; // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", // out_degrees, " in ", in_degrees); } @@ -109,21 +112,21 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // prefix sum over subgraph degrees from previous phase to get starting points for (size_t i = 1; i < num_subgraph_nodes_; i++) { - subgraph_out_degrees_[i] += subgraph_out_degrees_[i - 1]; - subgraph_in_degrees_[i] += subgraph_in_degrees_[i - 1]; + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; } // allocate then set node endpoints - num_subgraph_edges_ = subgraph_out_degrees_.back(); + num_subgraph_edges_ = local_subgraph_out_degrees_.back(); underlying_graph_.DeallocateOnly(); 
underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { underlying_graph_.fixEndEdge( - subgraph_id, subgraph_out_degrees_[subgraph_id]); + subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( - subgraph_id, subgraph_in_degrees_[subgraph_id]); + subgraph_id, local_subgraph_in_degrees_[subgraph_id]); }); subedge_to_original_edge_.resize(num_subgraph_edges_); in_subedge_to_original_edge_.resize(num_subgraph_edges_); @@ -138,8 +141,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( uint32_t out_location = 0; uint32_t in_location = 0; if (subgraph_id != 0) { - out_location = subgraph_out_degrees_[subgraph_id - 1]; - in_location = subgraph_in_degrees_[subgraph_id - 1]; + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; } for (auto out_edge_iter : gnn_graph.edges(node_id)) { @@ -160,8 +163,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } } - assert(out_location == subgraph_out_degrees_[subgraph_id]); - assert(in_location == subgraph_in_degrees_[subgraph_id]); + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); } }, galois::steal()); @@ -182,12 +185,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( &(subgraph_node_features_[subgraph_node_id * feat_length]), &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), feat_length * sizeof(GNNFeature)); - // for (unsigned i = 0; i < feat_length; i++) { - // galois::gPrint(feat_length * sizeof(GNNFeature) , " ", subgraph_node_id, - // " local id " , local_id, " feat at ", i, " is ", - // subgraph_node_features_[subgraph_node_id * feat_length + i], " ", - // gnn_graph.GetLocalFeatures()[local_id * feat_length + i], "\n"); - //} }); timer.stop(); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 134f24f3f2..9d6ca7c5cc 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -499,9 +499,10 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - galois::gPrint(layer_dimensions_.input_rows, " ", + galois::gDebug("Layer ", graph_user_layer_number_, " ", + layer_dimensions_.input_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns, "\n"); + layer_dimensions_.output_columns); // CPU version is just a call into CBlas galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 94523ce327..f7a345050d 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -8,13 +8,12 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; - //#ifndef NDEBUG - //#ifdef NDEBUG +#ifndef NDEBUG galois::DGAccumulator loss_accum; galois::DGAccumulator handled; loss_accum.reset(); handled.reset(); - //#endif +#endif galois::do_all( galois::iterate(graph_.begin(), graph_.end()), @@ -44,11 +43,10 @@ galois::SoftmaxLayer::ForwardPhaseCPU( input_loss_[i] = 
GNNCrossEntropy(feature_length, ground_truth_vec->data(), &p_backward_output_matrix_[feature_length * i]); - //#ifndef NDEBUG - //#ifdef NDEBUG +#ifndef NDEBUG loss_accum += input_loss_[i]; handled += 1; - //#endif +#endif } else { VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -57,14 +55,11 @@ galois::SoftmaxLayer::ForwardPhaseCPU( // TODO chunk size? // steal on as some threads may have nothing to work on galois::steal(), galois::loopname("SoftmaxForward")); - //#ifndef NDEBUG - //#ifdef NDEBUG - +#ifndef NDEBUG GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); - - //#endif +#endif return p_backward_output_matrix_; } From 4b19e5c14183c1d91e2710abdea31b1c0ef9817e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 3 May 2021 13:44:58 -0500 Subject: [PATCH 527/660] GNN single host training minibatching Allows for training using minibatching on a single host via a command line argument. A minibatcher class has been added that takes in a mask and batches it via linear scan over it (O(n) ultimately per epoch). This mask is then used to mark the seed nodes for the sampler (which can do full sampling or it can take all nodes; this needs to be implemented). The rest of execution works the same as previous sampling subgraphs: main difference is the seed nodes aren't all training nodes, and everything works out the same way from there. Other notable changes - GNNMask class for masks - kBatch phase for minibatching; set the phase and entire pipeline works as you might expect - Removed old node sampling stuff because unused right now; might bring back SAINT sampling if it turns out to be useful - Some signature changes to functions --- libgnn/CMakeLists.txt | 1 + libgnn/include/galois/GNNTypes.h | 4 +- libgnn/include/galois/GraphNeuralNetwork.h | 2 + libgnn/include/galois/MinibatchGenerator.h | 28 +++++ libgnn/include/galois/graphs/GNNGraph.h | 56 ++++++---- libgnn/src/GraphNeuralNetwork.cpp | 81 ++++++++++++-- libgnn/src/MinibatchGenerator.cpp | 33 ++++++ libgnn/src/graphs/GNNGraph.cpp | 116 ++++----------------- libgnn/src/layers/SAGELayer.cpp | 2 +- libgnn/src/layers/SoftmaxLayer.cpp | 1 + lonestar/libgnnbench/src/Input.cpp | 13 ++- 11 files changed, 208 insertions(+), 129 deletions(-) create mode 100644 libgnn/include/galois/MinibatchGenerator.h create mode 100644 libgnn/src/MinibatchGenerator.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 2393ce043b..46ea0fd67c 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -2,6 +2,7 @@ set(sources src/GNNMath.cpp src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp + src/MinibatchGenerator.cpp src/graphs/GNNGraph.cpp src/graphs/GNNSubgraph.cpp src/layers/DenseLayer.cpp diff --git a/libgnn/include/galois/GNNTypes.h b/libgnn/include/galois/GNNTypes.h index 492bc841dc..5dbcf4771b 100644 --- a/libgnn/include/galois/GNNTypes.h +++ b/libgnn/include/galois/GNNTypes.h @@ -19,13 +19,15 @@ using GNNFloat = float; using GNNLabel = uint8_t; //! Type of a feature on vertices using GNNFeature = float; +//! Type of mask +using GNNMask = std::vector; //! Type of node index on gpus using GPUNodeIndex = uint32_t; //! Type of edge index on gpus using GPUEdgeIndex = uint64_t; //! Phase of GNN computation -enum class GNNPhase { kTrain, kValidate, kTest, kOther }; +enum class GNNPhase { kTrain, kValidate, kTest, kOther, kBatch }; //! Vector like wrapper over a pointer and size; exists solely to pass around //! 
raw pointers with size (because vectors are a no-go due to the code diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 953e925d9a..580738b133 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -92,6 +92,7 @@ class GraphNeuralNetworkConfig { } bool do_sampling() const { return do_sampling_; } + unsigned train_minibatch_size() const { return train_minibatch_size_; } //! Get the default layer config of layers in this GNN const GNNLayerConfig& default_layer_config() const { @@ -107,6 +108,7 @@ class GraphNeuralNetworkConfig { unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; + unsigned train_minibatch_size_{0}; private: //! Number of layers to construct in the GNN not including the output diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h new file mode 100644 index 0000000000..0bd063b90c --- /dev/null +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -0,0 +1,28 @@ +#pragma once + +#include "galois/GNNTypes.h" + +namespace galois { + +//! Generates minibatchs given a mask for the class of things to generate +//! the minibatch for +class MinibatchGenerator { +public: + MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size) + : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size} { + } + void GetNextMinibatch(std::vector* batch_mask); + //! True if no more minibatches from this generator + bool NoMoreMinibatches() { + return current_position_ == mask_to_minibatch_.size(); + } + //! Reset the only state (a position bit) + void ResetMinibatchState() { current_position_ = 0; } + +private: + const GNNMask& mask_to_minibatch_; + size_t minibatch_size_; + size_t current_position_{0}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2812cd4210..ded867787c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -5,6 +5,7 @@ #include "galois/graphs/CuSPPartitioner.h" #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" +#include "galois/MinibatchGenerator.h" #ifdef GALOIS_ENABLE_GPU #include "galois/graphs/GNNGraph.cuh" @@ -261,7 +262,8 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! Set seed nodes, i.e., nodes that are being predicted on - void SetupNeighborhoodSample(); + void SetupNeighborhoodSample() { SetupNeighborhoodSample(GNNPhase::kTrain); } + void SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes void SampleAllEdges(size_t agg_layer_num); @@ -310,7 +312,24 @@ class GNNGraph { } ////////////////////////////////////////////////////////////////////////////// + void SetupTrainBatcher(size_t train_batch_size) { + if (train_batcher_) { + // clear before remake + train_batcher_.reset(); + } + train_batcher_ = std::make_unique(local_training_mask_, + train_batch_size); + local_minibatch_mask_.resize(partitioned_graph_->size()); + } + + void ResetTrainMinibatcher() { train_batcher_->ResetMinibatchState(); } + //! Setup the state for the next minibatch sampling call by using the + //! minibatcher to pick up the next set batch of nodes + void PrepareNextTrainMinibatch(); + //! 
Returns true if there are still more minibatches in this graph + bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetGCNNormFactor(GraphNode lid) const { if (global_degrees_[lid]) { return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); @@ -416,7 +435,11 @@ class GNNGraph { if (use_subgraph_) { to_use = subgraph_->SIDToLID(lid); } - if (!incomplete_masks_ && current_phase != GNNPhase::kOther) { + // re: phase checks in this if: ranges are not used for these + // phases even if they might exist; it's something to look into + // possibly, though at the same time it may not be worth it + if (!incomplete_masks_ && current_phase != GNNPhase::kOther && + current_phase != GNNPhase::kBatch) { return IsValidForPhaseCompleteRange(to_use, current_phase); } else { return IsValidForPhaseMasked(to_use, current_phase); @@ -444,17 +467,6 @@ class GNNGraph { // Sampling related ////////////////////////////////////////////////////////////////////////////// - //! Loops through all master nodes and determines if it is "on" or "off" - //! (the meaning of on and off depends on how it is used; for now, it is used - //! to indicate subgraph presence); droprate controls chance of being dropped - //! (e.g. if 0.8, a node is 80% likely to not be included in subgraph) - void UniformNodeSample() { UniformNodeSample(0.5); } - void UniformNodeSample(float droprate); - - //! Use the sampling method present in GraphSAINT - void GraphSAINTSample() { GraphSAINTSample(3000, 2); }; - void GraphSAINTSample(size_t num_roots, size_t walk_depth); - //! Makes a node "sampled"; used for debugging/testing void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } //! Makes a node "not sampled"; used for debugging/testing @@ -514,7 +526,7 @@ class GNNGraph { //! given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, char* masks); + GNNRange* mask_range, std::vector* masks); //! Finds nodes that aren't part of the 3 main GNN phase classifications size_t FindOtherMask(); //! Read masks of local nodes only for training, validation, and testing @@ -589,14 +601,17 @@ class GNNGraph { // TODO maybe revisit this and use an actual bitset //! Bitset indicating which nodes are training nodes - std::vector local_training_mask_; + GNNMask local_training_mask_; //! Bitset indicating which nodes are validation nodes - std::vector local_validation_mask_; + GNNMask local_validation_mask_; //! Bitset indicating which nodes are testing nodes - std::vector local_testing_mask_; - size_t valid_other_{0}; + GNNMask local_testing_mask_; //! Bitset indicating which nodes don't fall anywhere - std::vector other_mask_; + GNNMask other_mask_; + //! Bitset indicating which nodes are part of the minibatch + GNNMask local_minibatch_mask_; + + size_t valid_other_{0}; //! Global mask range for training nodes; must convert to LIDs when using //! 
in this class @@ -624,6 +639,9 @@ class GNNGraph { bool use_subgraph_{false}; bool subgraph_is_inductive_{false}; + std::unique_ptr train_batcher_; + std::unique_ptr test_batcher_; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 46b8a6bcdd..dc2ebb2834 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -106,12 +106,20 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( break; } } - if (config_.do_sampling() || config_.inductive_training_) { + + // XXX test minibatch + if (config_.do_sampling() || config_.inductive_training_ || + config.train_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, config_.inductive_training_); } + if (config_.train_minibatch_size()) { + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + } + // XXX test minibatch size + // create the output layer GNNLayerDimensions output_dims = { .input_rows = max_rows, @@ -156,7 +164,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; size_t inductive_nodes = 0; - if (config_.inductive_training_) { + if (config_.inductive_training_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -179,10 +187,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); - // TODO incorporate validation/test intervals for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); - if (config_.inductive_training_) { + if (config_.inductive_training_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { @@ -190,7 +197,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } - if (config_.do_sampling()) { + if (config_.do_sampling() && !config_.train_minibatch_size()) { graph_->SetupNeighborhoodSample(); size_t num_sampled_layers = 0; @@ -201,7 +208,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 5); + if (num_sampled_layers == 0) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + } else { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + } num_sampled_layers++; } } @@ -215,11 +226,59 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } - const PointerWithSize predictions = DoInference(); - // have to get accuracy here because gradient prop destroys the predictions - // matrix - train_accuracy = GetGlobalAccuracy(predictions); - GradientPropagation(); + if (!config_.train_minibatch_size()) { + // no minibatching, full batch + const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the + // predictions matrix + train_accuracy = GetGlobalAccuracy(predictions); + 
GradientPropagation(); + } else { + graph_->ResetTrainMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + size_t batch_num = 0; + + // XXX + // create mini batch graphs and loop until minibatches on all hosts done + while (true) { + galois::gInfo("Epoch ", epoch, " batch ", batch_num++); + // break when all hosts are done with minibatches + graph_->PrepareNextTrainMinibatch(); + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + if (num_sampled_layers == 0) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + } else { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + } + num_sampled_layers++; + } + } + // resize layer matrices + size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + galois::gPrint(num_subgraph_nodes, "\n"); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(num_subgraph_nodes); + } + + const PointerWithSize batch_pred = DoInference(); + DoInference(); + train_accuracy = GetGlobalAccuracy(batch_pred); + GradientPropagation(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num, + ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + // XXX sync across all hosts minibatcher state + if (!graph_->MoreTrainMinibatches()) { + break; + } + } + } epoch_timer.stop(); if (this_host == 0) { diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp new file mode 100644 index 0000000000..7c3b6dd831 --- /dev/null +++ b/libgnn/src/MinibatchGenerator.cpp @@ -0,0 +1,33 @@ +#include "galois/MinibatchGenerator.h" +#include + +void galois::MinibatchGenerator::GetNextMinibatch( + std::vector* batch_mask) { + std::fill(batch_mask->begin(), batch_mask->end(), 0); + assert(current_position_ <= mask_to_minibatch_.size()); + assert(batch_mask->size() == mask_to_minibatch_.size()); + if (current_position_ >= mask_to_minibatch_.size()) { + return; + } + + size_t current_count = 0; + // start from last positiion + while (current_position_ < mask_to_minibatch_.size()) { + if (mask_to_minibatch_[current_position_]) { + // XXX and a master node; seed nodes only exist locally + (*batch_mask)[current_position_] = 1; + current_count++; + } + // break when minibatch is large enough + current_position_++; + if (current_count == minibatch_size_) + break; + } + + // advance current position to next set bit for next call (or to end to detect + // no more minibatches + while (!mask_to_minibatch_[current_position_] && + (current_position_ < mask_to_minibatch_.size())) { + current_position_++; + } +} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1a288365bd..d46f75305f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -99,6 +99,10 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // init norm factors (involves a sync call) InitNormFactor(); + // XXX remove this + test_batcher_ = + std::make_unique(local_testing_mask_, 2000); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // allocate/copy data structures over to GPU @@ -158,7 +162,7 @@ bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( bool galois::graphs::GNNGraph::IsValidForPhaseMasked( const unsigned lid, const galois::GNNPhase current_phase) const { // 
select mask to use based on phase - const std::vector* mask_to_use; + const GNNMask* mask_to_use; switch (current_phase) { case GNNPhase::kTrain: mask_to_use = &local_training_mask_; @@ -175,6 +179,9 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( } mask_to_use = &other_mask_; break; + case GNNPhase::kBatch: + mask_to_use = &local_minibatch_mask_; + break; default: GALOIS_LOG_FATAL("Invalid phase used"); mask_to_use = nullptr; @@ -246,84 +253,6 @@ void galois::graphs::GNNGraph::AggregateSync( } #endif -void galois::graphs::GNNGraph::UniformNodeSample(float droprate) { - galois::do_all( - galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { - partitioned_graph_->getData(*x) = sample_rng_.DoBernoulli(droprate); - }); - // TODO(loc) GPU - // TODO(loc) sync the flags across all machines to have same sample on all of - // them -} - -// TODO(loc) does not work in a distributed setting: assumes the partitioned -// graph is the entire graph -void galois::graphs::GNNGraph::GraphSAINTSample(size_t num_roots, - size_t walk_depth) { - // reset sample - galois::do_all(galois::iterate(begin(), end()), - [&](size_t n) { partitioned_graph_->getData(n) = 0; }); - - galois::on_each([&](size_t thread_id, size_t num_threads) { - size_t my_start = 0; - size_t my_end = 0; - std::tie(my_start, my_end) = - galois::block_range(size_t{0}, num_roots, thread_id, num_threads); - size_t thread_roots = my_end - my_start; - size_t train_range = global_training_mask_range_.size; - // init RNG - drand48_data seed_struct; - srand48_r(sample_rng_.GetRandomNumber() * thread_id * num_threads, - &seed_struct); - - for (size_t root_num = 0; root_num < thread_roots; root_num++) { - // pick a random training node root at random (with replacement); - size_t root = 0; - while (true) { - long int rand_num; - lrand48_r(&seed_struct, &rand_num); - root = global_training_mask_range_.begin + (rand_num % train_range); - if (IsValidForPhase(root, GNNPhase::kTrain)) { - break; - } - } - // mark this root as sampled - SetSampledNode(root); - assert(IsInSampledGraph(root)); - - // sample more nodes based on depth of the walk - for (size_t current_depth = 0; current_depth < walk_depth; - current_depth++) { - // pick random edge, mark sampled, swap roots - EdgeIterator first_edge = edge_begin(root); - size_t num_edges = std::distance(first_edge, edge_end(root)); - if (num_edges == 0) { - break; - } - - // must select training neighbor: if it doesn't, then ignore and - // continue - // To prevent infinite loop in case node has NO training neighbor, - // this implementation will not loop until one is found and will - // not find full depth if it doesn't find any training nodes randomly - long int rand_num; - lrand48_r(&seed_struct, &rand_num); - EdgeIterator selected_edge = first_edge + (rand_num % num_edges); - size_t candidate_dest = GetEdgeDest(selected_edge); - - // TODO(loc) another possibility is to just pick it anyways regardless - // but don't mark it as sampled, though this would lead to disconnected - // graph - if (IsValidForPhase(candidate_dest, GNNPhase::kTrain)) { - SetSampledNode(candidate_dest); - assert(IsInSampledGraph(candidate_dest)); - root = candidate_dest; - } - } - } - }); -} - void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); @@ -470,7 +399,7 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( //! 
given a name, mask type, and arrays to save into size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, char* masks) { + GNNRange* mask_range, std::vector* masks) { size_t range_begin; size_t range_end; @@ -504,7 +433,7 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( if (mask == 1) { valid_count++; if (partitioned_graph_->isLocal(cur_line_num)) { - masks[partitioned_graph_->getLID(cur_line_num)] = 1; + (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } } @@ -587,13 +516,13 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // XXX i can get local sample counts from here if i need it size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, - local_training_mask_.data()); + &local_training_mask_); size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", &global_validation_mask_range_, - local_validation_mask_.data()); + &local_validation_mask_); size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", &global_testing_mask_range_, - local_testing_mask_.data()); + &local_testing_mask_); valid_other_ = FindOtherMask(); // the "other" set of nodes that don't fall into any classification if (galois::runtime::getSystemNetworkInterface().ID == 0) { @@ -671,13 +600,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( galois::iterate(begin_owned(), end_owned()), // this is possibly the subgraph id [&](const unsigned node_id) { - unsigned lid = node_id; - if (use_subgraph_) { - // convert SID over to LID - lid = subgraph_->SIDToLID(node_id); - } - - if (IsValidForPhase(lid, phase)) { + if (IsValidForPhase(node_id, phase)) { total_checked_ += 1; // get prediction by getting max // note the use of node_id here: lid only used to check original @@ -687,7 +610,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( // check against ground truth and track accordingly // TODO static cast used here is dangerous if (predicted_label == - static_cast(GetSingleClassLabel(lid))) { + static_cast(GetSingleClassLabel(node_id))) { num_correct_ += 1; } } @@ -839,14 +762,14 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } } -void galois::graphs::GNNGraph::SetupNeighborhoodSample() { +void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; new_sampled_nodes_.resize(size()); new_sampled_nodes_.reset(); // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsValidForPhase(*x, GNNPhase::kTrain)) { + if (IsValidForPhase(*x, seed_phase)) { SetSampledNode(*x); } else { UnsetSampledNode(*x); @@ -1006,6 +929,11 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { return num_subgraph_nodes; } +void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_); + SetupNeighborhoodSample(GNNPhase::kBatch); +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9d6ca7c5cc..22178ee2fa 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -187,7 +187,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::StatTimer timer("BackwardPhase", kRegionName); timer.start(); - assert(layer_phase_ == 
GNNPhase::kTrain); + assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); // derivative of activation if (!config_.disable_activation) { diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index f7a345050d..beccf42289 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -20,6 +20,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( [&](const unsigned i) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); return; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 0bc508963d..921baaa4df 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,6 +114,11 @@ llvm::cl::opt "all non-train nodes are ignored"), cll::init(false)); +llvm::cl::opt + train_minibatch_size("trainMinibatchSize", + cll::desc("Size of training minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -264,9 +269,11 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; + gnn_config.inductive_training_ = do_inductive_training; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 69b5268d4b33095c2121161a9fb0b11cf840352a Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 3 May 2021 21:35:24 -0500 Subject: [PATCH 528/660] GPU update (#2) * Implement multi-GPU sage layer and test * Temporarily disable unsupported options for GPU on gpu-conv * Implement GPU Relu activation and update tests * Temporarily modify PairGlorotBengioInit() for GPUs * Implement collective MPI communications for weight synchronizations on the GPU side * Implement bitset aggregation, nonmaster masking, dropout matrix reconstruction for GPU * Remove backward correctness check of the layer 0 * Fix GPU sagelayer * Fix gpu-sage normalization * Fix proxy handling of reduce() * Add gnn log parser * Fix galois gnn log parsing scripit * Add gnn-gpu test script * Fix galois gnn log parsing script * Complete rebasing on the gnn * Add gnn experimental script * Distinguish synch() of CPU and GPU * Fix gpu conv test * Update degree normalization * Fix gpu-gnn tests * Avoid unnecessary CPU memory allocation on GPU * Add TODO for gpudirect * Rebase + optimize memory allocations Authored-by: Hochan Lee --- .../include/galois/runtime/SyncStructures.h | 7 +- .../include/galois/runtime/cuda/DeviceSync.h | 74 +++++ libgnn/CMakeLists.txt | 1 + libgnn/include/galois/CUDAUtil.h | 4 +- .../include/galois/GNNCudaContextHostDecls.h | 7 +- libgnn/include/galois/GNNMath.cuh | 7 + libgnn/include/galois/graphs/GNNGraph.cuh | 27 +- libgnn/include/galois/graphs/GNNGraph.h | 7 +- .../graphs/GraphAggregationSyncStructures.h | 3 +- libgnn/include/galois/layers/DenseLayer.h | 3 +- libgnn/include/galois/layers/GNNLayer.cuh | 31 +- libgnn/include/galois/layers/GNNLayer.h | 36 ++- .../galois/layers/GraphConvolutionalLayer.cuh | 10 +- 
libgnn/include/galois/layers/SAGELayer.cuh | 82 ++++++ libgnn/include/galois/layers/SAGELayer.h | 32 ++- libgnn/include/galois/layers/SoftmaxLayer.cuh | 2 + libgnn/src/GNNCudaContext.cu | 20 +- libgnn/src/GNNMath.cu | 27 +- libgnn/src/GNNOptimizers.cu | 2 +- libgnn/src/GraphNeuralNetwork.cpp | 24 +- libgnn/src/graphs/GNNGraph.cpp | 11 +- libgnn/src/graphs/GNNGraph.cu | 107 ++++++- libgnn/src/layers/DenseLayer.cpp | 4 + libgnn/src/layers/GNNLayer.cpp | 192 +++++++------ libgnn/src/layers/GNNLayer.cu | 144 +++++++++- libgnn/src/layers/GraphConvolutionalLayer.cpp | 43 ++- libgnn/src/layers/GraphConvolutionalLayer.cu | 92 ++++-- libgnn/src/layers/SAGELayer.cpp | 186 ++++++++---- libgnn/src/layers/SAGELayer.cu | 209 ++++++++++++++ libgnn/src/layers/SoftmaxLayer.cpp | 4 +- libgnn/src/layers/SoftmaxLayer.cu | 64 ++++- libgnn/test/CMakeLists.txt | 25 ++ libgnn/test/gpu-adam-test.cpp | 10 +- libgnn/test/gpu-aggregate-sync-test.cpp | 19 +- libgnn/test/gpu-back-conv-test.cpp | 167 +++++++++++ libgnn/test/gpu-convlayer-test.cpp | 168 +++++------ libgnn/test/gpu-epoch-test.cpp | 16 +- libgnn/test/gpu-sage-layer-test.cpp | 270 ++++++++++++++++++ libgnn/test/gpu-softmaxlayer-test.cpp | 39 +-- scripts/galois_gnn_log_parser.R | 221 ++++++++++++++ scripts/run-gpu.sh | 44 +++ scripts/run_gnnsys.sh | 57 ++++ 42 files changed, 2101 insertions(+), 397 deletions(-) create mode 100644 libgnn/include/galois/layers/SAGELayer.cuh create mode 100644 libgnn/src/layers/SAGELayer.cu create mode 100644 libgnn/test/gpu-back-conv-test.cpp create mode 100644 libgnn/test/gpu-sage-layer-test.cpp create mode 100644 scripts/galois_gnn_log_parser.R create mode 100644 scripts/run-gpu.sh create mode 100644 scripts/run_gnnsys.sh diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 44264461cd..75398c4f02 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1920,17 +1920,17 @@ class FieldFlags { static bool is_valid() { return true; } \ \ static galois::DynamicBitSet& get() { \ - if (personality == GPU_CUDA) \ + if (device_personality == DevicePersonality::GPU_CUDA) \ get_bitset_##fieldname##_cuda( \ cuda_ctx, (uint64_t*)bitset_##fieldname.get_vec().data()); \ return bitset_##fieldname; \ } \ \ static void reset_range(size_t begin, size_t end) { \ - if (personality == GPU_CUDA) { \ + if (device_personality == DevicePersonality::GPU_CUDA) { \ bitset_##fieldname##_reset_cuda(cuda_ctx, begin, end); \ } else { \ - assert(personality == CPU); \ + assert(device_personality == DevicePersonality::CPU); \ bitset_##fieldname.reset(begin, end); \ } \ } \ @@ -2079,5 +2079,4 @@ class FieldFlags { } \ }; #endif - #endif // header guard diff --git a/libgluon/include/galois/runtime/cuda/DeviceSync.h b/libgluon/include/galois/runtime/cuda/DeviceSync.h index a9512b1cc1..6b49aa743f 100644 --- a/libgluon/include/galois/runtime/cuda/DeviceSync.h +++ b/libgluon/include/galois/runtime/cuda/DeviceSync.h @@ -425,6 +425,80 @@ void reset_bitset_field(struct CUDA_Context_Field* field, mask1, test2, bit_index2, mask2); } +// TODO(lhc) we may not need this later, but for now just use this +void reset_bitset_field(Shared& bitset, size_t begin, + size_t end) { + dim3 blocks; + dim3 threads; + kernel_sizing(blocks, threads); + const DynamicBitset* bitset_cpu = bitset.cpu_rd_ptr(); + assert(begin <= (bitset_cpu->size() - 1)); + assert(end <= (bitset_cpu->size() - 1)); + + size_t vec_begin = (begin + 63) / 64; + size_t vec_end; + + if 
(end == (bitset_cpu->size() - 1)) + vec_end = bitset_cpu->vec_size(); + else + vec_end = (end + 1) / 64; // floor + + size_t begin2 = vec_begin * 64; + size_t end2 = vec_end * 64; + + bool test1; + size_t bit_index1; + uint64_t mask1; + + bool test2; + size_t bit_index2; + uint64_t mask2; + + if (begin2 > end2) { + test2 = false; + + if (begin < begin2) { + test1 = true; + bit_index1 = begin / 64; + size_t diff = begin2 - begin; + assert(diff < 64); + mask1 = ((uint64_t)1 << (64 - diff)) - 1; + + // create or mask + size_t diff2 = end - end2 + 1; + assert(diff2 < 64); + mask2 = ~(((uint64_t)1 << diff2) - 1); + mask1 |= ~mask2; + } else { + test1 = false; + } + } else { + if (begin < begin2) { + test1 = true; + bit_index1 = begin / 64; + size_t diff = begin2 - begin; + assert(diff < 64); + mask1 = ((uint64_t)1 << (64 - diff)) - 1; + } else { + test1 = false; + } + + if (end >= end2) { + test2 = true; + bit_index2 = end / 64; + size_t diff = end - end2 + 1; + assert(diff < 64); + mask2 = ~(((uint64_t)1 << diff) - 1); + } else { + test2 = false; + } + } + + bitset_reset_range<<>>(bitset.gpu_rd_ptr(), vec_begin, + vec_end, test1, bit_index1, mask1, + test2, bit_index2, mask2); +} + template void reset_data_field(struct CUDA_Context_Field* field, size_t begin, size_t end, DataType val) { diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 46ea0fd67c..c5d9ee6e7a 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -41,6 +41,7 @@ if (GALOIS_ENABLE_GPU) src/layers/GNNLayer.cu src/layers/GraphConvolutionalLayer.cu src/layers/SoftmaxLayer.cu + src/layers/SAGELayer.cu src/GraphNeuralNetwork.cu src/GNNOptimizers.cu src/GNNCudaContext.cu diff --git a/libgnn/include/galois/CUDAUtil.h b/libgnn/include/galois/CUDAUtil.h index fd51eb1362..e19b0d9525 100644 --- a/libgnn/include/galois/CUDAUtil.h +++ b/libgnn/include/galois/CUDAUtil.h @@ -58,8 +58,8 @@ inline int CUDA_GET_BLOCKS(const int N) { //! Basic kernel loop for CUDA threads //! Caffe describes it as "grid stride" -#define CUDA_KERNEL_LOOP(i, n) \ - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ +#define CUDA_KERNEL_LOOP(i, s, e) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x + s; i < (e); \ i += blockDim.x * gridDim.x) //! 
Wrap a CuBLAS call with this to check if it threw any errors diff --git a/libgnn/include/galois/GNNCudaContextHostDecls.h b/libgnn/include/galois/GNNCudaContextHostDecls.h index fea68d5fec..58c45c3b97 100644 --- a/libgnn/include/galois/GNNCudaContextHostDecls.h +++ b/libgnn/include/galois/GNNCudaContextHostDecls.h @@ -1,5 +1,4 @@ #pragma once - #include "galois/cuda/HostDecls.h" extern int gpudevice; @@ -7,6 +6,7 @@ extern int gpudevice; void load_graph_CUDA_GNN(struct CUDA_Context* ctx, PartitionedGraphInfo& g, unsigned num_hosts); void resize_CUDA_layer_vector(struct CUDA_Context* ctx, size_t num_layers); +void resize_CUDA_bitset(struct CUDA_Context* ctx, size_t bitset_size); void init_CUDA_layer_vector_meta_obj(struct CUDA_Context* ctx, unsigned layer_number, unsigned num_hosts, unsigned nnodes, size_t infl_in_size, @@ -71,6 +71,11 @@ void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number); +void get_bitset_graph_aggregate_cuda(struct CUDA_Context* ctx, + uint64_t* bitset_compute); + +void bitset_graph_aggregate_reset_cuda(struct CUDA_Context* ctx, size_t begin, + size_t end); void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, size_t column_size, size_t num_nodes, diff --git a/libgnn/include/galois/GNNMath.cuh b/libgnn/include/galois/GNNMath.cuh index 1b262fa6a3..a50e8974ba 100644 --- a/libgnn/include/galois/GNNMath.cuh +++ b/libgnn/include/galois/GNNMath.cuh @@ -25,6 +25,12 @@ void CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output); +void CBlasSGEMMGPU(const cublasOperation_t trans_a, + const cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, GNNFloat* output, + bool accumulate); + //! Runs softmax + cross entropy on masked nodes. Will not overwrite all of //! the output, so make sure it's been zero'd out beforehand. //! At this point in time cross entropy is ignored because it only calculates a @@ -48,5 +54,6 @@ SoftmaxCrossEntropyBackward(char* mask, size_t num_nodes, size_t feature_length, __device__ void DoSoftmax(size_t vector_length, const GNNFloat* input, GNNFloat* output); +__device__ void GPUVectorZero(size_t vector_length, GNNFloat* vec); } // namespace galois #endif diff --git a/libgnn/include/galois/graphs/GNNGraph.cuh b/libgnn/include/galois/graphs/GNNGraph.cuh index 2012dcd7c9..6b6ff2bb74 100644 --- a/libgnn/include/galois/graphs/GNNGraph.cuh +++ b/libgnn/include/galois/graphs/GNNGraph.cuh @@ -23,8 +23,8 @@ public: //! Copy over masks for the 3 sets to GPU void SetMasks(const std::vector& train, const std::vector& val, const std::vector& test); - //! Copy over norm factors - void SetNormFactors(const std::vector norm_factors); + + void AllocAggregateBitset(size_t size); GNNFeature* feature_vector() const { return feature_vector_; }; int* edge_index() const { return edge_index_; } @@ -33,7 +33,19 @@ public: char* local_training_mask() const { return local_training_mask_; } char* local_validation_mask() const { return local_validation_mask_; } char* local_testing_mask() const { return local_testing_mask_; } - GNNFloat* norm_factors() const { return norm_factors_; } + + //! Get the total degree of the partitioned graph + uint32_t* get_global_degrees() const { return global_degrees_; } + //! 
Get the total degree of the sampled subgraph + uint32_t* get_global_train_degrees() const { return global_train_degrees_; } + //! Allocate memory to objects related to normalization + void InitNormFactor(size_t num_nodes); + //! Copy degree of the partitioned graph from CPU + void SetGlobalDegrees(const std::vector global_degrees); + //! Copy degree of the sampled subgraph from CPU + void SetGlobalTrainDegrees(const std::vector global_train_degrees); + + void CopyToCPU(const PointerWithSize& input); private: // ALL THESE VARIABLES ARE DEVICE SIDE (GPU) POINTERS @@ -53,14 +65,19 @@ private: int* edge_destinations_{nullptr}; //! (Local) feature vector GNNFeature* feature_vector_{nullptr}; + //! (Local) ground truth vector GNNLabel* ground_truth_{nullptr}; + // masks for phases char* local_training_mask_{nullptr}; char* local_validation_mask_{nullptr}; char* local_testing_mask_{nullptr}; - //! Norm factors used during aggregation - GNNFloat* norm_factors_; + + uint32_t* global_degrees_{nullptr}; + size_t global_degree_size_{0}; + uint32_t* global_train_degrees_{nullptr}; + size_t global_train_degree_size_{0}; }; } // namespace graphs diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index ded867787c..2d4bb5356b 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -489,13 +489,14 @@ class GNNGraph { void CalculateFullNormFactor(); #ifdef GALOIS_ENABLE_GPU - void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const; + void AggregateSyncGPU(GNNFloat* matrix_to_sync, + const size_t matrix_column_size, + const unsigned layer_number) const; void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, size_t infl_in_size, size_t infl_out_size); - void ResizeLayerVector(size_t num_layers); + void ResizeGPULayerVector(size_t num_layers); const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index bcf7ed5078..0dd43c3308 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -203,6 +203,7 @@ struct GNNSampleSumAggregate { }; #ifdef GALOIS_ENABLE_GPU +extern struct CUDA_Context* cuda_ctx; GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); @@ -210,7 +211,7 @@ GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_output, cuda_ctx_for_sync, gnn_matrix_to_sync_column_length_, layer_number_to_sync); #endif - GALOIS_SYNC_STRUCTURE_BITSET(graph_aggregate); + } // namespace graphs } // namespace galois diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index 7b00d1987c..e7dc46e9f3 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -1,3 +1,4 @@ + #pragma once #include "galois/layers/GNNLayer.h" @@ -50,7 +51,7 @@ class DenseLayer : public GNNLayer { #ifdef GALOIS_ENABLE_GPU // TODO(hochan/loc) replace with dense gpu object - GCNGPUAllocations gpu_object_; + // GCNGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/include/galois/layers/GNNLayer.cuh b/libgnn/include/galois/layers/GNNLayer.cuh index 9dfd09e0da..439faad738 100644 --- a/libgnn/include/galois/layers/GNNLayer.cuh +++ b/libgnn/include/galois/layers/GNNLayer.cuh @@ -16,12 +16,16 @@ public: 
void InitDropoutMemory(size_t dropout_size); //! Copy provided data in vector to GPU weights void CopyToWeights(const std::vector& cpu_layer_weights); + //! Copy provided data in vector to GPU weight gradients + void CopyToWeightGradients(const std::vector& cpu_gradients); //! Copy GPU forward output to the provided vector (assumes vector is already //! correct size) - void CopyForwardOutputToCPU(std::vector* cpu_forward_output); + void CopyForwardOutputToCPU(GNNFloat* cpu_forward_output, + size_t forward_output_size); //! Copy GPU backward output to the provided vector (assumes vector is already //! correct size) - void CopyBackwardOutputToCPU(std::vector* cpu_backward_output); + void CopyBackwardOutputToCPU(GNNFloat* cpu_backward_output, + size_t backward_output_size); //! Copy GPU weight gradients to the provided vector (assumes vector is //! already correct size) void CopyWeightGradientsToCPU(std::vector* cpu_gradients); @@ -29,6 +33,9 @@ public: //! Prints forward output matrix on gpu void PrintForwardOutput(size_t num); + //! Prints backward output matrix on gpu + void PrintBackwardOutput(size_t num); + //! Does dropout on the GPU; saves non-dropped weights to output void DoDropoutGPU(const PointerWithSize input_to_dropout, PointerWithSize output, float dropout_rate); @@ -39,11 +46,30 @@ public: //! memory is allocated as necessary) GNNFloat* Allocate(const std::vector& v); + //! Initializes vectors on GPU to 1 + void InitGPUVectorTo1(GNNFloat* vector, size_t vector_size); + + //! Apply an activation function + void ActivationGPU(size_t num_forward_output_elements); + //! Apply an activation function for derivative + void ActivationDerivativeGPU(GNNFloat* gradients, + size_t num_gradients_elements); + void + ReconstructDropoutMatrixGPU(const PointerWithSize input_to_drouput, + PointerWithSize* output_matrix, + size_t num_elements, GNNFloat scale); + + void MaskNonMastersGPU(PointerWithSize* input, size_t start_node, + size_t end_node, size_t row_index); + GNNFloat* forward_output() { return forward_output_matrix_; } GNNFloat* backward_output() { return backward_output_matrix_; } GNNFloat* layer_weights() { return layer_weights_; } GNNFloat* layer_weight_gradients() { return layer_weight_gradients_; } + void CopyToCPU(PointerWithSize* input); + void CopyToCPU(GNNFloat* input, size_t size); + private: size_t* num_weights_{nullptr}; GNNFloat* forward_output_matrix_{nullptr}; @@ -52,6 +78,7 @@ private: GNNFloat* layer_weight_gradients_{nullptr}; GNNFloat* rng_results_{nullptr}; char* dropout_mask_{nullptr}; + uint8_t* activation_memo_{nullptr}; }; } // namespace galois diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 5cfe69b83e..9a71432471 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -8,6 +8,9 @@ #include "galois/layers/GNNLayer.cuh" #endif +//#define PRINT_VEC_LOG_ +//#define PRINT_GPU_VEC_ + namespace galois { //! Supported layer types in the GNN @@ -185,16 +188,29 @@ class GNNLayer { PointerWithSize AllocateGPU(const std::vector& v) { return PointerWithSize(base_gpu_object_.Allocate(v), v.size()); } + //! 
Copies over forward output results to CPU from GPU - const std::vector& CopyForwardOutputFromGPU() { - base_gpu_object_.CopyForwardOutputToCPU(&forward_output_matrix_); - return forward_output_matrix_; + const std::vector CopyForwardOutputFromGPU() { + size_t cpu_forward_output_size = p_forward_output_matrix_.size(); + GNNFloat* cpu_forward_output = + (GNNFloat*)malloc(cpu_forward_output_size * sizeof(GNNFloat)); + base_gpu_object_.CopyForwardOutputToCPU(cpu_forward_output, + cpu_forward_output_size); + return std::vector(cpu_forward_output, + cpu_forward_output + cpu_forward_output_size); } + //! Copies over backward output results to CPU from GPU - const std::vector& CopyBackwardOutputFromGPU() { - base_gpu_object_.CopyBackwardOutputToCPU(&backward_output_matrix_); - return backward_output_matrix_; + const PointerWithSize CopyBackwardOutputFromGPU() { + size_t cpu_backward_output_size = p_backward_output_matrix_.size(); + GNNFloat* cpu_backward_output = + (GNNFloat*)malloc(cpu_backward_output_size * sizeof(GNNFloat)); + base_gpu_object_.CopyBackwardOutputToCPU(cpu_backward_output, + cpu_backward_output_size); + return PointerWithSize(cpu_backward_output, + cpu_backward_output_size); } + //! Copies over weight gradients to CPU from GPU const std::vector& CopyWeightGradientsFromGPU() { base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); @@ -204,6 +220,10 @@ class GNNLayer { void PrintForwardOutputGPU() { base_gpu_object_.PrintForwardOutput(forward_output_matrix_.size()); } + + void PrintBackwardOutputGPU() { + base_gpu_object_.PrintBackwardOutput(p_backward_output_matrix_.size()); + } #endif protected: @@ -293,6 +313,7 @@ class GNNLayer { //! Does some activation function based on configuration on forward output //! matrix void Activation(); + void ActivationCPU(); //! 
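  // (Illustrative aside, not part of this patch.) Both the CPU and GPU ReLU
  // paths rely on a saved mask: the forward pass records which outputs were
  // positive (activation_memo_; a plain uint8_t array on the GPU side) and
  // the backward pass zeroes the gradient wherever that record is unset.
  // Conceptually, per element i:
  //   forward:  memo[i] = (out[i] > 0);  out[i] = memo[i] ? out[i] : 0;
  //   backward: grad[i] = memo[i] ? grad[i] : 0;
  //!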
Calculate derivative of activation function based on config on the matrix void ActivationDerivative(PointerWithSize* matrix); @@ -317,6 +338,9 @@ class GNNLayer { double FloatElementsToGB(size_t num_of_floats) const { return num_of_floats * double{4} / (1 << 30); } + + void MaskNonMastersGPU(PointerWithSize* input, size_t start_node, + size_t end_node, size_t row_index); }; } // namespace galois diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh index c59617828d..51a167b9c1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.cuh @@ -9,8 +9,11 @@ class GCNGPUAllocations { public: // free memory ~GCNGPUAllocations(); - // allocate the 3 temp arrays - void Allocate(size_t input_elements, size_t output_elements); + + void AllocateInTemp1(const size_t size); + void AllocateInTemp2(const size_t size); + void AllocateOutTemp(const size_t size); + GNNFloat* in_temp_1() { return in_temp_1_; } GNNFloat* in_temp_2() { return in_temp_2_; } GNNFloat* out_temp() { return out_temp_; } @@ -18,7 +21,8 @@ public: void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, bool use_norm); + GNNFloat* aggregate_output, bool use_norm, + bool disable_self_aggregate, size_t last_master); void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, size_t output_columns, diff --git a/libgnn/include/galois/layers/SAGELayer.cuh b/libgnn/include/galois/layers/SAGELayer.cuh new file mode 100644 index 0000000000..05f9e8556c --- /dev/null +++ b/libgnn/include/galois/layers/SAGELayer.cuh @@ -0,0 +1,82 @@ +#pragma once +#include "galois/GNNTypes.h" +#include "galois/graphs/GNNGraph.cuh" + +namespace galois { + +//! 
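// (Illustrative aside, not part of this patch; h_v, x_v, W, W_2 are notation
// only.) Compared with the GCN allocations, this object also owns the second
// ("self") weight matrix and its gradients, because per node v the SAGE layer
// roughly computes
//   h_v = aggregate(neighbor embeddings) * W + x_v * W_2
// with the self-feature GEMMs accumulating into the same output buffer. Its
// aggregation kernel also normalizes by 1/degree rather than the symmetric
// 1/sqrt(d_src + 1) * 1/sqrt(d_dst + 1) factor used by the GCN path.
//!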
Holds pointers for GPU memory for SAGE layer +class SAGEGPUAllocations { +public: + // free memory + ~SAGEGPUAllocations(); + + // allocate the 3 temp arrays + void AllocateInTemp1(const size_t size); + void AllocateInTemp2(const size_t size); + void AllocateOutTemp(const size_t size); + + GNNFloat* in_temp_1() { return in_temp_1_; } + GNNFloat* in_temp_2() { return in_temp_2_; } + GNNFloat* out_temp() { return out_temp_; } + + void AllocateWeight2(const size_t size); + void AllocateWeightGradient2(const size_t size); + + GNNFloat* layer_weights_2() { return layer_weights_2_; } + GNNFloat* layer_weight_gradients_2() { return layer_weight_gradients_2_; } + + void AggregateAllGPU(const graphs::GNNGraphGPUAllocations& gpu_graph, + size_t num_nodes, size_t column_length, + const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, bool use_norm, + bool is_backward); + + void UpdateEmbeddingsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, GNNFloat* output); + void UpdateEmbeddingsDerivativeGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + const GNNFloat* layer_weights, + GNNFloat* output); + + void GetWeightGradientsGPU(size_t num_nodes, size_t input_columns, + size_t output_columns, const GNNFloat* prev_input, + const GNNFloat* gradients, GNNFloat* output); + + void SelfFeatureUpdateEmbeddingsGPU(size_t input_rows, size_t input_columns, + size_t output_columns, + const GNNFloat* node_embeddings, + GNNFloat* output); + + void SelfFeatureUpdateEmbeddingsDerivativeGPU(size_t input_rows, + size_t output_columns, + size_t input_columns, + const GNNFloat* gradients, + GNNFloat* output); + + void UpdateWeight2DerivativeGPU(size_t input_columns, size_t input_rows, + size_t output_columns, + const GNNFloat* prev_layer_inputs, + const GNNFloat* input_gradients, + GNNFloat* output); + + //! Copy provided data in vector to GPU self weight + void CopyToWeights2(const std::vector& cpu_layer_weights); + //! Copy provided data in vector to GPU self weight gradients + void CopyToWeight2Gradients(const std::vector& cpu_gradients); + + //! Copy GPU self weight gradients to the provided vector (assumes vector is + //! 
already correct size) + void CopyWeight2GradientsToCPU(std::vector* cpu_gradients); + +private: + GNNFloat* in_temp_1_{nullptr}; + GNNFloat* in_temp_2_{nullptr}; + GNNFloat* out_temp_{nullptr}; + GNNFloat* layer_weights_2_{nullptr}; + GNNFloat* layer_weight_gradients_2_{nullptr}; +}; + +} // namespace galois diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index dd9ceb6e7b..3f12978663 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -3,7 +3,7 @@ #include "galois/layers/GradientSyncStructures.h" #ifdef GALOIS_ENABLE_GPU -// TODO(loc/hochan) +#include "galois/layers/SAGELayer.cuh" #endif namespace galois { @@ -53,9 +53,21 @@ class SAGELayer : public GNNLayer { } void InitSelfWeightsTo1() { - if (layer_weights_2_.size()) { - layer_weights_2_.assign(layer_weights_2_.size(), 1); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + size_t layer_weights_2_size = p_layer_weights_2_.size(); + if (layer_weights_2_size > 0) { + base_gpu_object_.InitGPUVectorTo1(gpu_object_.layer_weights_2(), + layer_weights_2_size); + } + } else { +#endif + if (layer_weights_2_.size()) { + layer_weights_2_.assign(layer_weights_2_.size(), 1); + } +#ifdef GALOIS_ENABLE_GPU } +#endif } //! Returns the 2nd set of weight gradients @@ -71,6 +83,17 @@ class SAGELayer : public GNNLayer { BackwardPhase(PointerWithSize prev_layer_input, PointerWithSize* input_gradient) final; +#ifdef GALOIS_ENABLE_GPU + //! Copies over self weight gradients to CPU from GPU + const std::vector& CopyWeight2GradientsFromGPU() { + if (!layer_weight_gradients_2_.size()) { + layer_weight_gradients_2_.resize(p_layer_weight_gradients_2_.size()); + } + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + return layer_weight_gradients_2_; + } +#endif + private: static const constexpr char* kRegionName = "SAGELayer"; //! 
CPU aggregation @@ -143,8 +166,7 @@ class SAGELayer : public GNNLayer { output_column_intermediates_; #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) - GCNGPUAllocations gpu_object_; + SAGEGPUAllocations gpu_object_; #endif }; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.cuh b/libgnn/include/galois/layers/SoftmaxLayer.cuh index 8e1e5d21d7..6387edaeb6 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.cuh +++ b/libgnn/include/galois/layers/SoftmaxLayer.cuh @@ -20,6 +20,8 @@ public: size_t feature_length, const GNNFloat* predictions, GNNFloat* output_gradient); + void CopyToCPU(GNNFloat* input, size_t size); + private: char* train_mask_; char* val_mask_; diff --git a/libgnn/src/GNNCudaContext.cu b/libgnn/src/GNNCudaContext.cu index d0512f8e72..28589da00c 100644 --- a/libgnn/src/GNNCudaContext.cu +++ b/libgnn/src/GNNCudaContext.cu @@ -6,6 +6,8 @@ #include "galois/runtime/cuda/DeviceSync.h" #include "galois/GNNCudaContextHostDecls.h" +extern Shared cuda_bitset_graph_aggregate; + // The forward declaration is in the original Context.h file; as long as // pointers to it are used it shouldn't be an issue (since space usage is // unknown at that point) @@ -120,7 +122,7 @@ void batch_set_mirror_node_layer_input_matrix_cuda( void batch_get_reset_node_layer_input_matrix_cuda( struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, DataCommMode* mode, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, buf_size, mode, column_size); } @@ -130,7 +132,7 @@ void batch_get_reset_node_layer_input_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_input_matrix[layer_number], from_id, buf, column_size); } @@ -189,7 +191,7 @@ void batch_set_mirror_node_layer_output_matrix_cuda( void batch_get_reset_node_layer_output_matrix_cuda( struct CUDA_Context* ctx, unsigned from_id, uint8_t* buf, size_t* buf_size, DataCommMode* mode, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, buf_size, mode, column_size); } @@ -199,10 +201,20 @@ void batch_get_reset_node_layer_output_matrix_cuda(struct CUDA_Context* ctx, uint8_t* buf, size_t column_size, unsigned layer_number) { - batch_get_shared_field( + batch_get_shared_field( ctx, &ctx->layer_output_matrix[layer_number], from_id, buf, column_size); } +void get_bitset_graph_aggregate_cuda(struct CUDA_Context*, + uint64_t* bitset_compute) { + cuda_bitset_graph_aggregate.cpu_rd_ptr()->copy_to_cpu(bitset_compute); +} + +void bitset_graph_aggregate_reset_cuda(struct CUDA_Context*, size_t begin, + size_t end) { + reset_bitset_field(cuda_bitset_graph_aggregate, begin, end); +} + void cudaSetLayerInputOutput(struct CUDA_Context* ctx, GNNFloat* layer_matrix, size_t column_size, size_t num_nodes, unsigned layer_number) { diff --git a/libgnn/src/GNNMath.cu b/libgnn/src/GNNMath.cu index 8771b75d5b..8305990fc8 100644 --- a/libgnn/src/GNNMath.cu +++ b/libgnn/src/GNNMath.cu @@ -30,20 +30,29 @@ void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, size_t input_columns, size_t output_columns, const GNNFloat* a, const GNNFloat* b, GNNFloat* output) { + CBlasSGEMMGPU(trans_a, trans_b, input_rows, input_columns, output_columns, a, + b, output, false); +} + +void galois::CBlasSGEMMGPU(const cublasOperation_t trans_a, + const 
cublasOperation_t trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const GNNFloat* a, const GNNFloat* b, + GNNFloat* output, bool accumulate) { if (!cublas_is_init) { InitCuBLAS(); } size_t lead_dim_a = (trans_a == CUBLAS_OP_N) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CUBLAS_OP_N) ? output_columns : input_columns; - float dummy0 = 0.0; - float dummy1 = 1.0; + float beta = (accumulate) ? 1.0 : 0.0; + float dummy0 = 1.0; // because cusparse assumes column major even though we're passing in row // major, the order of multiply is reversed so that it does what we // want anyways // https://stackoverflow.com/questions/56043539/cublassgemm-row-major-multiplication CUBLAS_CHECK(cublasSgemm(global_cublas_handle, trans_b, trans_a, - output_columns, input_rows, input_columns, &dummy1, - b, lead_dim_b, a, lead_dim_a, &dummy0, output, + output_columns, input_rows, input_columns, &dummy0, + b, lead_dim_b, a, lead_dim_a, &beta, output, output_columns)); CUDA_TEST("cublas sgemm failure"); } @@ -54,13 +63,15 @@ __global__ void galois::SoftmaxCrossEntropyForward( // NOTE: assumes that output is already 0'd out as it will not overwrite the // entire thing - CUDA_KERNEL_LOOP(i, num_nodes) { + CUDA_KERNEL_LOOP(i, 0, num_nodes) { if (mask[i] == 1) { galois::DoSoftmax(feature_length, input_embeddings + feature_length * i, output + feature_length * i); // ignoring crossentropy loss calculation for now because I'm not using // loss for anything + didn't bother allocating an array to store loss // anyways + } else { + galois::GPUVectorZero(feature_length, output + feature_length * i); } } } @@ -170,3 +181,9 @@ __device__ void galois::DoSoftmax(size_t vector_length, const GNNFloat* input, output[i] /= denominator; } } + +__device__ void galois::GPUVectorZero(size_t vector_length, GNNFloat* vec) { + for (size_t i = 0; i < vector_length; i++) { + vec[i] = 0; + } +} diff --git a/libgnn/src/GNNOptimizers.cu b/libgnn/src/GNNOptimizers.cu index 77f3e74f5f..840554ddd4 100644 --- a/libgnn/src/GNNOptimizers.cu +++ b/libgnn/src/GNNOptimizers.cu @@ -42,7 +42,7 @@ __global__ void DoAdamUpdate(const galois::GNNFloat* derivatives, galois::GNNFloat alpha, galois::GNNFloat beta1, galois::GNNFloat beta2, galois::GNNFloat epsilon, galois::GNNFloat beta1t, galois::GNNFloat beta2t) { - CUDA_KERNEL_LOOP(i, matrix_size) { + CUDA_KERNEL_LOOP(i, 0, matrix_size) { first_moment[i] = beta1 * first_moment[i] + (1.0 - beta1) * derivatives[i]; second_moment[i] = beta2 * second_moment[i] + (1.0 - beta2) * (derivatives[i] * derivatives[i]); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index dc2ebb2834..cb139191b4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -23,7 +23,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->ResizeLayerVector(config_.num_intermediate_layers()); + graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); } #endif // used for chaining layers together; begins as nullptr @@ -54,13 +54,6 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->InitLayerVectorMetaObjects( - i, galois::runtime::getSystemNetworkInterface().Num, - layer_dims.input_columns, 
layer_dims.output_columns); - } -#endif break; case GNNLayerType::kSAGE: gnn_layers_.push_back(std::move(std::make_unique( @@ -75,23 +68,25 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) l2 layer gpu -#endif break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique( i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) dense layer gpu -#endif break; default: GALOIS_LOG_FATAL("Invalid layer type during network construction"); } + // update output layer for next layer prev_output_layer = gnn_layers_.back()->GetForwardOutput(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif } // loop backward and find last GCN/SAGE (main) layer to disable activation @@ -385,6 +380,7 @@ galois::GraphNeuralNetwork::DoInference() { // start with graph features and pass it through all layers of the network galois::PointerWithSize layer_input = graph_->GetLocalFeatures(); + for (std::unique_ptr& ptr : gnn_layers_) { layer_input = ptr->ForwardPhase(layer_input); } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index d46f75305f..cb63fbe307 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -53,6 +53,7 @@ std::vector>* gnn_sampled_out_degrees_; #ifdef GALOIS_ENABLE_GPU struct CUDA_Context* cuda_ctx_for_sync; +struct CUDA_Context* cuda_ctx; unsigned layer_number_to_sync; #endif } // namespace graphs @@ -222,7 +223,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, } #ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::AggregateSync( +void galois::graphs::GNNGraph::AggregateSyncGPU( GNNFloat* matrix_to_sync, const size_t matrix_column_size, const unsigned layer_number) const { size_t layer_input_mtx_column_size = @@ -539,11 +540,11 @@ void galois::graphs::GNNGraph::InitNormFactor() { global_degrees_.resize(partitioned_graph_->size(), 0.0); global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); + gpu_memory_.InitNormFactor(partitioned_graph_->size()); } void galois::graphs::GNNGraph::CalculateFullNormFactor() { // TODO(loc) reset all degrees if this is called multiple times? 
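  // (Clarifying note, not part of this patch; SymNorm is a hypothetical
  // helper.) With the precomputed norm_factors_ array removed, normalization
  // is now derived on demand from the degree arrays that InitGPUMemory()
  // copies to the device: for an edge (src, dst) the GCN aggregation kernel
  // uses
  //   norm(src, dst) = 1/sqrt(degree[src] + 1) * 1/sqrt(degree[dst] + 1)
  // (and src_norm * src_norm for the self contribution), where a zero-degree
  // endpoint contributes a factor of 0. A host-side sketch of the same rule:
  //   float SymNorm(uint32_t d_src, uint32_t d_dst) {
  //     float a = d_src ? 1.0f / std::sqrt(float(d_src + 1)) : 0.0f;
  //     float b = d_dst ? 1.0f / std::sqrt(float(d_dst + 1)) : 0.0f;
  //     return a * b;
  //   }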
- // get the norm factor contribution for each node based on the GLOBAL graph galois::do_all( galois::iterate(static_cast(0), partitioned_graph_->size()), @@ -983,7 +984,9 @@ void galois::graphs::GNNGraph::InitGPUMemory() { gpu_memory_.SetLabels(local_ground_truth_labels_); gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, local_testing_mask_); - gpu_memory_.SetNormFactors(norm_factors_); + gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); + gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); + gpu_memory_.SetGlobalDegrees(global_degrees_); } void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( @@ -993,7 +996,7 @@ void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( infl_in_size, infl_out_size); } -void galois::graphs::GNNGraph::ResizeLayerVector(size_t num_layers) { +void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { resize_CUDA_layer_vector(cuda_ctx_, num_layers); } #endif diff --git a/libgnn/src/graphs/GNNGraph.cu b/libgnn/src/graphs/GNNGraph.cu index 96ba37db15..065e84be6c 100644 --- a/libgnn/src/graphs/GNNGraph.cu +++ b/libgnn/src/graphs/GNNGraph.cu @@ -1,5 +1,13 @@ +#include "gg.h" +#include "ggcuda.h" + +#include "galois/cuda/DynamicBitset.h" + #include "galois/CUDAUtil.h" #include "galois/graphs/GNNGraph.cuh" +#include "sharedptr.h" + +Shared cuda_bitset_graph_aggregate; galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { GALOIS_LOG_VERBOSE("Freeing GPU graph allocations"); @@ -13,6 +21,8 @@ galois::graphs::GNNGraphGPUAllocations::~GNNGraphGPUAllocations() { CUDA_FREE(local_training_mask_); CUDA_FREE(local_validation_mask_); CUDA_FREE(local_testing_mask_); + CUDA_FREE(global_degrees_); + CUDA_FREE(global_train_degrees_); } void galois::graphs::GNNGraphGPUAllocations::SetGraphTopology( @@ -83,11 +93,96 @@ void galois::graphs::GNNGraphGPUAllocations::SetMasks( test.size() * sizeof(char), cudaMemcpyHostToDevice)); } -void galois::graphs::GNNGraphGPUAllocations::SetNormFactors( - const std::vector norm_factors) { - CUDA_CHECK(cudaMalloc((void**)(&norm_factors_), - norm_factors.size() * sizeof(GNNFloat))); - CUDA_CHECK(cudaMemcpy(norm_factors_, norm_factors.data(), - norm_factors.size() * sizeof(GNNFloat), +void galois::graphs::GNNGraphGPUAllocations::InitNormFactor(size_t num_nodes) { + GALOIS_LOG_ASSERT(global_degrees_ == nullptr); + GALOIS_LOG_ASSERT(global_train_degrees_ == nullptr); + + CUDA_CHECK( + cudaMalloc((void**)(&global_degrees_), sizeof(uint32_t) * num_nodes)); + CUDA_CHECK(cudaMalloc((void**)(&global_train_degrees_), + sizeof(uint32_t) * num_nodes)); + global_degree_size_ = num_nodes; + global_train_degree_size_ = num_nodes; +} + +#if 0 // TODO(lhc) will be added +__global__ void CalculateFullNormFactorGPU() { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + if (thread_lane < 2) { + 
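      // (Clarifying comment, not in the original patch.) Lanes 0 and 1 of each
      // warp cooperatively load this source's CSR row bounds: lane 0 reads
      // edge_index[src] (first outgoing edge) and lane 1 reads
      // edge_index[src + 1] (one past the last edge) into shared memory; the
      // remaining lanes wait at the barrier below. The live aggregation
      // kernels in GraphConvolutionalLayer.cu and SAGELayer.cu use the same
      // pattern.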
edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + __syncthreads(); + + const int edge_begin = edge_begin_end[warp_lane][0]; + const int edge_end = edge_begin_end[warp_lane][1]; + for (int offest = edge_begin; offset < edge_end; offset++) { + + } + } +} + +void galois::graphs::GNNGraphGPUAllocations::CalculateFullNormFactor() { + +} +#endif + +void galois::graphs::GNNGraphGPUAllocations::SetGlobalDegrees( + const std::vector global_degrees) { + if (global_degree_size_ < global_degrees.size()) { + if (global_degree_size_ > 0) { + CUDA_CHECK(cudaFree(global_degrees_)); + } + CUDA_CHECK(cudaMalloc((void**)(&global_degrees_), + global_degrees.size() * sizeof(uint32_t))); + global_degree_size_ = global_degrees.size(); + } + + CUDA_CHECK(cudaMemcpy(global_degrees_, global_degrees.data(), + global_degrees.size() * sizeof(uint32_t), + cudaMemcpyHostToDevice)); +} + +void galois::graphs::GNNGraphGPUAllocations::SetGlobalTrainDegrees( + const std::vector global_train_degrees) { + if (global_train_degree_size_ < global_train_degrees.size()) { + if (global_train_degree_size_ > 0) { + CUDA_CHECK(cudaFree(global_train_degrees_)); + } + CUDA_CHECK(cudaMalloc((void**)(&global_train_degrees_), + global_train_degrees.size() * sizeof(uint32_t))); + global_train_degree_size_ = global_train_degrees.size(); + } + + CUDA_CHECK(cudaMemcpy(global_train_degrees_, global_train_degrees.data(), + global_train_degrees.size() * sizeof(uint32_t), cudaMemcpyHostToDevice)); } + +void galois::graphs::GNNGraphGPUAllocations::AllocAggregateBitset(size_t size) { + cuda_bitset_graph_aggregate.alloc(1); + cuda_bitset_graph_aggregate.cpu_wr_ptr()->alloc(size); +} + +void galois::graphs::GNNGraphGPUAllocations::CopyToCPU( + const PointerWithSize& input) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * input.size()); + cudaMemcpy(cpu_input, input.data(), sizeof(GNNFloat) * input.size(), + cudaMemcpyDeviceToHost); + for (size_t i = 0; i < input.size(); i++) + fprintf(stdout, "** %lu is %f\n", i, cpu_input[i]); +} diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 75e715e482..483ceb7850 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -97,10 +97,12 @@ void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, node_embeddings, base_gpu_object_.layer_weights(), output); + */ } else { #endif // CPU version is just a call into CBlas @@ -119,10 +121,12 @@ void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this gpu_object_.UpdateEmbeddingsDerivativeGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, gradients, base_gpu_object_.layer_weights(), output); + */ } else { #endif // difference is Trans for B matrix (data) to get z by y (weights is y by z diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 14d8bd8759..0c01bb788b 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -38,9 +38,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, 
GlorotBengioInit(&layer_weights_); } + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + if (!config_.disable_output) { - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", forward output matrix ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); @@ -75,7 +76,8 @@ galois::GNNLayer::GNNLayer(size_t layer_num, layer_weight_gradients_.size()); p_forward_output_matrix_ = PointerWithSize( base_gpu_object_.forward_output(), forward_output_matrix_.size()); - p_backward_output_matrix_ = *backward_output_matrix; + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix->size()); // TODO can clear the cpu side vectors/don't use .size() since optimally // they aren't initialized } else { @@ -127,9 +129,8 @@ void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1, for (size_t i = 0; i < vector2->size(); i++) { (*vector2)[i] = dist(rng); } + #ifdef GALOIS_ENABLE_GPU - // TODO - GALOIS_LOG_FATAL("TODO: copy both not 1"); if (device_personality == DevicePersonality::GPU_CUDA) { CopyLayerWeightsToGPU(); } @@ -200,15 +201,15 @@ void galois::GNNLayer::ReconstructDropoutMatrix( PointerWithSize* output_matrix) { galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); timer.start(); + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - // TODO(hochan) - GALOIS_LOG_FATAL("Implement me"); + base_gpu_object_.ReconstructDropoutMatrixGPU( + input_to_dropout, output_matrix, num_elements, scale); } else { #endif - // reuse the dropout mask from a previous dropout call - size_t num_elements = output_matrix->size(); - GNNFloat scale = 1. / (1. 
- config_.dropout_rate); galois::do_all( galois::iterate(static_cast(0), num_elements), [&](size_t i) { @@ -254,59 +255,69 @@ void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); timer.start(); - if (activation_memo_.size() == 0) { - activation_memo_.resize(forward_output_matrix_.size()); - } - activation_memo_.reset(); - // TODO only does relu at the moment; should check user specified activation // and act accordingly - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.input_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - forward_output_matrix_[i] = 0; - } - }, - galois::loopname("ReLU")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); + } else { +#endif + if (activation_memo_.size() == 0) { + activation_memo_.resize(forward_output_matrix_.size()); + } + activation_memo_.reset(); + + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } + }, + galois::loopname("ReLU")); +#ifdef GALOIS_ENABLE_GPU + } +#endif timer.stop(); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { - galois::StatTimer timer("BackwardActivation", "GNNLayer"); - timer.start(); - - // TODO only does relu at the moment; should check user specified activation - // and act accordingly - // keep gradient if the original output was greater than 0 - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.input_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - // it was <= 0 before; set back to 0 - if (!activation_memo_.test(i)) { - (*gradient)[i] = 0; - } - }, - galois::loopname("ReLU-Derivative")); - timer.stop(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationDerivativeGPU(gradient->data(), + gradient->size()); + } else { +#endif + // TODO only does relu at the moment; should check user specified activation + // and act accordingly + // keep gradient if the original output was greater than 0 + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.input_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } + }, + galois::loopname("ReLU-Derivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif } void galois::GNNLayer::WeightGradientSyncSum() { galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); t.start(); -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) collectives here rather than gluon sync if possible like the - // CPU code - // preferably without needing to do a gpu->cpu copy -#else + int weight_size = static_cast(p_layer_weight_gradients_.size()); + // TODO(loc) remove this limitation later; can just do a loop over the weight // matrix if (p_layer_weight_gradients_.size() > @@ -314,54 +325,73 @@ void galois::GNNLayer::WeightGradientSyncSum() { GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " "int at the moment"); } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_.data()), - 
static_cast(p_layer_weight_gradients_.size()), MPI_FLOAT, - MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + // TODO(lhc) make this clang option later + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + MPI_Allreduce(MPI_IN_PLACE, layer_weight_gradients_.data(), weight_size, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); + } else { +#endif + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } #endif t.stop(); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) mask away the **non** masters on gpu - GALOIS_LOG_FATAL("implement this"); -#else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; assert((row_index * layer_dimensions_.input_rows) <= input->size()); - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } #endif } void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient) { -#ifdef GALOIS_ENABLE_GPU - // TODO(hochan) mask away the **non** masters on gpu - GALOIS_LOG_FATAL("implement this"); -#else assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } #endif } diff --git a/libgnn/src/layers/GNNLayer.cu b/libgnn/src/layers/GNNLayer.cu index d6616be5fe..71b2b4512f 100644 --- a/libgnn/src/layers/GNNLayer.cu +++ b/libgnn/src/layers/GNNLayer.cu @@ -9,6 +9,7 @@ galois::GNNLayerGPUAllocations::~GNNLayerGPUAllocations() { CUDA_FREE(backward_output_matrix_); CUDA_FREE(layer_weights_); CUDA_FREE(layer_weight_gradients_); + CUDA_FREE(activation_memo_); } void 
galois::GNNLayerGPUAllocations::InitInOutMemory(size_t forward_size, @@ -47,17 +48,24 @@ void galois::GNNLayerGPUAllocations::CopyToWeights( cudaMemcpyHostToDevice)); } +void galois::GNNLayerGPUAllocations::CopyToWeightGradients( + const std::vector& cpu_gradients) { + CUDA_CHECK(cudaMemcpy(layer_weight_gradients_, cpu_gradients.data(), + cpu_gradients.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + void galois::GNNLayerGPUAllocations::CopyForwardOutputToCPU( - std::vector* cpu_forward_output) { - CUDA_CHECK(cudaMemcpy(cpu_forward_output->data(), forward_output_matrix_, - cpu_forward_output->size() * sizeof(GNNFloat), + GNNFloat* cpu_forward_output, size_t forward_output_size) { + CUDA_CHECK(cudaMemcpy(cpu_forward_output, forward_output_matrix_, + forward_output_size * sizeof(GNNFloat), cudaMemcpyDeviceToHost)); } void galois::GNNLayerGPUAllocations::CopyBackwardOutputToCPU( - std::vector* cpu_backward_output) { - CUDA_CHECK(cudaMemcpy(cpu_backward_output->data(), backward_output_matrix_, - cpu_backward_output->size() * sizeof(GNNFloat), + GNNFloat* cpu_backward_output, size_t backward_output_size) { + CUDA_CHECK(cudaMemcpy(cpu_backward_output, backward_output_matrix_, + backward_output_size * sizeof(GNNFloat), cudaMemcpyDeviceToHost)); } @@ -74,7 +82,7 @@ __global__ void DoDropoutImpl(size_t input_size, const galois::GNNFloat* input_to_dropout, galois::GNNFloat* output, const galois::GNNFloat* rng_vector, char* dropout_mask, float dropout_rate, galois::GNNFloat scale) { - CUDA_KERNEL_LOOP(i, input_size) { + CUDA_KERNEL_LOOP(i, 0, input_size) { // convert the rng floats into a mask dropout_mask[i] = rng_vector[i] > dropout_rate ? 1 : 0; // use mask to keep/drop weights @@ -86,7 +94,7 @@ __global__ void DoDropoutDerivativeImpl(size_t input_size, galois::GNNFloat* input, char* dropout_mask, galois::GNNFloat scale) { - CUDA_KERNEL_LOOP(i, input_size) { + CUDA_KERNEL_LOOP(i, 0, input_size) { input[i] = input[i] * (float)dropout_mask[i] * scale; } } @@ -138,3 +146,123 @@ __global__ void PrintVector(galois::GNNFloat* v, unsigned size) { void galois::GNNLayerGPUAllocations::PrintForwardOutput(size_t size) { PrintVector<<<1, 1>>>(forward_output_matrix_, size); } + +// TODO copy from gpu function as well just in case I need to check +void galois::GNNLayerGPUAllocations::PrintBackwardOutput(size_t size) { + PrintVector<<<1, 1>>>(backward_output_matrix_, size); +} + +namespace { +__global__ void InitVectorTo1Kernel(galois::GNNFloat* vector, + size_t num_vector_elements) { + CUDA_KERNEL_LOOP(idx, 0, num_vector_elements) { vector[idx] = 1.0; } +} + +__global__ void ReluActivationKernel(galois::GNNFloat* forward_output_matrix, + size_t num_forward_output_elements, + uint8_t* activation_memo) { + CUDA_KERNEL_LOOP(idx, 0, num_forward_output_elements) { + if (forward_output_matrix[idx] > galois::GNNFloat{0}) { + activation_memo[idx] = 1; + } else { + forward_output_matrix[idx] = 0; + } + } +} + +__global__ void ReluActivationDerivativeKernel( + galois::GNNFloat* gradients, galois::GNNFloat* forward_output_matrix, + const size_t num_gradients_elements, const uint8_t* activation_memo) { + CUDA_KERNEL_LOOP(idx, 0, num_gradients_elements) { + if (!activation_memo[idx]) { + gradients[idx] = 0; + } + } +} + +__global__ void +ReconstructDropoutMatrixKernel(const galois::GNNFloat* input_to_dropout, + galois::GNNFloat* output_matrix, + char* dropout_mask, const size_t num_elements, + const galois::GNNFloat scale) { + CUDA_KERNEL_LOOP(i, 0, num_elements) { + output_matrix[i] = input_to_dropout[i] * 
scale; + } + + CUDA_KERNEL_LOOP(i, 0, num_elements) { + output_matrix[i] *= static_cast(dropout_mask[i]); + } +} + +__global__ void MaskNonMastersKernel(galois::GNNFloat* input, + uint32_t start_node, uint32_t end_node, + uint32_t row_index) { + // TODO(lhc) implement nested parallelism if it is worth + CUDA_KERNEL_LOOP(non_master, start_node, end_node) { + for (uint32_t j = 0; j < row_index; j++) { + input[non_master * row_index + j] = 0; + } + } +} +} // namespace + +void galois::GNNLayerGPUAllocations::InitGPUVectorTo1(GNNFloat* vector, + size_t vector_size) { + InitVectorTo1Kernel<<>>( + vector, vector_size); + CUDA_TEST("Failed to initialize vector to 1."); +} + +void galois::GNNLayerGPUAllocations::ActivationGPU( + size_t num_forward_output_elements) { + if (activation_memo_ == nullptr) { + CUDA_CHECK(cudaMalloc((void**)(&activation_memo_), + num_forward_output_elements * sizeof(uint8_t))); + } + ReluActivationKernel<<>>( + forward_output_matrix_, num_forward_output_elements, activation_memo_); + CUDA_TEST("Activation GPU failed."); +} + +void galois::GNNLayerGPUAllocations::ActivationDerivativeGPU( + GNNFloat* gradients, size_t num_gradients_elements) { + ReluActivationDerivativeKernel<<>>( + gradients, forward_output_matrix_, num_gradients_elements, + activation_memo_); + CUDA_TEST("ActivationDerivative GPU failed."); +} + +void galois::GNNLayerGPUAllocations::ReconstructDropoutMatrixGPU( + const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix, size_t num_elements, + GNNFloat scale) { + ReconstructDropoutMatrixKernel<<>>( + input_to_dropout.data(), output_matrix->data(), dropout_mask_, + num_elements, scale); +} + +void galois::GNNLayerGPUAllocations::MaskNonMastersGPU( + PointerWithSize* input, size_t start_node, size_t end_node, + size_t row_index) { + MaskNonMastersKernel<<>>( + input->data(), start_node, end_node, row_index); +} + +void galois::GNNLayerGPUAllocations::CopyToCPU( + PointerWithSize* input) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * input->size()); + cudaMemcpy(cpu_input, input->data(), sizeof(GNNFloat) * input->size(), + cudaMemcpyDeviceToHost); + for (size_t i = 0; i < input->size(); i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} + +void galois::GNNLayerGPUAllocations::CopyToCPU(GNNFloat* input, size_t size) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * size); + cudaMemcpy(cpu_input, input, sizeof(GNNFloat) * size, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < size; i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 3bca821078..7c22627f2f 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -16,7 +16,15 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN input temp var 1 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } // only on in dropout case + if in temp is smaller than out temp @@ -26,7 +34,15 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", 
layer_number_, ", GCN input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } size_t num_output_elements = @@ -39,20 +55,27 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", GCN output temp var ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } layer_type_ = galois::GNNLayerType::kGraphConvolutional; #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { #endif p_in_temp_1_ = PointerWithSize(in_temp_1_); @@ -270,10 +293,12 @@ void galois::GraphConvolutionalLayer::AggregateAll( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + size_t last_master = *(graph_.end_owned()); gpu_object_.AggregateAllGPU( graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); - graph_.AggregateSync(aggregate_output, column_length, layer_number_); + aggregate_output, !config_.disable_normalization, + config_.disable_self_aggregate, last_master); + graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cu b/libgnn/src/layers/GraphConvolutionalLayer.cu index 882cb32391..4ef8b62eca 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cu +++ b/libgnn/src/layers/GraphConvolutionalLayer.cu @@ -1,5 +1,13 @@ +#include "gg.h" +#include "ggcuda.h" #include "galois/GNNMath.cuh" #include "galois/layers/GraphConvolutionalLayer.cuh" +#include "galois/cuda/DynamicBitset.h" +#include "sharedptr.h" + +// TODO(lhc) better way for this declaration is to declare it +// inside of the cuda context, but this messed linking to Gluon +extern Shared cuda_bitset_graph_aggregate; galois::GCNGPUAllocations::~GCNGPUAllocations() { GALOIS_LOG_VERBOSE("Freeing GCN layer allocations"); @@ -8,24 +16,26 @@ galois::GCNGPUAllocations::~GCNGPUAllocations() { CUDA_FREE(out_temp_); } -void galois::GCNGPUAllocations::Allocate(size_t input_elements, - size_t output_elements) { - CUDA_CHECK( - cudaMalloc((void**)(&in_temp_1_), input_elements * sizeof(GNNFloat))); - CUDA_CHECK( - cudaMalloc((void**)(&in_temp_2_), input_elements * sizeof(GNNFloat))); - CUDA_CHECK( - cudaMalloc((void**)(&out_temp_), output_elements * sizeof(GNNFloat))); +void 
galois::GCNGPUAllocations::AllocateInTemp1(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_1_), size * sizeof(GNNFloat))); +} + +void galois::GCNGPUAllocations::AllocateInTemp2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_2_), size * sizeof(GNNFloat))); +} + +void galois::GCNGPUAllocations::AllocateOutTemp(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&out_temp_), size * sizeof(GNNFloat))); } namespace { // GPU side aggregation call: no matrix multiply, just regular dst accesses -__global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, - const int* edge_index, - const int* edge_destination, - const galois::GNNFloat* norm_factors, - const galois::GNNFloat* node_embeddings, - galois::GNNFloat* aggregate_output) { +__global__ void AggregateAllKernel( + unsigned num_nodes, size_t column_length, const int* edge_index, + const int* edge_destination, const uint32_t* global_degrees, + const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output, + bool disable_self_aggregate, size_t last_master, + DynamicBitset* cuda_bitset_graph_aggregate) { const unsigned thread_id = BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index const unsigned thread_lane = @@ -43,10 +53,13 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, // each warp works on a source: threads in warp split the feature for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { galois::GNNFloat src_norm = 0.0; + galois::GNNFloat dst_norm = 0.0; galois::GNNFloat norm_to_use = 1.0; - if (norm_factors != nullptr) { - src_norm = norm_factors[src]; + if (global_degrees != nullptr) { + src_norm = (global_degrees[src]) + ? (1.0 / sqrt(static_cast(global_degrees[src] + 1))) + : 0.0; } if (thread_lane < 2) { @@ -60,21 +73,44 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, const int row_end = edge_begin_end[warp_lane][1]; unsigned base_src_index = src * column_length; + if (!disable_self_aggregate) { + cuda_bitset_graph_aggregate->set(src); + if (src < last_master) { + norm_to_use = src_norm * src_norm; + for (int i = 0; i < column_length; i += WARP_SIZE) { + if (thread_lane + i < column_length) { + aggregate_output[base_src_index + thread_lane + i] = + node_embeddings[base_src_index + thread_lane + i] * norm_to_use; + } + } + } + } + for (int offset = row_begin; offset < row_end; offset++) { int dst = edge_destination[offset]; unsigned base_dst_index = dst * column_length; + cuda_bitset_graph_aggregate->set(src); - if (norm_factors != nullptr) { + if (global_degrees != nullptr) { + dst_norm = + (global_degrees[dst]) + ? 
(1.0 / sqrt(static_cast(global_degrees[dst] + 1))) + : 0.0; // note that otherwise it's 1.0, so a no-op when it comes to multiply - norm_to_use = src_norm * norm_factors[dst]; + norm_to_use = src_norm * dst_norm; } // NOTE: this is where warp diverges // the feature aggregation is split among thread in a warp for (int i = 0; i < column_length; i += WARP_SIZE) { if ((thread_lane + i) < column_length) { - aggregate_output[base_src_index + thread_lane + i] += - node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + if (global_degrees != nullptr) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + } else { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } } } } @@ -86,19 +122,27 @@ __global__ void AggregateAllKernel(unsigned num_nodes, size_t column_length, void galois::GCNGPUAllocations::AggregateAllGPU( const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, bool use_norm) { + GNNFloat* aggregate_output, bool use_norm, bool disable_self_aggregate, + size_t last_master) { + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + CUDA_CHECK(cudaMemset(aggregate_output, 0, num_nodes * column_length * sizeof(GNNFloat))); if (use_norm) { AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), - gpu_graph.edge_destinations(), gpu_graph.norm_factors(), - node_embeddings, aggregate_output); + gpu_graph.edge_destinations(), gpu_graph.get_global_degrees(), + node_embeddings, aggregate_output, disable_self_aggregate, last_master, + cuda_bitset_graph_aggregate.gpu_wr_ptr()); } else { AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( num_nodes, column_length, gpu_graph.edge_index(), gpu_graph.edge_destinations(), nullptr, node_embeddings, - aggregate_output); + aggregate_output, disable_self_aggregate, last_master, + cuda_bitset_graph_aggregate.gpu_wr_ptr()); } CUDA_TEST("GPU aggregate all failure"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 22178ee2fa..9696a9b460 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -20,21 +20,44 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE second layer weights ", num_weight_elements, " (", FloatElementsToGB(num_weight_elements), " GB)"); + // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU layer_weights_2_.resize(num_weight_elements); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeight2(num_weight_elements); + } +#endif galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE second layer gradients ", num_weight_elements, " (", FloatElementsToGB(num_weight_elements), " GB)"); layer_weight_gradients_2_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeightGradient2(num_weight_elements); + } +#endif // reinit both weight matrices as one unit PairGlorotBengioInit(&layer_weights_, &layer_weights_2_); - - // update the pointers to them as well as realloc will require it - p_layer_weights_2_ = PointerWithSize(layer_weights_2_); - 
p_layer_weight_gradients_2_ = - PointerWithSize(layer_weight_gradients_2_); - // initialize the optimizer +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // copy weight2 to GPU + gpu_object_.CopyToWeights2(layer_weights_2_); + p_layer_weights_2_ = PointerWithSize( + gpu_object_.layer_weights_2(), num_weight_elements); + p_layer_weight_gradients_2_ = PointerWithSize( + gpu_object_.layer_weight_gradients_2(), num_weight_elements); + } else { +#endif + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); +#ifdef GALOIS_ENABLE_GPU + } +#endif std::vector weight_size = {num_weight_elements}; + // initialize the optimizer second_weight_optimizer_ = std::make_unique(weight_size, 1); } @@ -47,7 +70,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 1 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } // only on in dropout case + if in temp is smaller than out temp @@ -57,40 +88,52 @@ galois::SAGELayer::SAGELayer(size_t layer_num, galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE input temp var 2 ", num_input_elements, " (", FloatElementsToGB(num_input_elements), " GB)"); - in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, ", SAGE output temp var ", num_output_elements, " (", FloatElementsToGB(num_output_elements), " GB)"); - out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif } layer_type_ = galois::GNNLayerType::kSAGE; #ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) GPU SAGE if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.Allocate(num_input_elements, num_output_elements); // init pointers with size p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), in_temp_1_.size()); + PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), in_temp_2_.size()); + PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), out_temp_.size()); + PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { #endif p_in_temp_1_ = PointerWithSize(in_temp_1_); p_in_temp_2_ = PointerWithSize(in_temp_2_); p_out_temp_ = PointerWithSize(out_temp_); #ifdef GALOIS_ENABLE_GPU - 
// TODO concat parameters } #endif @@ -100,22 +143,30 @@ galois::SAGELayer::SAGELayer(size_t layer_num, void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); t.start(); + int weight_size = static_cast(p_layer_weight_gradients_2_.size()); +#ifdef GALOIS_ENABLE_GPU + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + MPI_Allreduce(MPI_IN_PLACE, + static_cast(layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); + } else { +#endif + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); #ifdef GALOIS_ENABLE_GPU - // TODO(hochan) collectives here rather than gluon sync if possible like the - // CPU code - GALOIS_LOG_FATAL("implement me"); -#else - // TODO(loc) remove this limitation later; can just do a loop over the weight - // matrix - if (p_layer_weight_gradients_2_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_2_.data()), - static_cast(p_layer_weight_gradients_2_.size()), MPI_FLOAT, - MPI_SUM, MPI_COMM_WORLD); #endif t.stop(); } @@ -226,19 +277,29 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // this is fine because gradient won't be used to get feature gradients MaskGradientNonMasters(input_gradient); } - // input data (prev layer input or temp1) or gradient need mask - // can mask gradient if layer == 0 - // otherwise must mask other - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - input_data.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateWeight2DerivativeGPU( + layer_dimensions_.input_columns, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + } else { +#endif + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + input_data.data(), input_gradient->data(), + p_layer_weight_gradients_2_.data()); +#ifdef GALOIS_ENABLE_GPU + } +#endif } WeightGradientSyncSum2(); - // AFW = O - // derivative of aggregation/update // TODO clean up logic here to reduce nesting if (config_.disable_aggregate_after_update || @@ -369,12 +430,12 @@ void galois::SAGELayer::AggregateAll( if (!IsSampledLayer()) { gpu_object_.AggregateAllGPU( graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization); + aggregate_output, !config_.disable_normalization, is_backward); } else { // 
TODO(hochan) GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); } - graph_.AggregateSync(aggregate_output, column_length, layer_number_); + graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, @@ -519,17 +580,21 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( galois::StatTimer timer("SelfForwardXForm", kRegionName); timer.start(); #ifdef GALOIS_ENABLE_GPU - // TODO self change + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, output); + } else { #endif - // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output, true); + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU -} + } #endif -timer.stop(); + timer.stop(); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, @@ -567,16 +632,21 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); #ifdef GALOIS_ENABLE_GPU - // TODO gpu self + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, output); + } else { #endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_2_.data(), output, true); + // difference is Trans for B matrix (data) to get z by y (weights is y by z + // normally); result is x by y + // true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU + } #endif timer.stop(); } diff --git a/libgnn/src/layers/SAGELayer.cu b/libgnn/src/layers/SAGELayer.cu new file mode 100644 index 0000000000..33cf32d9d3 --- /dev/null +++ b/libgnn/src/layers/SAGELayer.cu @@ -0,0 +1,209 @@ +#include "gg.h" +#include "ggcuda.h" +#include "galois/cuda/DynamicBitset.h" +#include "galois/GNNMath.cuh" +#include "galois/layers/SAGELayer.cuh" + +extern Shared cuda_bitset_graph_aggregate; + +galois::SAGEGPUAllocations::~SAGEGPUAllocations() { + GALOIS_LOG_VERBOSE("Freeing SAGE layer allocations"); + CUDA_FREE(in_temp_1_); + CUDA_FREE(in_temp_2_); + CUDA_FREE(out_temp_); + CUDA_FREE(layer_weights_2_); + CUDA_FREE(layer_weight_gradients_2_); +} + +void galois::SAGEGPUAllocations::AllocateWeight2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&layer_weights_2_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateWeightGradient2(const size_t size) { + 
CUDA_CHECK(cudaMalloc((void**)(&layer_weight_gradients_2_), + size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateInTemp1(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_1_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateInTemp2(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&in_temp_2_), size * sizeof(GNNFloat))); +} + +void galois::SAGEGPUAllocations::AllocateOutTemp(const size_t size) { + CUDA_CHECK(cudaMalloc((void**)(&out_temp_), size * sizeof(GNNFloat))); +} + +namespace { +// GPU side aggregation call: no matrix multiply, just regular dst accesses +__global__ void AggregateAllKernel( + unsigned num_nodes, size_t column_length, const int* edge_index, + const int* edge_destination, const uint32_t* degree_for_norm, + const galois::GNNFloat* node_embeddings, galois::GNNFloat* aggregate_output, + DynamicBitset* cuda_bitset_graph_aggregate, bool is_backward) { + const unsigned thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = thread_id / WARP_SIZE; // global warp index + const unsigned warp_lane = + threadIdx.x / WARP_SIZE; // warp index within the CTA + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // each warp gets a source: this var holds the first/last edge worked on by + // that warp + __shared__ int edge_begin_end[BLOCK_SIZE / WARP_SIZE][2]; + + // each warp works on a source: threads in warp split the feature + for (int src = warp_id; src < static_cast(num_nodes); src += num_warps) { + galois::GNNFloat norm_to_use = 0.0; + + if (degree_for_norm != nullptr && !is_backward) { + norm_to_use = (degree_for_norm[src]) ? (1.0 / degree_for_norm[src]) : 0.0; + } + + if (thread_lane < 2) { + edge_begin_end[warp_lane][thread_lane] = edge_index[src + thread_lane]; + } + // essentially what this is doing is making 2 of the threads set edge + // begin/end; all threads wait for sync + __syncthreads(); + + const int row_begin = edge_begin_end[warp_lane][0]; + const int row_end = edge_begin_end[warp_lane][1]; + unsigned base_src_index = src * column_length; + + for (int offset = row_begin; offset < row_end; offset++) { + cuda_bitset_graph_aggregate->set(src); + int dst = edge_destination[offset]; + unsigned base_dst_index = dst * column_length; + + if (degree_for_norm != nullptr && is_backward) { + norm_to_use = + (degree_for_norm[dst]) ? 
(1.0 / degree_for_norm[dst]) : 0.0; + } + + // NOTE: this is where warp diverges + // the feature aggregation is split among thread in a warp + for (int i = 0; i < column_length; i += WARP_SIZE) { + if ((thread_lane + i) < column_length) { + if (degree_for_norm != nullptr) { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i] * norm_to_use; + } else { + aggregate_output[base_src_index + thread_lane + i] += + node_embeddings[base_dst_index + thread_lane + i]; + } + } + } + } + } +} + +} // namespace + +// TODO(lhc) Will need to iterate over in-edges if is_backward is on +void galois::SAGEGPUAllocations::AggregateAllGPU( + const graphs::GNNGraphGPUAllocations& gpu_graph, size_t num_nodes, + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, bool use_norm, bool is_backward) { + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + + CUDA_CHECK(cudaMemset(aggregate_output, 0, + num_nodes * column_length * sizeof(GNNFloat))); + if (use_norm) { + uint32_t* degree_for_norm{nullptr}; + // TODO(lhc) will be added for sampling + // if (use_subgraph_) { + //} else { + degree_for_norm = gpu_graph.get_global_degrees(); + //} + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), degree_for_norm, node_embeddings, + aggregate_output, cuda_bitset_graph_aggregate.gpu_wr_ptr(), + is_backward); + } else { + AggregateAllKernel<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + num_nodes, column_length, gpu_graph.edge_index(), + gpu_graph.edge_destinations(), nullptr, node_embeddings, + aggregate_output, cuda_bitset_graph_aggregate.gpu_wr_ptr(), + is_backward); + } + CUDA_TEST("GPU aggregate all failure"); +} + +void galois::SAGEGPUAllocations::UpdateEmbeddingsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, const GNNFloat* layer_weights, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, num_nodes, input_columns, + output_columns, node_embeddings, layer_weights, output); +} + +void galois::SAGEGPUAllocations::UpdateEmbeddingsDerivativeGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* gradients, const GNNFloat* layer_weights, + GNNFloat* output) { + // note output clumns/input columns are flipped due to transpose of the + // layer weights + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, num_nodes, output_columns, + input_columns, gradients, layer_weights, output); +} + +void galois::SAGEGPUAllocations::GetWeightGradientsGPU( + size_t num_nodes, size_t input_columns, size_t output_columns, + const GNNFloat* prev_input, const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, num_nodes, + output_columns, prev_input, gradients, output); +} + +void galois::SAGEGPUAllocations::SelfFeatureUpdateEmbeddingsGPU( + size_t input_rows, size_t input_columns, size_t output_columns, + const GNNFloat* node_embeddings, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_N, input_rows, input_columns, + output_columns, node_embeddings, layer_weights_2_, output, + true); +} + +void galois::SAGEGPUAllocations::SelfFeatureUpdateEmbeddingsDerivativeGPU( + size_t input_rows, size_t output_columns, size_t input_columns, + const GNNFloat* gradients, GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_N, CUBLAS_OP_T, input_rows, 
output_columns, + input_columns, gradients, layer_weights_2_, output, true); +} + +void galois::SAGEGPUAllocations::UpdateWeight2DerivativeGPU( + size_t input_columns, size_t input_rows, size_t output_columns, + const GNNFloat* prev_layer_inputs, const GNNFloat* input_gradients, + GNNFloat* output) { + CBlasSGEMMGPU(CUBLAS_OP_T, CUBLAS_OP_N, input_columns, input_rows, + output_columns, prev_layer_inputs, input_gradients, output); +} + +void galois::SAGEGPUAllocations::CopyToWeights2( + const std::vector& cpu_layer_weights) { + CUDA_CHECK(cudaMemcpy(layer_weights_2_, cpu_layer_weights.data(), + cpu_layer_weights.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +void galois::SAGEGPUAllocations::CopyToWeight2Gradients( + const std::vector& cpu_gradients) { + CUDA_CHECK(cudaMemcpy(layer_weight_gradients_2_, cpu_gradients.data(), + cpu_gradients.size() * sizeof(GNNFloat), + cudaMemcpyHostToDevice)); +} + +void galois::SAGEGPUAllocations::CopyWeight2GradientsToCPU( + std::vector* cpu_gradients) { + CUDA_CHECK(cudaMemcpy(cpu_gradients->data(), layer_weight_gradients_2_, + cpu_gradients->size() * sizeof(GNNFloat), + cudaMemcpyDeviceToHost)); +} diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index beccf42289..312bdab9ac 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -72,8 +72,8 @@ galois::SoftmaxLayer::ForwardPhase( if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.ForwardPhaseGPU( layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_forward_output_matrix_.data()); - return p_forward_output_matrix_; + input_embeddings.data(), p_backward_output_matrix_.data()); + return p_backward_output_matrix_; } #endif return ForwardPhaseCPU(input_embeddings); diff --git a/libgnn/src/layers/SoftmaxLayer.cu b/libgnn/src/layers/SoftmaxLayer.cu index f24a6f1e77..e29c1bb201 100644 --- a/libgnn/src/layers/SoftmaxLayer.cu +++ b/libgnn/src/layers/SoftmaxLayer.cu @@ -3,33 +3,77 @@ #include "galois/Logging.h" #include "galois/layers/SoftmaxLayer.cuh" +void galois::SoftmaxLayerGPU::CopyToCPU(GNNFloat* input, size_t size) { + GNNFloat* cpu_input = (GNNFloat*)malloc(sizeof(GNNFloat) * size); + cudaMemcpy(cpu_input, input, sizeof(GNNFloat) * size, cudaMemcpyDeviceToHost); + for (size_t i = 0; i < size; i++) + fprintf(stderr, "%lu = %f\n", i, cpu_input[i]); +} + void galois::SoftmaxLayerGPU::ForwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* input_embeddings, GNNFloat* output) { char* mask_to_use = ChooseMask(phase); - CUDA_CHECK( - cudaMemset(output, 0, num_nodes * feature_length * sizeof(GNNFloat))); SoftmaxCrossEntropyForward<<>>( mask_to_use, num_nodes, feature_length, input_embeddings, output); CUDA_TEST("Softmax cross entropy forward failed"); } +__global__ void SoftmaxBackward(char* mask, size_t num_nodes, + size_t feature_length, + const galois::GNNFloat* predictions, + const galois::GNNLabel* ground_truth, + galois::GNNFloat* output_gradient) { + const unsigned global_thread_id = + BLOCK_SIZE * blockIdx.x + threadIdx.x; // global thread index + const unsigned warp_thread_lane = + threadIdx.x & (WARP_SIZE - 1); // thread index within the warp + const unsigned warp_id = global_thread_id / WARP_SIZE; // global warp index + const unsigned num_warps = + (BLOCK_SIZE / WARP_SIZE) * gridDim.x; // total number of active warps + + // a warp works on a single node at once + for (unsigned wid = warp_id; wid < num_nodes; wid += 
num_warps) { + // operate only if masked + if (mask[wid] == 1) { + unsigned base_index = wid * feature_length; + // TODO can refactor below to device functions + // cross entropy derivative + // each thread of warp takes different feature + for (unsigned feat_index = warp_thread_lane; feat_index < feature_length; + feat_index += WARP_SIZE) { + if (feat_index < feature_length) { + if (feat_index == (unsigned)ground_truth[wid]) { + output_gradient[base_index + feat_index] = + predictions[base_index + feat_index] - 1; + } else { + output_gradient[base_index + feat_index] = + predictions[base_index + feat_index]; + } + } + } + __syncthreads(); + } + } +} + void galois::SoftmaxLayerGPU::BackwardPhaseGPU(galois::GNNPhase phase, size_t num_nodes, size_t feature_length, const GNNFloat* predictions, GNNFloat* output_gradient) { assert(feature_length <= MAX_NUM_CLASSES); + // num_nodes should be greater than 0 to avoid negative number of thread + if (num_nodes == 0) { + return; + } + char* mask_to_use = ChooseMask(phase); - CUDA_CHECK(cudaMemset(output_gradient, 0, - num_nodes * feature_length * sizeof(GNNFloat))); - // TODO check the launch parameters; this is taken directly from the original - // code - SoftmaxCrossEntropyBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, - BLOCK_SIZE>>>(mask_to_use, num_nodes, - feature_length, predictions, - local_labels_, output_gradient); + SoftmaxBackward<<<(num_nodes - 1) / WARPS_PER_BLOCK + 1, BLOCK_SIZE>>>( + mask_to_use, num_nodes, feature_length, predictions, local_labels_, + output_gradient); + CUDA_TEST("Softmax cross entropy backward failed"); } diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 91835cfc07..9834b302e7 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -86,6 +86,10 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(sample-bit-test galois_gnn) add_test(NAME sample-bit-test COMMAND sample-bit-test) else() + add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) + target_link_libraries(gpu-sage-layer-test galois_gnn) + add_test(NAME gpu-sage-layer-test COMMAND gpu-sage-layer-test) + add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) target_link_libraries(gpu-convlayer-test galois_gnn) add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) @@ -104,6 +108,27 @@ else() add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) target_link_libraries(gpu-aggregate-sync-test galois_gnn) + + set(gpu_hosts) + set(gpu_host 3) #TODO(lhc) more than 4 gpus, test failed + # seems like it happened due to graph size 0. + # so let me postpone this + while (${gpu_host} GREATER 1) + list(APPEND gpu_hosts ${gpu_host}) + math(EXPR gpu_host "${gpu_host} - 1") + endwhile() + list(APPEND gpu_hosts "1") + + add_executable(gpu-back-conv-test gpu-back-conv-test.cpp) + target_link_libraries(gpu-back-conv-test galois_gnn) + foreach(gpu_host_count ${gpu_hosts}) + set(PSET "-pset=") + foreach(iter RANGE 1 ${gpu_host_count}) + set(PSET "${PSET}g") + endforeach() + add_test(NAME run-gpu-back-conv-${gpu_host_count} COMMAND mpiexec --bind-to none -n ${gpu_host_count} ./gpu-back-conv-test ${PSET} -numNodes=1) + set_tests_properties(run-gpu-back-conv-${gpu_host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") + endforeach() endif() # TODO multi host tests? diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index ed99982a78..58da1d3b68 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -1,4 +1,4 @@ -//! @file adam-test.cpp +//! 
@file gpu-adam-test.cpp //! Tests the adam optimizer #include "galois/DistGalois.h" #include "galois/GNNOptimizers.h" @@ -32,8 +32,12 @@ int main() { dimension_0.input_rows = 7; dimension_0.input_columns = test_graph.GetNumLabelClasses(); dimension_0.output_columns = test_graph.GetNumLabelClasses(); - auto alloc_layer = - std::make_unique(3, test_graph, dimension_0); + std::vector output_matrix; + output_matrix.resize(dimension_0.input_rows * dimension_0.input_columns); + + galois::PointerWithSize output_layer(output_matrix); + auto alloc_layer = std::make_unique( + 3, test_graph, &output_layer, dimension_0); std::vector weights1 = {1, 1}; std::vector weights2 = {10}; diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp index a3f645c5ee..3a0ee7f3d4 100644 --- a/libgnn/test/gpu-aggregate-sync-test.cpp +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -29,17 +29,22 @@ int main() { l_config.disable_aggregate_after_update = true; unsigned num_layers = 2; - test_graph->ResizeLayerVector(num_layers); + test_graph->ResizeGPULayerVector(num_layers); test_graph->InitLayerVectorMetaObjects( 0, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); test_graph->InitLayerVectorMetaObjects( 1, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph->GetLocalFeatures()); @@ -110,7 +115,7 @@ int main() { // layer 0 means that an empty weight matrix is returned since there is no // point passing back anything layer_0->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - const std::vector& layer_0_backward_output = + const galois::PointerWithSize& layer_0_backward_output = layer_0->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// @@ -126,8 +131,8 @@ int main() { // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// std::unique_ptr layer_1 = - std::make_unique(1, *(test_graph.get()), - dimension_0, l_config); + std::make_unique( + 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph->GetLocalFeatures()); const std::vector& layer_1_forward_output = @@ -176,7 +181,7 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones_v.assign(test_graph->size() * 2, 1); layer_1->BackwardPhase(test_graph->GetLocalFeatures(), &dummy_ones); - const std::vector& layer_1_backward_output = + const galois::PointerWithSize& layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); for (size_t row = 0; row < test_graph->size(); row++) { diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp new file mode 100644 index 0000000000..c089ffb698 --- /dev/null +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -0,0 +1,167 @@ +//! 
@file gpu-back-conv-test.cpp +#include "galois/Logging.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/CUDAUtilHostDecls.h" + +extern int gpudevice; + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + const unsigned my_host_id = galois::runtime::getHostID(); + gpudevice = my_host_id; + SetCUDADeviceId(gpudevice); + device_personality = DevicePersonality::GPU_CUDA; + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = test_graph.size(); + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + galois::GNNLayerConfig dcon; + dcon.DebugConfig(); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(test_graph.size() * 3); + galois::PointerWithSize p_back(back_matrix); + + // dummy 1 matrix + std::vector dummy_ones_v(test_graph.size() * 2, 1); + + unsigned num_layers = 2; + test_graph.ResizeGPULayerVector(num_layers); + // require 0th substrate initialization + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + + std::vector output_matrix; + output_matrix.resize(dimension_0.input_rows * dimension_0.input_columns); + galois::PointerWithSize output_layer(output_matrix); + + // create layer 1 for testing backward prop actually giving weights back + std::unique_ptr layer_1 = + std::make_unique(1, test_graph, &p_back, + dimension_0, dcon); + galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->InitAllWeightsTo1(); + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 3; + break; + case 1: + ground_truth = 6; + break; + case 2: + ground_truth = 12; + break; + case 3: + ground_truth = 18; + break; + case 4: + ground_truth = 24; + break; + case 5: + ground_truth = 30; + break; + case 6: + ground_truth = 15; + break; + default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 2; c++) { + GALOIS_LOG_VASSERT(layer_1_forward_output[row * 2 + c] == ground_truth, + "{} not {}", ground_truth, + layer_1_forward_output[row * 2 + c]); + } + } + + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + for (size_t row = 0; row < test_graph.size(); row++) { + // row -> GID + size_t global_row = test_graph.GetGID(row); + galois::GNNFloat ground_truth = 0.0; + + switch (global_row) { + case 0: + ground_truth = 2; + break; + case 1: + ground_truth = 4; + break; + case 2: + ground_truth = 4; + break; + case 3: + ground_truth = 4; + break; + case 4: + ground_truth = 4; + break; + case 5: + ground_truth = 4; + break; + case 6: + ground_truth = 2; + break; + 
default: + GALOIS_LOG_FATAL("bad global row for test graph"); + break; + } + // size 2 columns + for (size_t c = 0; c < 3; c++) { + GALOIS_LOG_ASSERT(layer_1_backward_output[row * 3 + c] == ground_truth); + } + } + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[0] == 36, "36 not {}", + layer_1_weight_gradients[0]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[1] == 36, "36 not {}", + layer_1_weight_gradients[1]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[2] == 36, "36 not {}", + layer_1_weight_gradients[2]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[3] == 36, "36 not {}", + layer_1_weight_gradients[3]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[4] == 36, "36 not {}", + layer_1_weight_gradients[4]); + GALOIS_LOG_VASSERT(layer_1_weight_gradients[5] == 36, "36 not {}", + layer_1_weight_gradients[5]); + + layer_1.reset(); + + return 0; +} diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 947a0b8703..553d96e1a2 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -33,18 +33,27 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); - unsigned num_layers = 2; - test_graph.ResizeLayerVector(num_layers); + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + + unsigned num_layers = 3; + test_graph.ResizeGPULayerVector(num_layers); test_graph.InitLayerVectorMetaObjects( 0, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); test_graph.InitLayerVectorMetaObjects( 1, galois::runtime::getSystemNetworkInterface().Num, dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 2, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + // create the layer, no norm factor std::unique_ptr layer_0 = - std::make_unique(0, test_graph, + std::make_unique(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -86,36 +95,6 @@ int main() { // point passing back anything // galois::PointerWithSize layer_0_backward_output = layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const std::vector& layer_0_backward_output = - layer_0->CopyBackwardOutputFromGPU(); - - ////////////////////////////////////////////////////////////////////////////// - // sanity check layer 0 backward output; all 0 because layer 0 - ////////////////////////////////////////////////////////////////////////////// - // since norm factors aren't invovled it is possible to do full assertions - // 7 x 3 - GALOIS_LOG_ASSERT(layer_0_backward_output.size() == 21); - GALOIS_LOG_ASSERT((layer_0_backward_output)[0] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[1] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[2] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[3] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[4] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[5] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[6] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[7] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[8] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[9] == 0); - 
GALOIS_LOG_ASSERT((layer_0_backward_output)[10] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[11] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[12] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[13] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[14] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[15] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[16] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[17] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[18] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[19] == 0); - GALOIS_LOG_ASSERT((layer_0_backward_output)[20] == 0); const std::vector& layer_0_weight_gradients = layer_0->CopyWeightGradientsFromGPU(); @@ -134,7 +113,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr layer_1 = - std::make_unique(1, test_graph, + std::make_unique(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -160,8 +139,9 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const std::vector& layer_1_backward_output = - layer_1->CopyBackwardOutputFromGPU(); + const galois::PointerWithSize& + layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); + ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected ////////////////////////////////////////////////////////////////////////////// @@ -204,62 +184,66 @@ int main() { // TODO get dropout and activation working - // galois::GNNLayerConfig config; - // config.do_dropout = true; - // config.do_activation = true; - // config.do_normalization = true; - // config.allow_aggregate_after_update = false; - - //// finally, just make sure dropout and activation run without crashes - //// (verification requires floating point accuracy or setting a seed which I - //// don't have time for at the moment - //// TODO in future maybe add better unit test for this - // std::unique_ptr layer_2 = - // std::make_unique(1, test_graph, - // dimension_0, config); - // galois::PointerWithSize l2_fo = - // layer_2->ForwardPhase(test_graph.GetLocalFeatures()); - // GALOIS_LOG_ASSERT(l2_fo.size() == 14); - // GALOIS_LOG_VERBOSE("{}", l2_fo[0]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[1]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[2]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[3]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[4]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[5]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[6]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[7]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[8]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[9]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[10]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[11]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[12]); - // GALOIS_LOG_VERBOSE("{}", l2_fo[13]); - - // galois::PointerWithSize l2_bo = - // layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - - // GALOIS_LOG_ASSERT(l2_bo.size() == 21); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); - // GALOIS_LOG_VERBOSE("{}", 
(l2_bo)[9]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); - // GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + config.disable_aggregate_after_update = true; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + std::unique_ptr layer_2 = + std::make_unique(2, test_graph, &p_back, + dimension_0, config); + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + // pointer is to GPU memory: copy it over to a CPU source for verification + const std::vector& l2_fo = + layer_2->CopyForwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& l2_bo = + layer_2->CopyBackwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); return 0; } diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 3ac2c2b2ed..8b71b81e3f 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -1,4 +1,4 @@ -//! @file epoch-test.cpp +//! @file gpu-epoch-test.cpp //! Run 50 epochs of training to see if results improve. 
#include "galois/Logging.h" @@ -23,9 +23,7 @@ int main() { std::vector layer_output_sizes = { 16, test_graph->GetNumLabelClasses(), test_graph->GetNumLabelClasses()}; galois::GNNLayerConfig layer_config; - layer_config.do_dropout = true; - layer_config.do_activation = false; - layer_config.do_normalization = true; + layer_config.DebugConfig(); // XXX Activation kills accuracy compared to old code, esp. for cora galois::GraphNeuralNetworkConfig gnn_config( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, @@ -49,22 +47,18 @@ int main() { main_timer.start(); for (size_t epoch = 0; epoch < 100; epoch++) { galois::PointerWithSize predictions = gnn->DoInference(); - if (cpu_pred.size() != predictions.size()) { - cpu_pred.resize(predictions.size()); - } gnn->GradientPropagation(); // copy to cpu // TODO currently adam has this helper function; it should be handled // by other class though - adam->CopyToVector(cpu_pred, predictions); galois::gPrint("Epoch ", epoch, ": Accuracy is ", - gnn->GetGlobalAccuracy(cpu_pred), "\n"); + gnn->GetGlobalAccuracy(predictions), "\n"); } // check test accuracy gnn->SetLayerPhases(galois::GNNPhase::kTest); galois::PointerWithSize predictions = gnn->DoInference(); - adam->CopyToVector(cpu_pred, predictions); - galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(cpu_pred), "\n"); + galois::gPrint("Test accuracy is ", gnn->GetGlobalAccuracy(predictions), + "\n"); main_timer.stop(); } diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp new file mode 100644 index 0000000000..7cec3b9a2b --- /dev/null +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -0,0 +1,270 @@ +//! @file gpu-sage-layer-test.cpp +//! Sage layer test + +#include "galois/Logging.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = galois::setActiveThreads( + 56 / galois::runtime::getSystemNetworkInterface().Num); + + GALOIS_LOG_VERBOSE("[{}] Using {} threads", + galois::runtime::getSystemNetworkInterface().ID, + num_threads); + device_personality = DevicePersonality::GPU_CUDA; + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 7; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + // load test graph + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + unsigned num_layers = 3; + test_graph.ResizeGPULayerVector(num_layers); + test_graph.InitLayerVectorMetaObjects( + 0, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 1, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + test_graph.InitLayerVectorMetaObjects( + 2, galois::runtime::getSystemNetworkInterface().Num, + dimension_0.input_columns, dimension_0.output_columns); + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.DebugConfig(); + + galois::PointerWithSize p_null(nullptr, 0); + std::vector back_matrix(21); + galois::PointerWithSize p_back(back_matrix); + + galois::SAGELayerConfig scon; + scon.disable_concat = false; + + std::unique_ptr layer_0 = + std::make_unique(0, test_graph, &p_null, dimension_0, + dcon, scon); + layer_0->InitAllWeightsTo1(); + // sage weights for self + layer_0->InitSelfWeightsTo1(); + + // make sure it runs in a sane manner + layer_0->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_0_forward_output = + 
layer_0->CopyForwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // sanity check layer 0 output + ////////////////////////////////////////////////////////////////////////////// + // since norm factors aren't invovled it is possible to do full assertions + // 7 x 2 + + GALOIS_LOG_ASSERT(layer_0_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_0_forward_output[0] == 3, "{} should be 3", + layer_0_forward_output[0]); + GALOIS_LOG_ASSERT(layer_0_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_0_forward_output[2] == 9, "{} should be 6", + layer_0_forward_output[2]); + GALOIS_LOG_ASSERT(layer_0_forward_output[3] == 9); + GALOIS_LOG_ASSERT(layer_0_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_0_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_0_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_0_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_0_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_0_forward_output[13] == 33); + ////////////////////////////////////////////////////////////////////////////// + + // dummy 1 matrix + std::vector dummy_ones_v(14, 1); + galois::PointerWithSize dummy_ones = + layer_0->AllocateGPU(dummy_ones_v); + + // backward pass checking + // layer 0 means that an empty weight matrix is returned since there is no + // point passing back anything + layer_0->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + + const std::vector& layer_0_weight_gradients = + layer_0->CopyWeightGradientsFromGPU(); + const std::vector& layer_0_weight_gradients_2 = + layer_0->CopyWeight2GradientsFromGPU(); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_0_weight_gradients[4] == 36); + + // make sure they are sane + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_0_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_0_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_0_weight_gradients_2[4] == 21); + + layer_0.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + // create layer 1 for testing backward prop actually giving weights back + auto layer_1 = std::make_unique(1, test_graph, &p_back, + dimension_0, dcon, scon); + layer_1->InitAllWeightsTo1(); + layer_1->InitSelfWeightsTo1(); + + layer_1->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& layer_1_forward_output = + layer_1->CopyForwardOutputFromGPU(); + + // same check as before for sanity purposes + GALOIS_LOG_ASSERT(layer_1_forward_output.size() == 14); + GALOIS_LOG_VASSERT(layer_1_forward_output[0] == 3, "{} should be 3", + layer_1_forward_output[0]); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_VASSERT(layer_1_forward_output[2] == 9, "{} should be 6", + layer_1_forward_output[2]); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 9); + 
GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 18); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 27); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 36); + GALOIS_LOG_ASSERT(layer_1_forward_output[10] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[11] == 45); + GALOIS_LOG_ASSERT(layer_1_forward_output[12] == 33); + GALOIS_LOG_ASSERT(layer_1_forward_output[13] == 33); + + // since layer isn't 0 anymore, backward phase will actually return something + dummy_ones_v.assign(14, 1); + dummy_ones = layer_1->AllocateGPU(dummy_ones_v); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); + + ////////////////////////////////////////////////////////////////////////////// + // check that multiplies go as expected + ////////////////////////////////////////////////////////////////////////////// + GALOIS_LOG_ASSERT(layer_1_backward_output.size() == 21); + GALOIS_LOG_ASSERT((layer_1_backward_output)[0] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[1] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[2] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[3] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[4] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[5] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[6] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[7] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[8] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[9] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[10] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[11] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[12] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[13] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[14] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[15] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[16] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[17] == 6); + GALOIS_LOG_ASSERT((layer_1_backward_output)[18] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[19] == 4); + GALOIS_LOG_ASSERT((layer_1_backward_output)[20] == 4); + + const std::vector& layer_1_weight_gradients = + layer_1->CopyWeightGradientsFromGPU(); + // make sure they are sane + GALOIS_LOG_ASSERT(layer_1_weight_gradients.size() == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 36); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 36); + + const std::vector& layer_1_weight_gradients_2 = + layer_1->CopyWeight2GradientsFromGPU(); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2.size() == 6); + GALOIS_LOG_VASSERT(layer_1_weight_gradients_2[0] == 21, + "{} is wrong should be {}", layer_1_weight_gradients_2[0], + 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[1] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[2] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[3] == 21); + GALOIS_LOG_ASSERT(layer_1_weight_gradients_2[4] == 21); + + layer_1.reset(); + + //////////////////////////////////////////////////////////////////////////////// + + galois::GNNLayerConfig config; + config.disable_dropout = false; + config.disable_activation = false; + config.disable_normalization = false; + 
config.disable_aggregate_after_update = false; + + // finally, just make sure dropout and activation run without crashes + // (verification requires floating point accuracy or setting a seed which I + // don't have time for at the moment + // TODO in future maybe add better unit test for this + auto layer_2 = std::make_unique(2, test_graph, &p_back, + dimension_0, config, scon); + layer_2->ForwardPhase(test_graph.GetLocalFeatures()); + const std::vector& l2_fo = + layer_2->CopyForwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_fo.size() == 14); + GALOIS_LOG_VERBOSE("{}", l2_fo[0]); + GALOIS_LOG_VERBOSE("{}", l2_fo[1]); + GALOIS_LOG_VERBOSE("{}", l2_fo[2]); + GALOIS_LOG_VERBOSE("{}", l2_fo[3]); + GALOIS_LOG_VERBOSE("{}", l2_fo[4]); + GALOIS_LOG_VERBOSE("{}", l2_fo[5]); + GALOIS_LOG_VERBOSE("{}", l2_fo[6]); + GALOIS_LOG_VERBOSE("{}", l2_fo[7]); + GALOIS_LOG_VERBOSE("{}", l2_fo[8]); + GALOIS_LOG_VERBOSE("{}", l2_fo[9]); + GALOIS_LOG_VERBOSE("{}", l2_fo[10]); + GALOIS_LOG_VERBOSE("{}", l2_fo[11]); + GALOIS_LOG_VERBOSE("{}", l2_fo[12]); + GALOIS_LOG_VERBOSE("{}", l2_fo[13]); + + layer_2->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); + const galois::PointerWithSize& l2_bo = + layer_2->CopyBackwardOutputFromGPU(); + + GALOIS_LOG_ASSERT(l2_bo.size() == 21); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[0]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[1]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[2]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[3]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[4]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[5]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[6]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[7]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[8]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[9]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[10]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[11]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[12]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[13]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[14]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[15]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[16]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[17]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[18]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[19]); + GALOIS_LOG_VERBOSE("{}", (l2_bo)[20]); + + return 0; +} diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 5d52e80e35..64b7c9e6f0 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -1,4 +1,4 @@ -//! @file convlayer-test.cpp +//! @file gpu-softmaxlayer-test.cpp //! 
Softmax layer test with a test graph #include "galois/Logging.h" @@ -25,9 +25,12 @@ int main() { GALOIS_LOG_VERBOSE("Num output classes is {}", dimension_0.input_columns); + std::vector back_matrix(49); + galois::PointerWithSize p_back(back_matrix); + // train mode - auto output_layer = - std::make_unique(3, test_graph, dimension_0); + auto output_layer = std::make_unique( + 3, test_graph, &p_back, dimension_0); // input to softmax std::vector softmax_input(49, 0.0); // create input with perfect accuracy @@ -42,9 +45,11 @@ int main() { output_layer->AllocateGPU(softmax_input); output_layer->ForwardPhase(p_softmax_input); + output_layer->PrintForwardOutputGPU(); - const std::vector& prediction_distribution = - output_layer->CopyForwardOutputFromGPU(); + // Softmax reuses output vector for forward phase + const galois::PointerWithSize prediction_distribution = + output_layer->CopyBackwardOutputFromGPU(); // assert that predictions are as expected for (size_t i = 0; i < 5; i++) { @@ -63,17 +68,12 @@ int main() { } output_layer->BackwardPhase(p_softmax_input, nullptr); - const std::vector& backward_output = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 1\n========\n"); - for (galois::GNNFloat a : backward_output) { - printf("%f\n", a); - } // validation mode output_layer->SetLayerPhase(galois::GNNPhase::kValidate); output_layer->ForwardPhase(p_softmax_input); - std::vector pd2 = output_layer->CopyForwardOutputFromGPU(); + galois::PointerWithSize pd2 = + output_layer->CopyBackwardOutputFromGPU(); // validate vertex is index 5 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd2[5 * 7])) == 5); @@ -97,17 +97,12 @@ int main() { } output_layer->BackwardPhase(p_softmax_input, nullptr); - const std::vector& backward_output2 = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 2\n========\n"); - for (galois::GNNFloat a : backward_output2) { - printf("%f\n", a); - } // test mode output_layer->SetLayerPhase(galois::GNNPhase::kTest); output_layer->ForwardPhase(p_softmax_input); - std::vector pd3 = output_layer->CopyForwardOutputFromGPU(); + galois::PointerWithSize pd3 = + output_layer->CopyBackwardOutputFromGPU(); // validate vertex is index 6 GALOIS_LOG_ASSERT(galois::MaxIndex(7, &(pd3[6 * 7])) == 6); // all but last are empty distributions @@ -122,12 +117,6 @@ int main() { } output_layer->BackwardPhase(softmax_input, nullptr); - const std::vector& backward_output3 = - output_layer->CopyBackwardOutputFromGPU(); - printf("Output 3\n========\n"); - for (galois::GNNFloat a : backward_output3) { - printf("%f\n", a); - } // TODO in future maybe: add better test for backward phase besides just // running it diff --git a/scripts/galois_gnn_log_parser.R b/scripts/galois_gnn_log_parser.R new file mode 100644 index 0000000000..4e60af0c5d --- /dev/null +++ b/scripts/galois_gnn_log_parser.R @@ -0,0 +1,221 @@ +#!/usr/bin/env Rscript + +####################################################### +# Author: Gurbinder Gill +# Email: gill@cs.utexas.edu +# Date: Oct 8, 2017 +###################################################### +library("optparse") +library('data.table') + +convertZeroTosStr <- function(a) { + if (identical(numeric(0), as.numeric(a)) == 0) { + a <- as.numeric(a) / 1000 + } else { + a <- "0" + } + return (a) +} + +####START: @function to parse commadline################## +# Parses the command line to get the arguments used +parseCmdLine <- function (logData, isSharedMemGaloisLog, graphPassedAsInput) { + ## Select commandline & param rows + cmdLineRow <- subset(logData, CATEGORY == 
"CommandLine" & STAT_TYPE == "PARAM") + + ## Distributed has extra column: HostID + if(isTRUE(isSharedMemGaloisLog)){ + cmdLine <- substring(cmdLineRow[,5], 0) + } + else { + cmdLine <- substring(cmdLineRow[,6], 0) + } + + cmdLineSplit = strsplit(cmdLine, "\\s+")[[1]] + deviceKind = "CPU" + if(!isTRUE(isSharedMemGaloisLog)){ + ## To check the device kind + pos = regexpr('-pset', cmdLineSplit) + deviceKind = "" + if(sum(pos>0) > 0){ + deviceKind = "GPU" + } else { + deviceKind = "CPU" + } + } + + ## First postitional argument is always name of the executable + ### WORKING: split the exePath name found at the position 1 of the argument list and split on "/". + exePathSplit <- strsplit(cmdLineSplit[1], "/")[[1]] + benchmark <- exePathSplit[length(exePathSplit)] + + ## subset the threads row from the table + numThreads <- (subset(logData, CATEGORY == "Threads" & TOTAL_TYPE != "HostValues"))$TOTAL + + input = "noInput" + if(isTRUE(graphPassedAsInput)){ + ## subset the input row from the table + inputPath <- (subset(logData, CATEGORY == "Input" & STAT_TYPE == "PARAM"))$TOTAL + print(inputPath) + if(!identical(inputPath, character(0))){ + inputPathSplit <- strsplit(inputPath, "/")[[1]] + input <- inputPathSplit[length(inputPathSplit)] + } + else { + inputPathSplit <- strsplit(inputPath[[2]], "/")[[1]] + input <- inputPathSplit[length(inputPathSplit)] + } + + ### This is to remore the extension for example .gr or .sgr + inputsplit <- strsplit(input, "[.]")[[1]] + if(length(inputsplit) > 1) { + input <- inputsplit[1] + } + } + + if(isTRUE(isSharedMemGaloisLog)){ + returnList <- list("benchmark" = benchmark, "input" = input, "numThreads" = numThreads, "deviceKind" = deviceKind) + return(returnList) + } + + ## Need more params for distributed galois logs + numHosts <- (subset(logData, CATEGORY == "Hosts"& TOTAL_TYPE != "HostValues"))$TOTAL + + partitionScheme <- (subset(logData, CATEGORY == "PartitionScheme"& TOTAL_TYPE != "HostValues"))$TOTAL + + runID <- (subset(logData, CATEGORY == "Run_UUID"& TOTAL_TYPE != "HostValues"))$TOTAL + + numIterations <- (subset(logData, CATEGORY == "NumIterations_0"& TOTAL_TYPE != "HostValues"))$TOTAL + #If numIterations is not printed in the log files + if(identical(numIterations, character(0))){ + numIterations <- 0 + } + + end2endTimer <- (subset(logData, CATEGORY == "Timer_0"& TOTAL_TYPE != "HostValues"))$TOTAL + end2endTimer <- convertZeroTosStr(end2endTimer) + + aggr_fwd <- (subset(logData, CATEGORY == "AggregateForward"))$TOTAL + aggr_fwd <- convertZeroTosStr(aggr_fwd) + + aggr_bwd <- (subset(logData, CATEGORY == "AggregateBackward"))$TOTAL + aggr_bwd <- convertZeroTosStr(aggr_bwd) + + fwd_total <- (subset(logData, CATEGORY == "ForwardPhase"))$TOTAL + fwd_total <- convertZeroTosStr(fwd_total) + + fwd_xform <- (subset(logData, CATEGORY == "ForwardXForm"))$TOTAL + fwd_xform <- convertZeroTosStr(fwd_xform) + + bwd_total <- (subset(logData, CATEGORY == "BackwardPhase"))$TOTAL + bwd_total <- convertZeroTosStr(bwd_total) + + bwd_xform <- (subset(logData, CATEGORY == "BackwardXForm"))$TOTAL + bwd_xform <- convertZeroTosStr(bwd_xform) + + avg_epoch <- (subset(logData, CATEGORY == "AverageEpochTime"))$TOTAL + avg_epoch <- convertZeroTosStr(avg_epoch) + + final_accuracy <- (subset(logData, CATEGORY == "FinalTestAccuracy"))$TOTAL + + train_time <- (subset(logData, CATEGORY == "TrainingTime"))$TOTAL + train_time <- convertZeroTosStr(train_time) + + sync_aggr <- (subset(logData, CATEGORY == "Sync_GraphAggregateSync_0"))$TOTAL + sync_aggr <- convertZeroTosStr(sync_aggr) + + 
sync_weight <- (subset(logData, CATEGORY == "Sync_WeightGradientsSum"))$TOTAL
+  sync_weight <- convertZeroTosStr(sync_weight)
+
+  buff_breserve_time <- (subset(logData, CATEGORY ==
+                                "BroadcastExtract_GraphAggregateSync_0"))$TOTAL
+  buff_breserve_time <- convertZeroTosStr(buff_breserve_time)
+  buff_bextract_time <- (subset(logData, CATEGORY ==
+                                "BroadcastExtractBatch_GraphAggregateSync_0"))$TOTAL
+  buff_bextract_time <- convertZeroTosStr(buff_bextract_time)
+  buff_rreserve_time <- (subset(logData, CATEGORY ==
+                                "ReduceExtract_GraphAggregateSync_0"))$TOTAL
+  buff_rreserve_time <- convertZeroTosStr(buff_rreserve_time)
+  buff_rextract_time <- (subset(logData, CATEGORY ==
+                                "ReduceExtractBatch_GraphAggregateSync_0"))$TOTAL
+  buff_rextract_time <- convertZeroTosStr(buff_rextract_time)
+
+  print(input)
+  print(partitionScheme)
+  print(numHosts)
+  ## returnList for distributed galois log
+  returnList <- list("RunID" = runID, "Benchmark" = benchmark,
+                     "Input" = input, "PartitionScheme" = partitionScheme,
+                     "Hosts" = numHosts, "NumThreads" = numThreads,
+                     "EndToEndTime" = end2endTimer,
+                     "TrainTime" = train_time,
+                     "TotalForwardTime" = fwd_total,
+                     "ForwardAggregate" = aggr_fwd,
+                     "ForwardXform" = fwd_xform,
+                     "TotalBackwardTime" = bwd_total,
+                     "BackwardAggregate" = aggr_bwd,
+                     "BackwardXform" = bwd_xform,
+                     "AverageEpochTime" = avg_epoch,
+                     "FinalTestAccuracy" = final_accuracy,
+                     "AggregateSync" = sync_aggr,
+                     "Broadcast_buf_reserve" = buff_breserve_time,
+                     "Broadcast_buf_extract" = buff_bextract_time,
+                     "Reduce_buf_reserve" = buff_rreserve_time,
+                     "Reduce_buf_extract" = buff_rextract_time,
+                     "AggregateWeight" = sync_weight,
+                     "DeviceKind" = deviceKind)
+
+  print("List")
+  print(returnList)
+  # Timer is in milli-sec units
+  return(returnList)
+}
+#### END: @function to parse commandline ##################
+
+#### START: @function entry point for galois log parser ##################
+galoisLogParser <- function(input, output) {
+  logData <- read.csv(input, stringsAsFactors=F, strip.white=T)
+
+  printNormalStats = TRUE;
+  print("Parsing commandline")
+  paramList <- parseCmdLine(logData, F, T)
+  print("Parsing timers for shared memory galois log")
+
+  ## if computing RSD then normal stats are not printed
+  if(isTRUE(printNormalStats)){
+    if(!file.exists(output)){
+      print(paste(output, "Does not exist. Creating new file"))
+      print(as.data.frame(paramList))
+      write.csv(as.data.frame(paramList), file=output, row.names=F, quote=F)
+    } else {
+      print(paste("Appending data to the existing file", output))
+      write.table(as.data.frame(paramList), file=output, row.names=F, col.names=F, quote=F, append=T, sep=",")
+    }
+  }
+}
+#### END: @function entry point for galois log parser ##################
+
+#############################################
+## Commandline options.
+#######################################
+option_list = list(
+  make_option(c("-i", "--input"), action="store", default=NA, type='character',
+              help="name of the input file to parse"),
+  make_option(c("-o", "--output"), action="store", default=NA, type='character',
+              help="name of the output file parsed")
+  )
+
+opt_parser <- OptionParser(usage = "%prog [options] -i input.log -o output.csv", option_list=option_list)
+opt <- parse_args(opt_parser)
+
+if (is.na(opt$i)){
+  print_help(opt_parser)
+  stop("At least one argument must be supplied (input file)", call.=FALSE)
+} else {
+  if (is.na(opt$o)){
+    print("Output file name is not specified. 
Using name ouput.csv as default") + opt$o <- "output.csv" + } + galoisLogParser(opt$i, opt$o) +} + +##################### END ##################### diff --git a/scripts/run-gpu.sh b/scripts/run-gpu.sh new file mode 100644 index 0000000000..9f78915a03 --- /dev/null +++ b/scripts/run-gpu.sh @@ -0,0 +1,44 @@ +EXECS=( "gcn-dist" "gcn-dist-pinned" ) +#INPUTS=( "ogbn-products" ) +INPUTS=( "reddit" "ogbn-products" ) +#INPUTS=( "ogbn-papers100M" ) +TYPES=( "sage" ) +LSIZE=16 +NLAYERS=2 +EPOCH=200 +PSET="g" + +for e in "${EXECS[@]}" +do + for t in 0 + do + echo "Iter:"$t + PSET="g" + for n in 1 2 3 4 + do + for i in "${INPUTS[@]}" + do + for k in "${TYPES[@]}" + do + TYPES_STR=${k} + LSIZE_STR=${LSIZE} + for nr in {1..${NLAYERS}} + do + TYPES_STR+=","${k} + LSIZE_STR+=","${LSIZE} + done + echo GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $n ./${e} -inputDirectory=/net/ohm/export/iss/inputs/Learning/ -epochs=${EPOCH} \ + -layerTypes=${TYPES_STR} -disableDropout ${i} -layerSizes=${LSIZE_STR} \ + -numLayers=${NLAYERS} -t=56 -statFile=${e}_${i}_${k}_${LSIZE}_${NLAYERS}_${PSET}_${t}.stats -pset=${PSET} -numNodes=1 + + + CUDA_VISIBLE_DEVICES=2,3,4,5 GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $n ./${e} -inputDirectory=/net/ohm/export/iss/inputs/Learning/ -epochs=${EPOCH} \ + -layerTypes=${TYPES_STR} -disableDropout ${i} -layerSizes=${LSIZE_STR} \ + -numLayers=${NLAYERS} -t=56 -statFile=${e}_${i}_${k}_${LSIZE}_${NLAYERS}_${PSET}_${t}.stats -pset=${PSET} -numNodes=1 + done + done + PSET+="g" + echo $PSET + done + done +done diff --git a/scripts/run_gnnsys.sh b/scripts/run_gnnsys.sh new file mode 100644 index 0000000000..3b6ec31e70 --- /dev/null +++ b/scripts/run_gnnsys.sh @@ -0,0 +1,57 @@ +#!/bin/bash + +INPUTDIR="/net/ohm/export/iss/inputs/Learning/" +#EXECS=( "gcn-dist" "gcn-dist-pinned" ) +EXECS=( "gcn-dist" ) +#INPUTS=( "cora" "reddit" "ogbn-products" ) +INPUTS=( "reddit" ) +LAYERTYPES=( "sage" "gcn" ) +#LAYERTYPES=( "gcn" ) +LAYERSIZE=16 +NUMLAYERS=2 +#PARTITIONS=( "oec" "cvc" ) +PARTITIONS=( "oec" ) +DEVICES="0" + +FLAGS=" -epochs=200" +#FLAGS+=" -disableDropout" +#FLAGS+=" -testInterval=50" + +PREFIX="GALOIS_DO_NOT_BIND_THREADS=1 " + +for input in "${INPUTS[@]}" +do + for partition in "${PARTITIONS[@]}" + do +#for num_gpus in {2..4} + for num_gpus in 1 + do + PSET="g" + for ngpus in $(seq 2 ${num_gpus}) + do + PSET+="g" + done + for layer in "${LAYERTYPES[@]}" + do + for exe in "${EXECS[@]}" + do + # Variable parameters + LSIZE_STR=$LAYERSIZE + LTYPE_STR=$layer + for r in {1..${NUMLAYERS}} + do + LSIZE_STR+=","$LAYERSIZE + LTYPE_STR+=","$layer + done + echo "CUDA_VISIBLE_DEVICES=${DEVICES} GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $num_gpus ./${exe} $input $FLAGS -layerTypes=${LTYPE_STR} -t=1 \ + -pset=${PSET} -layerSizes=${LSIZE_STR} -numNodes=1 --inputDirectory=${INPUTDIR} \ + -statFile=${exe}_${input}_${layer}_${NUMLAYERS}_${LAYERSIZE}_${PSET}_${partition}.stat -partition=${partition}" + + CUDA_VISIBLE_DEVICES=${DEVICES} GALOIS_DO_NOT_BIND_THREADS=1 mpirun -np $num_gpus ./${exe} $input $FLAGS -layerTypes=${LTYPE_STR} -t=1 \ + -pset=${PSET} -layerSizes=${LSIZE_STR} -numNodes=1 --inputDirectory=${INPUTDIR} \ + -statFile=${exe}_${input}_${layer}_${NUMLAYERS}_${LAYERSIZE}_${PSET}_${partition}.stat -partition=${partition} + done + done + done + done +done From bd7b7cda0d831622cc5616606aa6a46fa50ec081 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 3 May 2021 22:18:32 -0500 Subject: [PATCH 529/660] Add a GPU wrapper and fix libgnn CMakList error --- libgnn/src/graphs/GNNGraph.cpp | 6 +++++- 
libgnn/test/CMakeLists.txt | 4 ++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index cb63fbe307..dc175b5afc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -540,7 +540,11 @@ void galois::graphs::GNNGraph::InitNormFactor() { global_degrees_.resize(partitioned_graph_->size(), 0.0); global_train_degrees_.resize(partitioned_graph_->size(), 0.0); CalculateFullNormFactor(); - gpu_memory_.InitNormFactor(partitioned_graph_->size()); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_memory_.InitNormFactor(partitioned_graph_->size()); + } +#endif } void galois::graphs::GNNGraph::CalculateFullNormFactor() { diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 9834b302e7..11c7ab78b8 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -59,14 +59,14 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) foreach(host_count ${hosts}) - add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} aggregate-sync-test) + add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./aggregate-sync-test) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() add_executable(back-conv-test back-conv-test.cpp) target_link_libraries(back-conv-test galois_gnn) foreach(host_count ${hosts}) - add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} back-conv-test) + add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./back-conv-test) set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() From 50a767a8686bbf3844690ed6a28f7fb02630f763 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 3 May 2021 16:18:42 -0500 Subject: [PATCH 530/660] Minibatch generator loops through masters only Minibatch generator should only loop through masters to avoid duplicates in minibatches in a distributed setting. This assumes like all other parts of the code at the moment are that master nodes always take the prefix of node ids. Generator takes the right bound of it and uses it instead of the mask size. --- libgnn/include/galois/MinibatchGenerator.h | 15 +++++++++------ libgnn/include/galois/graphs/GNNGraph.h | 4 ++-- libgnn/src/MinibatchGenerator.cpp | 10 ++++++---- libgnn/src/graphs/GNNGraph.cpp | 13 +++++++++---- 4 files changed, 26 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 0bd063b90c..11bce02848 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -1,6 +1,7 @@ #pragma once #include "galois/GNNTypes.h" +#include "galois/Logging.h" namespace galois { @@ -8,21 +9,23 @@ namespace galois { //! 
the minibatch for class MinibatchGenerator { public: - MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size) - : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size} { + MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, + size_t master_bound) + : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, + master_bound_{master_bound} { + GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } void GetNextMinibatch(std::vector* batch_mask); //! True if no more minibatches from this generator - bool NoMoreMinibatches() { - return current_position_ == mask_to_minibatch_.size(); - } + bool NoMoreMinibatches() { return current_position_ == master_bound_; } //! Reset the only state (a position bit) void ResetMinibatchState() { current_position_ = 0; } private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; - size_t current_position_{0}; + size_t current_position_; + size_t master_bound_; }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2d4bb5356b..7d6867a5c8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -317,8 +317,8 @@ class GNNGraph { // clear before remake train_batcher_.reset(); } - train_batcher_ = std::make_unique(local_training_mask_, - train_batch_size); + train_batcher_ = std::make_unique( + local_training_mask_, train_batch_size, *end_owned()); local_minibatch_mask_.resize(partitioned_graph_->size()); } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 7c3b6dd831..48570c094e 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -3,16 +3,18 @@ void galois::MinibatchGenerator::GetNextMinibatch( std::vector* batch_mask) { - std::fill(batch_mask->begin(), batch_mask->end(), 0); assert(current_position_ <= mask_to_minibatch_.size()); + assert(current_position_ <= master_bound_); assert(batch_mask->size() == mask_to_minibatch_.size()); - if (current_position_ >= mask_to_minibatch_.size()) { + + std::fill(batch_mask->begin(), batch_mask->end(), 0); + if (current_position_ >= master_bound_) { return; } size_t current_count = 0; // start from last positiion - while (current_position_ < mask_to_minibatch_.size()) { + while (current_position_ < master_bound_) { if (mask_to_minibatch_[current_position_]) { // XXX and a master node; seed nodes only exist locally (*batch_mask)[current_position_] = 1; @@ -27,7 +29,7 @@ void galois::MinibatchGenerator::GetNextMinibatch( // advance current position to next set bit for next call (or to end to detect // no more minibatches while (!mask_to_minibatch_[current_position_] && - (current_position_ < mask_to_minibatch_.size())) { + (current_position_ < master_bound_)) { current_position_++; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index dc175b5afc..8b1374b271 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -100,10 +100,6 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // init norm factors (involves a sync call) InitNormFactor(); - // XXX remove this - test_batcher_ = - std::make_unique(local_testing_mask_, 2000); - #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // allocate/copy data structures over to GPU @@ -936,6 +932,15 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { void 
galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); +#ifndef NDEBUG + galois::gPrint("Minibatch : "); + for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { + if (local_minibatch_mask_[i]) { + galois::gPrint(i, ","); + } + } + galois::gPrint("\n"); +#endif SetupNeighborhoodSample(GNNPhase::kBatch); } From 8366824ba002505687af716304959abbb0e6d2db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 4 May 2021 16:32:52 -0500 Subject: [PATCH 531/660] Fixes to make distributed minibatch work This commit allows for distributed minibatching. 1) Fixes sample sync: write dest rather than write source since sample activation is occuring on dest. 2) Adds reduce call to check if all hosts no longer have any more minibatches before moving on to the next minibatch. 3) Safety assertions added to subgraph. --- libgnn/include/galois/GraphNeuralNetwork.h | 3 +++ .../graphs/GraphAggregationSyncStructures.h | 4 +--- libgnn/src/GraphNeuralNetwork.cpp | 5 ++-- libgnn/src/graphs/GNNGraph.cpp | 23 +++++++++++-------- libgnn/src/graphs/GNNSubgraph.cpp | 6 ++++- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 580738b133..9925764bef 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -203,6 +203,9 @@ class GraphNeuralNetwork { //! Number of layers that use the graph (e.g. SAGE, GCN) size_t num_graph_user_layers_; + //! Termination detection for minibatching + galois::DGAccumulator work_left_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 0dd43c3308..51f52e5323 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -23,10 +23,8 @@ struct SampleFlagSync { //! return a vector of floats to sync static ValTy extract(uint32_t, char& i) { return i; } - //! reduction is addition in this case; add received vector to - //! 
own vector static bool reduce(uint32_t, char& i, ValTy y) { - if (y > i) { + if (y) { i = y; assert(i == 1); return true; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index cb139191b4..c5d2aea91e 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -237,6 +237,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // XXX // create mini batch graphs and loop until minibatches on all hosts done while (true) { + work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches graph_->PrepareNextTrainMinibatch(); @@ -263,13 +264,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } const PointerWithSize batch_pred = DoInference(); - DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); galois::gPrint("Epoch ", epoch, " Batch ", batch_num, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); + work_left_ += graph_->MoreTrainMinibatches(); // XXX sync across all hosts minibatcher state - if (!graph_->MoreTrainMinibatches()) { + if (!work_left_.reduce()) { break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8b1374b271..50af592b99 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -510,7 +510,6 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } } else { - // XXX i can get local sample counts from here if i need it size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, &local_training_mask_); @@ -837,11 +836,13 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { // continue the exploration galois::do_all( galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + [&](uint32_t new_node_id) { + SetSampledNode(new_node_id); + }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync("SampleSync"); + sync_substrate_->sync( + "SampleSync"); } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, @@ -865,7 +866,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // times (degree norm is 1 / degree) // XXX training degree + other norm, not global double probability_of_reject = - std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + std::pow(1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { if (sample_rng_.DoBernoulli(probability_of_reject)) { @@ -911,7 +912,8 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::loopname("NeighborhoodSampleSet")); // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync("SampleSync"); + sync_substrate_->sync( + "SampleSync"); } //! 
Construct the subgraph from sampled edges and corresponding nodes @@ -933,13 +935,16 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); #ifndef NDEBUG - galois::gPrint("Minibatch : "); + size_t count = 0; + // galois::gPrint("Minibatch : "); for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { if (local_minibatch_mask_[i]) { - galois::gPrint(i, ","); + // galois::gPrint(partitioned_graph_->getGID(i), ","); + count++; } } - galois::gPrint("\n"); + // galois::gPrint("\n"); + galois::gInfo(host_prefix(), "num batched nodes ", count); #endif SetupNeighborhoodSample(GNNPhase::kBatch); } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 387e3fc250..a5f6d925ec 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -137,7 +137,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( [&](uint32_t node_id) { if (gnn_graph.IsInSampledGraph(node_id)) { uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - + assert(subgraph_id != std::numeric_limits::max()); uint32_t out_location = 0; uint32_t in_location = 0; if (subgraph_id != 0) { @@ -147,7 +147,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( for (auto out_edge_iter : gnn_graph.edges(node_id)) { if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert( + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); subedge_to_original_edge_[out_location] = *out_edge_iter; + underlying_graph_.constructEdge( out_location++, lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); From 6566351b4478b11fd342a5b8e2f12c3206f4c43a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 13:35:15 -0500 Subject: [PATCH 532/660] Fanout CLI, sample bitset, sampling behavior, time 1) Added batch timer to minibatch setting 2) Fanout command line argument for sampling 3) "DoGraphSampling" argument needs to be toggled to do any kind of sampling, else all edges will be selected when creating subgraph. 4) Bitset used to sync sampling flag during sampling. TODO doInductiveTraining needs to be changed to "use training subgraph" or something similar; inductive training needs to be its own argument (that will be done next commit) --- libgnn/include/galois/GraphNeuralNetwork.h | 4 +++ .../graphs/GraphAggregationSyncStructures.h | 10 ++++++ libgnn/src/GraphNeuralNetwork.cpp | 35 +++++++++++-------- libgnn/src/graphs/GNNGraph.cpp | 33 +++++++++-------- lonestar/libgnnbench/src/Input.cpp | 30 ++++++++++++++-- 5 files changed, 78 insertions(+), 34 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 9925764bef..fcc620738b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -102,6 +102,8 @@ class GraphNeuralNetworkConfig { // public because they are independent of other settings //! Graph sampling bool do_sampling_{false}; + // XXX Change the name of this var; it just means to create subgraph + // based on training nodes //! Inductive = training ignores test/val set bool inductive_training_{false}; //! Interval to run validation set on network at; 0 = no run @@ -109,6 +111,8 @@ class GraphNeuralNetworkConfig { //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; unsigned train_minibatch_size_{0}; + //! 
Fan out used for sampling (if sampling is enabled) + std::vector fan_out_vector_; private: //! Number of layers to construct in the GNN not including the output diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 51f52e5323..89ccc83324 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -12,6 +12,7 @@ extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; +extern galois::DynamicBitSet bitset_sample_flag_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -55,6 +56,15 @@ struct SampleFlagSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +struct SampleFlagBitset { + static constexpr bool is_vector_bitset() { return false; } + static constexpr bool is_valid() { return true; } + static galois::DynamicBitSet& get() { return bitset_sample_flag_; } + static void reset_range(size_t begin, size_t end) { + bitset_sample_flag_.reset(begin, end); + } +}; + struct GNNSumAggregate { using ValTy = galois::gstl::Vector; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index c5d2aea91e..5336e07756 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -159,6 +159,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; size_t inductive_nodes = 0; + // this subgraph only needs to be created once if (config_.inductive_training_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); @@ -184,6 +185,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); + // swap to inductive graph if (config_.inductive_training_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -192,6 +194,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } + // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { graph_->SetupNeighborhoodSample(); size_t num_sampled_layers = 0; @@ -203,16 +206,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - if (num_sampled_layers == 0) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); - } else { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); - } + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers]); num_sampled_layers++; } } - galois::gDebug("Number of sampled layers is ", num_sampled_layers); - // resize layer matrices size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -234,9 +232,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { size_t batch_num = 0; - // XXX // create mini batch graphs and loop until minibatches on all hosts done while (true) { + const std::string btime_name("Epoch" + 
std::to_string(epoch) + "Batch" + + std::to_string(batch_num)); + galois::StatTimer batch_timer(btime_name.c_str(), "GraphNeuralNetwork"); + batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches @@ -247,17 +248,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - if (num_sampled_layers == 0) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 10); + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + if (config_.do_sampling()) { + graph_->SampleEdges((*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers]); } else { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), 25); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); } num_sampled_layers++; } } // resize layer matrices size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); - galois::gPrint(num_subgraph_nodes, "\n"); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(num_subgraph_nodes); @@ -266,11 +269,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num, + + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); work_left_ += graph_->MoreTrainMinibatches(); - // XXX sync across all hosts minibatcher state - if (!work_left_.reduce()) { + char global_work_left = work_left_.reduce(); + batch_timer.stop(); + if (!global_work_left) { break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 50af592b99..8e021327b6 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -47,6 +47,8 @@ galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; +galois::DynamicBitSet bitset_sample_flag_; + //! 
For synchronization of sampled degrees galois::DynamicBitSet bitset_sampled_degrees_; std::vector>* gnn_sampled_out_degrees_; @@ -764,8 +766,8 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; - new_sampled_nodes_.resize(size()); - new_sampled_nodes_.reset(); + bitset_sample_flag_.resize(size()); + bitset_sample_flag_.reset(); // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { @@ -795,7 +797,6 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { - assert(subgraph_is_inductive_); use_subgraph_ = false; galois::GAccumulator sampled; @@ -817,7 +818,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { MakeEdgeSampled(edge_iter, agg_layer_num); if (!IsInSampledGraph( partitioned_graph_->getEdgeDst(edge_iter))) { - new_sampled_nodes_.set( + bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } sampled += 1; @@ -831,18 +832,16 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), " out of ", total.reduce(), "\n"); - std::vector new_nodes = new_sampled_nodes_.getOffsets(); + std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( galois::iterate(new_nodes), - [&](uint32_t new_node_id) { - SetSampledNode(new_node_id); - }, + [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync( - "SampleSync"); + sync_substrate_ + ->sync( + "SampleSync"); } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, @@ -881,7 +880,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, MakeEdgeSampled(edge_iter, sample_layer_num); if (!IsInSampledGraph( partitioned_graph_->getEdgeDst(edge_iter))) { - new_sampled_nodes_.set( + bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } bitset_sampled_degrees_.set(*src_iter); @@ -902,7 +901,7 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", sampled.reduce(), " out of ", total.reduce()); - std::vector new_nodes = new_sampled_nodes_.getOffsets(); + std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration @@ -911,9 +910,9 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, galois::loopname("NeighborhoodSampleSet")); - // XXX(loc) bitset; can readAny be weaker? - sync_substrate_->sync( - "SampleSync"); + sync_substrate_ + ->sync( + "SampleSync"); } //! 
Construct the subgraph from sampled edges and corresponding nodes @@ -944,7 +943,7 @@ void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { } } // galois::gPrint("\n"); - galois::gInfo(host_prefix(), "num batched nodes ", count); + galois::gInfo(host_prefix(), "Batched nodes ", count); #endif SetupNeighborhoodSample(GNNPhase::kBatch); } diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 921baaa4df..7914cdf6ea 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,13 +26,13 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); -llvm::cl::opt num_layers( +llvm::cl::opt num_layers( "numLayers", cll::desc( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( +llvm::cl::list layer_sizes( "layerSizes", cll::desc( "Comma separated list of numbers specifying " @@ -53,6 +53,12 @@ llvm::cl::list cl_layer_types( clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::CommaSeparated); +llvm::cl::list cl_fan_out_vector( + "samplingFanOut", + cll::desc( + "Comma separated list of layer fanout if sampling/batching is used"), + cll::CommaSeparated); + llvm::cl::opt disable_dropout("disableDropout", cll::desc("If true (off by default), disables dropout of " @@ -253,6 +259,25 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { return std::make_unique(opt_sizes, num_layers); } +std::vector CreateFanOutVector() { + std::vector fan_out; + // fan out only matters if graph sampling is enabled + if (do_graph_sampling) { + // assert fan out size is the same + if (cl_fan_out_vector.size() == num_layers) { + + } else { + galois::gWarn("Fan out specification does not equal number of layers: " + "using default 10 followed by 25s"); + fan_out.emplace_back(10); + for (unsigned i = 1; i < num_layers; i++) { + fan_out.emplace_back(25); + } + } + } + return fan_out; +} + std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( @@ -273,6 +298,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From b6696aafbcf746b0f968ed0feb520a0a92884695 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 13:51:12 -0500 Subject: [PATCH 533/660] doInductiveTraining changed to useTrainingSubgraph Inductive training in my definition means that you only use training nodes and nonval/test nodes during training phase. Option changed to training subgraph because it's more in line with what occurs: you create subgraph based on training subgraph and don't need to compute the rest. Right now in this commit it will still only pick up training/other nodes, but next commit will add another option that will allow it to pick up val/test nodes during sampling/subgraph construction. 
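
To make the intended semantics concrete: with the inductive option, an edge is only
eligible for the sampled subgraph when its destination is a training vertex or an
unlabeled ("other") vertex. A minimal sketch of that eligibility check follows; it
reuses the IsValidForPhase/GNNPhase names that appear in the GNNGraph diffs of the
surrounding patches, but it is an illustrative example and not code from this patch.

// Illustrative sketch only (assumes the GNNGraph interface shown in the diffs).
// Returns true if `dst` may be pulled into the sampled subgraph.
#include "galois/graphs/GNNGraph.h"

bool DestinationEligible(const galois::graphs::GNNGraph& graph, uint32_t dst,
                         bool inductive_subgraph) {
  if (!inductive_subgraph) {
    // transductive case: any neighbor can be aggregated
    return true;
  }
  // inductive case: skip validation/test vertices entirely
  return graph.IsValidForPhase(dst, galois::GNNPhase::kTrain) ||
         graph.IsValidForPhase(dst, galois::GNNPhase::kOther);
}
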
--- libgnn/include/galois/GraphNeuralNetwork.h | 5 ++-- libgnn/include/galois/graphs/GNNGraph.h | 4 +-- libgnn/include/galois/layers/GNNLayer.h | 3 --- libgnn/src/GraphNeuralNetwork.cpp | 27 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 6 ++--- libgnn/src/layers/GraphConvolutionalLayer.cpp | 12 --------- lonestar/libgnnbench/src/Input.cpp | 14 +++++----- 7 files changed, 27 insertions(+), 44 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fcc620738b..bf4b4b2f3b 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -104,8 +104,9 @@ class GraphNeuralNetworkConfig { bool do_sampling_{false}; // XXX Change the name of this var; it just means to create subgraph // based on training nodes - //! Inductive = training ignores test/val set - bool inductive_training_{false}; + //! Creates subgraph that is only composed of training nodes (reduces + //! redundant work since you won't calculate things you don't need) + bool use_train_subgraph_{false}; //! Interval to run validation set on network at; 0 = no run unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 7d6867a5c8..29d25afaf7 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -358,7 +358,7 @@ class GNNGraph { GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_) { size_t degree; - if (!subgraph_is_inductive_) { + if (!subgraph_is_train_) { // case because degrees in each layer differ degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; @@ -638,7 +638,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; - bool subgraph_is_inductive_{false}; + bool subgraph_is_train_{false}; std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9a71432471..728c0ecae4 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -65,8 +65,6 @@ struct GNNLayerConfig { bool disable_self_aggregate{false}; //! Graph sampling flag in use or not bool do_sampling{false}; - //! Inductive layer means for aggregation all non-training nodes are ignored - bool inductive_training_{false}; // TODO activation type; for now default is softmax //! Sets settings such that testing is easy @@ -178,7 +176,6 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } bool IsSampledLayer() const { return config_.do_sampling; } - bool IsInductiveLayer() const { return config_.inductive_training_; } //! Sets the graph user layer number; important for sampling as this index //! 
determines which index to use when checking for sampled edges void SetGraphUserLayerNumber(size_t num) { graph_user_layer_number_ = num; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 5336e07756..462d8813fc 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -13,9 +13,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( galois::GraphNeuralNetworkConfig&& config) : graph_(std::move(graph)), optimizer_(std::move(optimizer)), config_(std::move(config)) { - if (config_.do_sampling_ && config_.inductive_training_) { - GALOIS_LOG_FATAL("Do not set inductive training and sampling at same time " - "(sampling is inductive already)"); + if (config_.do_sampling_ && config_.use_train_subgraph_) { + GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " + "(sampling uses training subgraph already)"); } // max number of rows that can be passed as inputs; allocate space for it as // this will be the # of rows for each layer @@ -103,11 +103,11 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // XXX test minibatch - if (config_.do_sampling() || config_.inductive_training_ || + if (config_.do_sampling() || config_.use_train_subgraph_ || config.train_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, - config_.inductive_training_); + config_.use_train_subgraph_); } if (config_.train_minibatch_size()) { @@ -158,9 +158,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - size_t inductive_nodes = 0; + size_t train_subgraph_nodes = 0; // this subgraph only needs to be created once - if (config_.inductive_training_ && !config_.train_minibatch_size()) { + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph graph_->SetupNeighborhoodSample(); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -172,10 +172,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } // resize layer matrices - inductive_nodes = graph_->ConstructSampledSubgraph(); + train_subgraph_nodes = graph_->ConstructSampledSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(inductive_nodes); + (*layer)->ResizeRows(train_subgraph_nodes); } } @@ -185,12 +185,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); - // swap to inductive graph - if (config_.inductive_training_ && !config_.train_minibatch_size()) { + // swap to train subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(inductive_nodes); + (*layer)->ResizeRows(train_subgraph_nodes); } } @@ -297,7 +297,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { bool do_test = config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; - // get real norm factor back if altered by sampling or inductive training if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); @@ -348,8 +347,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); - // get back inductive norm factor as necessary; sampling norm is handled - // at beginning of every iteration } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8e021327b6..f2650ca8f9 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -760,7 +760,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, array.create(partitioned_graph_->size()); } } else { - subgraph_is_inductive_ = true; + subgraph_is_train_ = true; } } @@ -784,7 +784,7 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - if (!subgraph_is_inductive_) { + if (!subgraph_is_train_) { galois::do_all( galois::iterate(sampled_out_degrees_), [&](galois::LargeArray& array) { @@ -846,7 +846,7 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample) { - assert(!subgraph_is_inductive_); + assert(!subgraph_is_train_); use_subgraph_ = false; galois::GAccumulator sampled; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 7c22627f2f..82522fafd9 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -326,12 +326,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( } if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(src, GNNPhase::kTrain)) - return; - } - if (IsSampledLayer()) { // XXX(loc) GALOIS_LOG_WARN( @@ -367,12 +361,6 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( graphs::bitset_graph_aggregate.set(src); if (layer_phase_ == GNNPhase::kTrain) { - if (IsInductiveLayer()) { - // if inductive, all non-training nodes do not exist - if (!graph_.IsValidForPhase(dst, GNNPhase::kTrain)) - return; - } - if (IsSampledLayer()) { // ignore non-sampled nodes if (layer_phase_ == GNNPhase::kTrain && diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 7914cdf6ea..d33a9bf422 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -114,11 +114,12 @@ llvm::cl::opt "use every epoch at a 50\% drop rate"), cll::init(false)); -llvm::cl::opt - do_inductive_training("doInductiveTraining", - cll::desc("If true (off by default), during training " - "all non-train nodes are ignored"), - cll::init(false)); +llvm::cl::opt use_train_subgraph( + "useTrainingSubgraph", + cll::desc( + "If true (off by default), during training " + "only compute minimum required for training nodes in training phase"), + cll::init(false)); llvm::cl::opt train_minibatch_size("trainMinibatchSize", @@ -219,7 +220,6 @@ galois::GNNLayerConfig CreateLayerConfig() { layer_config.disable_normalization = disable_normalization; layer_config.disable_aggregate_after_update = disable_agg_after_update; layer_config.disable_self_aggregate = disable_self_aggregate; - layer_config.inductive_training_ = do_inductive_training; 
return layer_config; } @@ -294,7 +294,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.inductive_training_ = do_inductive_training; + gnn_config.use_train_subgraph_ = use_train_subgraph; gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; From 90205221f8c459971aa0394394f6250917125cc2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 5 May 2021 15:27:47 -0500 Subject: [PATCH 534/660] Inductive subgraph option, fan out fix 1) Adds CLI to make it so only train/other nodes are looked at when constructing a subgraph. 2) Fixes the fanout implementation; I did not finish implementing it when it was intially committed. --- libgnn/include/galois/GraphNeuralNetwork.h | 4 +- libgnn/include/galois/graphs/GNNGraph.h | 5 +- libgnn/src/GraphNeuralNetwork.cpp | 12 ++-- libgnn/src/graphs/GNNGraph.cpp | 83 +++++++++++++--------- lonestar/libgnnbench/src/Input.cpp | 11 ++- 5 files changed, 72 insertions(+), 43 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index bf4b4b2f3b..fe1cb17477 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -102,11 +102,11 @@ class GraphNeuralNetworkConfig { // public because they are independent of other settings //! Graph sampling bool do_sampling_{false}; - // XXX Change the name of this var; it just means to create subgraph - // based on training nodes //! Creates subgraph that is only composed of training nodes (reduces //! redundant work since you won't calculate things you don't need) bool use_train_subgraph_{false}; + //! If on, subgraphs cannot pick up val/test nodes + bool inductive_subgraph_{false}; //! Interval to run validation set on network at; 0 = no run unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 29d25afaf7..fee528c4b8 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -266,9 +266,10 @@ class GNNGraph { void SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes - void SampleAllEdges(size_t agg_layer_num); + void SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph); //! Sample neighbors of nodes that are marked as ready for sampling - void SampleEdges(size_t sample_layer_num, size_t num_to_sample); + void SampleEdges(size_t sample_layer_num, size_t num_to_sample, + bool inductive_subgraph); //! 
Construct the subgraph from sampled edges and corresponding nodes size_t ConstructSampledSubgraph(); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 462d8813fc..e53cac0b13 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -168,7 +168,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_); } } // resize layer matrices @@ -207,7 +208,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers]); + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_); num_sampled_layers++; } } @@ -252,9 +254,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // relevant neighbors if (config_.do_sampling()) { graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers]); + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_); } else { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number()); + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_); } num_sampled_layers++; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index f2650ca8f9..5c159dc816 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -796,7 +796,8 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sampled_degrees_.reset(); } -void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { +void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, + bool inductive_subgraph) { use_subgraph_ = false; galois::GAccumulator sampled; @@ -811,19 +812,22 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) || - IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - MakeEdgeSampled(edge_iter, agg_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + total += 1; + if (inductive_subgraph) { + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; } - sampled += 1; } - total += 1; + + MakeEdgeSampled(edge_iter, agg_layer_num); + if (!IsInSampledGraph(partitioned_graph_->getEdgeDst(edge_iter))) { + bitset_sample_flag_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + sampled += 1; } } }, @@ -845,7 +849,8 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num) { } void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample) { + size_t num_to_sample, + bool inductive_subgraph) { assert(!subgraph_is_train_); use_subgraph_ = false; 
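
A note on the fan-out math used in the next hunk: for a vertex with degree d and a
layer fan-out of k, the probability that one particular edge is never chosen in k
independent uniform draws is (1 - 1/d)^k, which is the probability_of_reject computed
below (the degree norm is 1/d). For example, with d = 20 and k = 10, (1 - 1/20)^10 =
0.95^10 is roughly 0.60, so each edge survives with probability of roughly 0.40. A
self-contained sketch of the same per-edge test is shown here; std::bernoulli_distribution
stands in for the repository's sample_rng_.DoBernoulli helper, whose exact convention is
not reproduced, so treat this as an illustration rather than the patch's implementation.

#include <cmath>
#include <random>

// Illustrative sketch (not part of the patch): keep or reject a single edge of a
// degree-`degree` vertex when the layer fan-out is `num_to_sample`.
bool KeepEdge(double degree, unsigned num_to_sample, std::mt19937& rng) {
  // chance the edge is never picked across num_to_sample uniform draws
  double probability_of_reject =
      std::pow(1.0 - 1.0 / degree, static_cast<double>(num_to_sample));
  std::bernoulli_distribution reject(probability_of_reject);
  return !reject(rng); // true -> edge stays in the sampled subgraph
}
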
@@ -863,33 +868,43 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (IsInSampledGraph(src_iter)) { // chance of not uniformly choosing an edge of this node num_to_sample // times (degree norm is 1 / degree) - // XXX training degree + other norm, not global - double probability_of_reject = - std::pow(1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + double probability_of_reject; + if (!inductive_subgraph) { + probability_of_reject = + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + } else { + probability_of_reject = std::pow( + 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + } + // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + total += 1; if (sample_rng_.DoBernoulli(probability_of_reject)) { - // only take if node is training node or a node not classified - // into train/test/val - if (IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) || - IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - // if here, it means edge accepted; set sampled on, mark source - // as part of next set - MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (inductive_subgraph) { + // only take if node is training node or a node not classified + // into train/test/val + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; } - bitset_sampled_degrees_.set(*src_iter); - // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; - sampled += 1; } + + // if here, it means edge accepted; set sampled on, mark source + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + if (!IsInSampledGraph( + partitioned_graph_->getEdgeDst(edge_iter))) { + bitset_sample_flag_.set( + partitioned_graph_->getEdgeDst(edge_iter)); + } + bitset_sampled_degrees_.set(*src_iter); + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; + sampled += 1; } - total += 1; } // total_nodes += 1; } diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d33a9bf422..d15adf2d9f 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -121,6 +121,12 @@ llvm::cl::opt use_train_subgraph( "only compute minimum required for training nodes in training phase"), cll::init(false)); +llvm::cl::opt inductive_subgraph( + "inductiveSubgraph", + cll::desc("If true (off by default), only sample training/other nodes when " + "constructing subgraph"), + cll::init(false)); + llvm::cl::opt train_minibatch_size("trainMinibatchSize", cll::desc("Size of training minibatch (default 0)"), @@ -265,7 +271,9 @@ std::vector CreateFanOutVector() { if (do_graph_sampling) { // assert fan out size is the same if (cl_fan_out_vector.size() == num_layers) { - + for (unsigned i = 0; i < num_layers; i++) { + fan_out.emplace_back(cl_fan_out_vector[i]); + } } else { galois::gWarn("Fan out specification does not equal number of layers: " "using default 10 followed by 25s"); @@ -298,6 +306,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + 
gnn_config.inductive_subgraph_ = inductive_subgraph; gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer From f56ef5c4a1f10c9cc7b7e5a405ab235ce20974fd Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 6 May 2021 15:37:43 -0500 Subject: [PATCH 535/660] Addition of an output rows dim for layer dims Adding output rows field to the layer dims struct; this will be required for an optimization coming later to resize intermediate matrices as moving down layers in a sampled subgraph setting. --- libgnn/include/galois/layers/GNNLayer.h | 2 ++ libgnn/src/GraphNeuralNetwork.cpp | 6 ++++-- libgnn/src/layers/GNNLayer.cpp | 13 +++++++++++-- libgnn/src/layers/SAGELayer.cpp | 6 +++--- 4 files changed, 20 insertions(+), 7 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 728c0ecae4..10e44511aa 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -41,6 +41,8 @@ struct GNNLayerDimensions { size_t input_columns; //! Number of columns output of this layer size_t output_columns; + //! If rows change, this is set. Otherwise, ignored. + size_t output_rows; }; //! Config options for operations that can occur in a layer diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index e53cac0b13..2ed988bed0 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -46,7 +46,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( GNNLayerDimensions layer_dims = {.input_rows = max_rows, .input_columns = prev_layer_columns, .output_columns = - config_.intermediate_layer_size(i)}; + config_.intermediate_layer_size(i), + .output_rows = max_rows}; switch (layer_type) { case GNNLayerType::kGraphConvolutional: @@ -121,7 +122,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( // get last intermediate layer column size .input_columns = config_.intermediate_layer_size( config_.num_intermediate_layers() - 1), - .output_columns = config_.output_layer_size()}; + .output_columns = config_.output_layer_size(), + .output_rows = max_rows}; switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 0c01bb788b..2ba3aa5ae3 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -9,6 +9,11 @@ galois::GNNLayer::GNNLayer(size_t layer_num, const GNNLayerConfig& config) : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), config_(config) { + // TODO(loc) + // this is currently a backward-compatibility hack, need to have caller + // set output rows rather than created here + layer_dimensions_.output_rows = layer_dimensions_.input_rows; + if (config_.allocate_weights) { // dropout allocation; dropout is same as input if (!config_.disable_dropout) { @@ -38,6 +43,10 @@ galois::GNNLayer::GNNLayer(size_t layer_num, GlorotBengioInit(&layer_weights_); } + // TODO(loc) optimize this and layer creation in general + // this does not use output_rows and assumes the worst case where + // all nodes are generated + // for now it's kept as input_rows so as to not break things size_t num_output_elements = layer_dimensions_.input_rows * layer_dimensions_.output_columns; @@ -269,7 +278,7 @@ void galois::GNNLayer::Activation() { galois::do_all( galois::iterate(static_cast(0), - layer_dimensions_.input_rows * + layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { if (forward_output_matrix_[i] > 0.0) { @@ 
-299,7 +308,7 @@ void galois::GNNLayer::ActivationDerivative( // keep gradient if the original output was greater than 0 galois::do_all( galois::iterate(static_cast(0), - layer_dimensions_.input_rows * + layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { // it was <= 0 before; set back to 0 diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 9696a9b460..32aa863a0a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -100,7 +100,7 @@ galois::SAGELayer::SAGELayer(size_t layer_num, } size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; + layer_dimensions_.output_rows * layer_dimensions_.output_columns; // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { @@ -179,7 +179,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); // pointer to input to operate on const GNNFloat* input_data = input_embeddings.data(); @@ -225,7 +225,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( } assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); + (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); timer.stop(); From 068437e1fdc9d448dfd455e0c649a320f32445c4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 7 May 2021 15:30:27 -0500 Subject: [PATCH 536/660] WIP: optimize row size as layers progress 1) Seed choosing for sampling needs to sync the seeds across hosts if only master nodes get sampled (which is what it has been changed to if I remember correctly). Needs sync esp. if minibatching since not all hosts know what other hosts are minibatching. 2) In minibatch setting, rows will decrease as some nodes will not need to aggregate/not need xform. This has been implemented in minibatching setting (but not others, since WIP). 3) To support #2 above, subgraph construction has changed so that the SIDs are suffixed in the order that they are deactivated as the rows progress. This increases construction time (how much is something I need to check). 4) SAGE layer updated to deal with #2; input/output rows used depending on phase. NOTE: Same as other commits: SAGE only; GCN is super outdated at this point. --- libgnn/include/galois/graphs/GNNGraph.h | 25 ++++-- libgnn/include/galois/graphs/GNNSubgraph.h | 5 +- libgnn/include/galois/layers/GNNLayer.h | 6 ++ libgnn/src/GraphNeuralNetwork.cpp | 94 ++++++++++++++++----- libgnn/src/graphs/GNNGraph.cpp | 97 ++++++++++++++++++---- libgnn/src/graphs/GNNSubgraph.cpp | 38 ++++++--- libgnn/src/layers/GNNLayer.cpp | 1 + libgnn/src/layers/SAGELayer.cpp | 65 ++++++++++----- 8 files changed, 247 insertions(+), 84 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fee528c4b8..5ff892057c 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -262,17 +262,24 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! 
Set seed nodes, i.e., nodes that are being predicted on - void SetupNeighborhoodSample() { SetupNeighborhoodSample(GNNPhase::kTrain); } - void SetupNeighborhoodSample(GNNPhase seed_phase); + size_t SetupNeighborhoodSample() { + return SetupNeighborhoodSample(GNNPhase::kTrain); + } + size_t SetupNeighborhoodSample(GNNPhase seed_phase); //! Choose all edges from sampled nodes - void SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph); + size_t SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, + size_t timestamp); //! Sample neighbors of nodes that are marked as ready for sampling - void SampleEdges(size_t sample_layer_num, size_t num_to_sample, - bool inductive_subgraph); + size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, + bool inductive_subgraph, size_t timestamp); //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(); + size_t ConstructSampledSubgraph(size_t num_sampled_layers); + + unsigned SampleNodeTimestamp(unsigned lid) const { + return sample_node_timestamps_[lid]; + } void EnableSubgraph() { use_subgraph_ = true; } void DisableSubgraph() { use_subgraph_ = false; } @@ -327,7 +334,7 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - void PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch(); //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; ////////////////////////////////////////////////////////////////////////////// @@ -595,6 +602,10 @@ class GNNGraph { //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; + // TODO use a char maybe? unlikely anyone will go over 2^8 layers... + //! What timestep a node was added to sampled set; used to determine + //! size of subgraph at each layer + galois::LargeArray sample_node_timestamps_; //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 976303be84..29b4429e17 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -15,7 +15,7 @@ class GNNSubgraph { } //! Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits - size_t BuildSubgraph(GNNGraph& gnn_graph); + size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); galois::gstl::Vector& GetLocalFeatures() { return subgraph_node_features_; @@ -99,7 +99,8 @@ class GNNSubgraph { private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph); + void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph, + size_t num_sampled_layers); //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! 
Creates edges diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 10e44511aa..124d3a80a7 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -102,6 +102,12 @@ class GNNLayer { // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) } + virtual void ResizeInputOutputRows(size_t input_row, size_t output_row) { + layer_dimensions_.input_rows = input_row; + layer_dimensions_.output_rows = output_row; + // TODO(loc) output matrix should be resized if space becomes an issue, + // else just use first S rows (S = subgraph size) + } GNNPhase layer_phase() { return layer_phase_; } //! Changes this layer's phase diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 2ed988bed0..9d45265afe 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -150,7 +150,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // flip sampling on layers - if (config_.do_sampling()) { + if (config_.do_sampling() || config_.train_minibatch_size()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); } @@ -164,18 +164,30 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph - graph_->SetupNeighborhoodSample(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_); + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for train subgraph for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + num_sampled_layers++; + // XXX resizing of layers } } + // resize layer matrices - train_subgraph_nodes = graph_->ConstructSampledSubgraph(); + // XXX resizing of layers should be done above, not here + train_subgraph_nodes = graph_->ConstructSampledSubgraph(num_sampled_layers); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(train_subgraph_nodes); @@ -191,6 +203,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); + // XXX resizing based on sampled per layer for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(train_subgraph_nodes); @@ -199,7 +212,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { - graph_->SetupNeighborhoodSample(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t 
num_sampled_layers = 0; // work backwards on GCN/SAGE layers @@ -209,14 +225,20 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_); + size_t current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); num_sampled_layers++; } } // resize layer matrices - size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + size_t num_subgraph_nodes = + graph_->ConstructSampledSubgraph(num_sampled_layers); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(num_subgraph_nodes); @@ -245,7 +267,15 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches - graph_->PrepareNextTrainMinibatch(); + size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is for batch is ", + seed_node_count); + + // last layer input size/output rows becomes seed node size + gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + seed_node_count); + size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -254,33 +284,51 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { layer_type == GNNLayerType::kSAGE) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors + size_t current_sample_size; if (config_.do_sampling()) { - graph_->SampleEdges((*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_); + current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); } else { - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_); + current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); } + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + // resize this layer, change seed node count + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, seed_node_count); + seed_node_count = current_sample_size; num_sampled_layers++; } } + // resize layer matrices - size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(num_subgraph_nodes); - } + // size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); + graph_->ConstructSampledSubgraph(num_sampled_layers); + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + + // for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + // layer++) { + // 
(*layer)->ResizeRows(num_subgraph_nodes); + //} const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, "\n"); work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + if (!global_work_left) { break; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 5c159dc816..fff79ea4fe 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -86,6 +86,11 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, // reverse edges partitioned_graph_->ConstructIncomingEdges(); + galois::gInfo(host_prefix_, "Number of local proxies is ", + partitioned_graph_->size()); + galois::gInfo(host_prefix_, "Number of local edges is ", + partitioned_graph_->sizeEdges()); + // read additional graph data ReadLocalLabels(dataset_name, has_single_class_label); ReadLocalFeatures(dataset_name); @@ -750,6 +755,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, bool is_inductive) { subgraph_ = std::make_unique(partitioned_graph_->size()); + sample_node_timestamps_.create(partitioned_graph_->size(), + std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); // this is to hold the *global* degree of a sampled graph; yes, memory wise // this is slightly problematic possibly, but each layer is its own @@ -764,19 +771,25 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } } -void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { +size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { use_subgraph_ = false; bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); // for now, if training node, it goes into seed node - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - } else { - UnsetSampledNode(*x); - } - }); + galois::do_all(galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + } else { + UnsetSampledNode(*x); + } + }); + + // clear node timestamps + std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), + std::numeric_limits::max()); // clear all sampled edges galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), [&](size_t edge_id) { @@ -794,10 +807,28 @@ void galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } bitset_sampled_degrees_.resize(partitioned_graph_->size()); bitset_sampled_degrees_.reset(); + + // Write source = masters + sync_substrate_->sync( + "SampleSync"); + + galois::GAccumulator local_seed_count; + local_seed_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }); + + return local_seed_count.reduce(); } -void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, - bool inductive_subgraph) { 
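// Illustrative sketch, not part of this patch: with the signature changes below,
// the sampling routines return how many local nodes are in the sample after a
// layer's expansion and record the "timestamp" (layer index) at which each node
// first entered the sample. A caller that walks the layers backwards can then use
// those counts to shrink each layer's matrices. The names here are hypothetical,
// but the shape mirrors the minibatch loop added to GraphNeuralNetwork.cpp:
//
//   size_t rows_needed_above = graph.PrepareNextTrainMinibatch(); // seed nodes
//   layers.back()->ResizeInputOutputRows(rows_needed_above, rows_needed_above);
//   size_t depth = 0;
//   for (auto it = layers.rbegin(); it != layers.rend(); ++it) {
//     if (!IsAggregatingLayer(**it)) continue; // only GCN/SAGE layers sample
//     size_t expanded = graph.SampleAllEdges((*it)->graph_user_layer_number(),
//                                            /*inductive_subgraph=*/false,
//                                            depth + 1);
//     // this layer aggregates over `expanded` input rows but only needs to
//     // produce the rows required by the layer above it
//     (*it)->ResizeInputOutputRows(expanded, rows_needed_above);
//     rows_needed_above = expanded;
//     ++depth;
//   }
//   graph.ConstructSampledSubgraph(depth);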
+size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, + bool inductive_subgraph, + size_t timestamp) { use_subgraph_ = false; galois::GAccumulator sampled; @@ -846,11 +877,26 @@ void galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, sync_substrate_ ->sync( "SampleSync"); + + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + return local_sample_count.reduce(); } -void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample, - bool inductive_subgraph) { +size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, + size_t num_to_sample, + bool inductive_subgraph, + size_t timestamp) { assert(!subgraph_is_train_); use_subgraph_ = false; @@ -928,10 +974,26 @@ void galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, sync_substrate_ ->sync( "SampleSync"); + + // count sampled node size + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + return local_sample_count.reduce(); } //! Construct the subgraph from sampled edges and corresponding nodes -size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { +size_t +galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers) { // false first so that the build process can use functions to access the // real graph use_subgraph_ = false; @@ -940,13 +1002,14 @@ size_t galois::graphs::GNNGraph::ConstructSampledSubgraph() { sync_substrate_ ->sync( "SubgraphDegree"); - size_t num_subgraph_nodes = subgraph_->BuildSubgraph(*this); + size_t num_subgraph_nodes = + subgraph_->BuildSubgraph(*this, num_sampled_layers); // after this, this graph is a subgraph use_subgraph_ = true; return num_subgraph_nodes; } -void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { +size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { train_batcher_->GetNextMinibatch(&local_minibatch_mask_); #ifndef NDEBUG size_t count = 0; @@ -960,7 +1023,7 @@ void galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { // galois::gPrint("\n"); galois::gInfo(host_prefix(), "Batched nodes ", count); #endif - SetupNeighborhoodSample(GNNPhase::kBatch); + return SetupNeighborhoodSample(GNNPhase::kBatch); } //////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index a5f6d925ec..67d4b74fd0 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -1,11 +1,11 @@ #include "galois/graphs/GNNGraph.h" #include -size_t -galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { +size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( + GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); - CreateLocalToSubgraphMapping(gnn_graph); + CreateLocalToSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; } @@ -19,7 +19,7 @@ 
galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph(GNNGraph& gnn_graph) { } void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( - const GNNGraph& gnn_graph) { + const GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("LIDToSIDMapping", kRegionName); timer.start(); @@ -27,6 +27,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( // clear all mappings std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping uint32_t current_sid = 0; @@ -35,31 +36,44 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( size_t last_owned_node = *(gnn_graph.end_owned()); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - if (gnn_graph.IsInSampledGraph(local_node_id)) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid - // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } - // all nodes before this SID are master nodes + // all nodes before this SID are master nodes *that matter* + // NOTE: there is a very subtle distinction here implementation wise + // that needs to be resolved in slightly more detail than this subgraph_master_boundary_ = current_sid; for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - if (gnn_graph.IsInSampledGraph(local_node_id)) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid - // galois::gInfo(local_node_id, " maps to ", current_sid); lid_to_subgraph_id_[local_node_id] = current_sid++; } } - galois::gDebug("Number of sampled nodes for subgraph construction is ", - current_sid); + galois::gDebug( + "Number of sampled nodes for subgraph construction layer 0 is ", + current_sid); + + // XXX each sampled layer can be queried in parallel (think prefix sum); do + // this if this becomes a bottleneck + for (size_t i = 1; i < num_sampled_layers + 1; i++) { + for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); + local_node_id++) { + if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { + lid_to_subgraph_id_[local_node_id] = current_sid++; + } + } + galois::gDebug("Number of sampled nodes for subgraph construction, layer ", + i, " is ", current_sid); + } num_subgraph_nodes_ = current_sid; - timer.stop(); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 2ba3aa5ae3..1dabce8476 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -164,6 +164,7 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { + // XXX(loc) check this to make sure it works in subgraph setting size_t num_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 32aa863a0a..3f712df0f7 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -270,6 +270,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { + // 
XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { MaskInputNonMasters(&input_data); } else { @@ -291,7 +292,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // otherwise must mask other galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.output_rows, layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); #ifdef GALOIS_ENABLE_GPU @@ -306,6 +307,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { // aggdata can == p_intemp1; in other words, need to use before overwrite // mask it, then use it + // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { MaskInputNonMasters(&agg_data); } @@ -314,16 +316,17 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { + // XXX output rows gpu_object_.GetWeightGradientsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif - // temp 2 holds aggregated feature vectors from forward phase + // agg data holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, + layer_dimensions_.output_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); #ifdef GALOIS_ENABLE_GPU @@ -349,6 +352,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { // --unmasked-- // disable concat part is here because otherwise it would get done elsewhere + // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { MaskInputNonMasters(&input_data); } else { @@ -367,6 +371,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif + // input col x input row * input row x output col galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, @@ -451,8 +456,16 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*, bool is_backward) { + // aggregation causes a row count change + size_t num_rows_to_handle; + if (!is_backward) { + num_rows_to_handle = layer_dimensions_.output_rows; + } else { + num_rows_to_handle = layer_dimensions_.input_rows; + } + galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(*(graph_.begin()), num_rows_to_handle), [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first @@ -469,10 +482,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t dst = graph_.GetEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { + if (layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { if 
(!graph_.IsEdgeSampled(e, layer_number_)) { @@ -480,7 +491,8 @@ void galois::SAGELayer::AggregateAllCPU( } } } - + size_t dst = graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { @@ -508,10 +520,8 @@ void galois::SAGELayer::AggregateAllCPU( // loop through all destinations to grab the feature to aggregate for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); e++) { - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t dst = graph_.GetInEdgeDest(e); - - if (layer_phase_ == GNNPhase::kTrain) { + if (layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { if (!graph_.IsInEdgeSampled(e, layer_number_)) { @@ -519,6 +529,13 @@ void galois::SAGELayer::AggregateAllCPU( } } } + size_t dst = graph_.GetInEdgeDest(e); + graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); + + // input row x output row in backward means that i shouldn't be + // touching nodes past output rows; the above sample check + // should deal with this where this matters + assert(dst < layer_dimensions_.output_rows); size_t index_to_dst_feature = dst * column_length; @@ -553,6 +570,7 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, timer.start(); #ifdef GALOIS_ENABLE_GPU // TODO self change + // XXX(hochan) output rows if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.UpdateEmbeddingsGPU( layer_dimensions_.input_rows, layer_dimensions_.input_columns, @@ -561,14 +579,14 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, } else { #endif galois::gDebug("Layer ", graph_user_layer_number_, " ", - layer_dimensions_.input_rows, " ", + layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", layer_dimensions_.output_columns); // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -587,10 +605,10 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( } else { #endif // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_2_.data(), output, true); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_2_.data(), output, true); #ifdef GALOIS_ENABLE_GPU } #endif @@ -614,6 +632,7 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #endif // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y + // note input rows is used here due to transpose of aggregation galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, @@ -641,7 +660,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( // difference is Trans for B matrix 
(data) to get z by y (weights is y by z // normally); result is x by y // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, layer_weights_2_.data(), output, true); From faf6b66f54e9e94ea27bffae6b73ef40776c317b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 12:24:41 -0500 Subject: [PATCH 537/660] GNN resize layer affects output; reset all sampled 1) Resize layer changes the output rows. 2) (Un)set sampled nodes needs to apply to all nodes, not just owned nodes. Note that loop can be unconditional unset because seed nodes will always only be owned nodes. (mirror of a seed node on another host will get synchronized) --- libgnn/include/galois/layers/GNNLayer.h | 1 + libgnn/src/graphs/GNNGraph.cpp | 17 +++++++++++------ 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 124d3a80a7..4f5822d1b2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -99,6 +99,7 @@ class GNNLayer { virtual void ResizeRows(size_t new_row_count) { layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.output_rows = new_row_count; // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fff79ea4fe..92dee12a28 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -628,9 +628,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - // GALOIS_LOG_WARN("Sub: {}, Accuracy: {} / {}", use_subgraph_, - // global_correct, - // global_checked); + galois::gDebug("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -786,6 +785,12 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { UnsetSampledNode(*x); } }); + // unsets nodes set in previous iterations; for some reason they get + // synchronized along with everything else even though bitset sample flag + // should prevent it (that, or it's because they don't get sync'd that they + // remain the same) + galois::do_all(galois::iterate(end_owned(), end()), + [&](const NodeIterator& x) { UnsetSampledNode(*x); }); // clear node timestamps std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), @@ -958,9 +963,9 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, galois::steal(), galois::loopname("NeighborhoodSample")); // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, - // " is ", total_nodes.reduce()); - galois::gDebug("Num sampled edges for layer ", sample_layer_num, " is ", - sampled.reduce(), " out of ", total.reduce()); + // " is ", total_nodes.reduce()); + // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", + // sampled.reduce(), " out of ", total.reduce()); std::vector new_nodes = bitset_sample_flag_.getOffsets(); From 9772fb98be02cf5066cb221b7e679694ae1e336e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 18:18:53 -0500 Subject: [PATCH 538/660] distgraphconvert builds again distgraphconvert 
was not updated to keep up with changes to send buffers. This commit fixes that. Made to build again so I could use the node reordering function of it for papers100M. --- .../dist-graph-convert-helpers.cpp | 23 +++++++++---------- .../dist-graph-convert-helpers.h | 22 ++++++++++-------- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/tools/dist-graph-convert/dist-graph-convert-helpers.cpp b/tools/dist-graph-convert/dist-graph-convert-helpers.cpp index 4764598bbf..886103881d 100644 --- a/tools/dist-graph-convert/dist-graph-convert-helpers.cpp +++ b/tools/dist-graph-convert/dist-graph-convert-helpers.cpp @@ -269,7 +269,7 @@ void sendAndReceiveEdgeChunkCounts(std::vector& chunkCounts) { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, chunkCounts); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } // receive chunk counts @@ -279,10 +279,10 @@ void sendAndReceiveEdgeChunkCounts(std::vector& chunkCounts) { for (unsigned h = 0; h < totalNumHosts; h++) { if (h == hostID) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, recvChunkCounts); @@ -416,12 +416,12 @@ uint64_t receiveEdgeCounts() { for (unsigned h = 0; h < totalNumHosts; h++) { if (h == hostID) continue; - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; uint64_t recvCount; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, recvCount); @@ -450,9 +450,8 @@ void receiveAssignedEdges(std::atomic& edgesToReceive, std::vector recvDataVector; while (edgesToReceive) { - decltype( - net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; + rBuffer = net.recieveTagged(galois::runtime::evilPhase); // the buffer will have edge data as well if localsrctodata is // nonempty (it will be nonempty if initialized to non-empty by the @@ -460,7 +459,7 @@ void receiveAssignedEdges(std::atomic& edgesToReceive, // going to send edge data if (rBuffer) { auto& receiveBuffer = rBuffer->second; - while (receiveBuffer.r_size() > 0) { + while (receiveBuffer.size() > 0) { uint64_t src; if (localSrcToData.empty()) { // receive only dest data @@ -514,7 +513,7 @@ std::vector getEdgesPerHost(uint64_t localAssignedEdges) { continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, localAssignedEdges); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } // receive @@ -524,10 +523,10 @@ std::vector getEdgesPerHost(uint64_t localAssignedEdges) { continue; } - decltype(net.recieveTagged(galois::runtime::evilPhase, nullptr)) rBuffer; + decltype(net.recieveTagged(galois::runtime::evilPhase)) rBuffer; uint64_t otherAssignedEdges; do { - rBuffer = net.recieveTagged(galois::runtime::evilPhase, nullptr); + rBuffer = net.recieveTagged(galois::runtime::evilPhase); } while (!rBuffer); galois::runtime::gDeserialize(rBuffer->second, 
otherAssignedEdges); diff --git a/tools/dist-graph-convert/dist-graph-convert-helpers.h b/tools/dist-graph-convert/dist-graph-convert-helpers.h index abf932056c..dc8d2a954a 100644 --- a/tools/dist-graph-convert/dist-graph-convert-helpers.h +++ b/tools/dist-graph-convert/dist-graph-convert-helpers.h @@ -838,7 +838,7 @@ void sendEdgeCounts(const std::vector& hostToNodes, continue; galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, numEdgesPerHost[h].reduce()); - net.sendTagged(h, galois::runtime::evilPhase, b); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); } }; @@ -929,8 +929,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, dstVector.clear(); if (hostSendBuffer.size() > 1400) { net.sendTagged(edgeOwner, galois::runtime::evilPhase, - hostSendBuffer); - hostSendBuffer.getVec().clear(); + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[edgeOwner] = + galois::runtime::SendBuffer(); } } @@ -966,8 +967,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, } if (hostSendBuffer.size() > 0) { - net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer); - hostSendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[h] = galois::runtime::SendBuffer(); } } }, @@ -1049,8 +1051,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, dataVector.clear(); if (hostSendBuffer.size() > 1400) { net.sendTagged(edgeOwner, galois::runtime::evilPhase, - hostSendBuffer); - hostSendBuffer.getVec().clear(); + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[edgeOwner] = + galois::runtime::SendBuffer(); } } @@ -1090,8 +1093,9 @@ void sendAssignedEdges(const std::vector& hostToNodes, } if (hostSendBuffer.size() > 0) { - net.sendTagged(h, galois::runtime::evilPhase, hostSendBuffer); - hostSendBuffer.getVec().clear(); + net.sendTagged(h, galois::runtime::evilPhase, + std::move(hostSendBuffer)); + (*(sendBuffers.getLocal()))[h] = galois::runtime::SendBuffer(); } } }, From b5a79f4025a85af6427e0ee5ecb2630ba7dff1ab Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 11 May 2021 18:29:44 -0500 Subject: [PATCH 539/660] papers100M contiguous remapping related changes 1) Added code to gnngraph that remaps labels, features, and node ids to make train/val/test ranges contiguous. This is mostly useful for partitioning, so that I can partition training nodes evenly, which is key to minibatching. 2) CuSP hardcodes the training range for 100M-remap. 3) Hardcoded the 100M-remap mask ranges in GNNGraph to reduce its reading time. 4) Added a custom binary (non-text) label reader for 100M-remap; all labels should eventually switch to this format in any case. 5) Commented-out code in gnngraph-test that I used to test/do the remapping of 100M.
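For reference, the binary label layout that the new reader expects is simple: a small text file "<dataset>-labels-dims.txt" holding the global node count and the number of label classes, followed by a flat "<dataset>-labels.bin" holding one label per global node ID, in GID order. A minimal, illustrative writer for that layout is sketched below; the helper name and the one-byte label width are assumptions made for the sketch (the reader itself uses the library's GNNLabel type), not code from this patch.

    #include <cstdint>
    #include <fstream>
    #include <string>
    #include <vector>

    // Writes "<prefix>-labels-dims.txt" ("<num_nodes> <num_classes>") and
    // "<prefix>-labels.bin" (labels[gid] for every global node ID, in order),
    // i.e. exactly what ReadLocalLabelsBin below reads back in.
    void WriteLabelsBin(const std::string& prefix, size_t num_classes,
                        const std::vector<uint8_t>& labels) {
      std::ofstream dims(prefix + "-labels-dims.txt");
      dims << labels.size() << " " << num_classes << "\n";

      std::ofstream bin(prefix + "-labels.bin", std::ios::binary | std::ios::out);
      bin.write(reinterpret_cast<const char*>(labels.data()),
                labels.size() * sizeof(uint8_t));
    }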
--- libcusp/include/galois/graphs/NewGeneric.h | 5 + libgnn/include/galois/graphs/GNNGraph.h | 5 + libgnn/src/graphs/GNNGraph.cpp | 247 +++++++++++++++++++-- libgnn/test/gnngraph-test.cpp | 11 + libgnn/test/gpu-convlayer-test.cpp | 4 +- 5 files changed, 257 insertions(+), 15 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 6ece9e2c51..6f13f42737 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -119,6 +119,11 @@ class NewDistGraphGeneric : public DistGraph { // this is entire graph: amazon's mask isn't contiguous bps.push_back(0); bps.push_back(86618); + } else if (filename.find("ogbn-papers100M-remap") != std::string::npos) { + galois::gInfo("papers remap being used"); + // whole graph (non contiguous mask) + bps.push_back(0); + bps.push_back(1207178); } else if (filename.find("ogbn-papers100M") != std::string::npos) { // whole graph (non contiguous mask) bps.push_back(0); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 5ff892057c..f970288718 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -517,6 +517,8 @@ class GNNGraph { } #endif + void ContiguousRemap(const std::string& new_name); + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -526,6 +528,7 @@ class GNNGraph { // Initialization ////////////////////////////////////////////////////////////////////////////// + void ReadLocalLabelsBin(const std::string& dataset_name); //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label); @@ -655,6 +658,8 @@ class GNNGraph { std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; + std::vector node_remapping_; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 92dee12a28..0c1d3b4d8f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -92,7 +92,12 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, partitioned_graph_->sizeEdges()); // read additional graph data - ReadLocalLabels(dataset_name, has_single_class_label); + if (dataset_name != "ogbn-papers100M-remap") { + ReadLocalLabels(dataset_name, has_single_class_label); + } else { + galois::gInfo("Remapped ogbn 100M"); + ReadLocalLabelsBin(dataset_name); + } ReadLocalFeatures(dataset_name); ReadLocalMasks(dataset_name); @@ -256,6 +261,49 @@ void galois::graphs::GNNGraph::AggregateSyncGPU( } } #endif +void galois::graphs::GNNGraph::ReadLocalLabelsBin( + const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + + std::ifstream file_stream; + file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", + std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + file_stream.close(); + + std::string filename = input_directory_ + dataset_name + "-labels.bin"; + std::ifstream file_stream_bin; + file_stream_bin.open(filename, std::ios::binary | std::ios::in); + + std::vector 
all_labels(num_nodes); + // read all labels into a vector + file_stream_bin.read((char*)all_labels.data(), sizeof(GNNLabel) * num_nodes); + + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + + galois::GAccumulator found_local_vertices; + found_local_vertices.reset(); + + // save only local ones; can do in parallel as well + // assumes -1 already dealt with + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; + found_local_vertices += 1; + }); + + size_t fli = found_local_vertices.reduce(); + galois::gInfo(host_prefix_, "Read ", fli, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); +} void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, bool has_single_class_label) { @@ -380,23 +428,25 @@ void galois::graphs::GNNGraph::ReadLocalFeatures( node_feature_length_); // copy over features for local nodes only - size_t num_kept_vertices = 0; - for (size_t gid = 0; gid < num_global_vertices; gid++) { - if (partitioned_graph_->isLocal(gid)) { - // copy over feature vector - std::copy(full_feature_set.get() + gid * node_feature_length_, - full_feature_set.get() + (gid + 1) * node_feature_length_, - &local_node_features_[partitioned_graph_->getLID(gid) * - node_feature_length_]); - num_kept_vertices++; - } - } + galois::GAccumulator num_kept_vertices; + num_kept_vertices.reset(); + galois::do_all( + galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { + if (partitioned_graph_->isLocal(gid)) { + // copy over feature vector + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices += 1; + } + }); full_feature_set.reset(); galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), " features (", local_node_features_.size() * double{4} / (1 << 30), " GB)"); - GALOIS_LOG_ASSERT(num_kept_vertices == partitioned_graph_->size()); + GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); } //! 
Helper function to read masks from file into the appropriate structures @@ -516,6 +566,35 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { local_testing_mask_[partitioned_graph_->getLID(i)] = 1; } } + } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; + global_validation_mask_range_ = { + .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; + global_testing_mask_range_ = { + .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + valid_other_ = FindOtherMask(); + GALOIS_LOG_ASSERT(valid_other_ == 109513177); } else { size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, @@ -1096,3 +1175,145 @@ void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { resize_CUDA_layer_vector(cuda_ctx_, num_layers); } #endif +void galois::graphs::GNNGraph::ContiguousRemap(const std::string& new_name) { + node_remapping_.resize(partitioned_graph_->size()); + + uint32_t new_node_id = 0; + + // serial loops because new ID needs to be kept consistent + // first, train nodes + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Train nodes are from 0 to ", new_node_id); + + // second, val nodes + uint32_t val_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kValidate)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", + new_node_id - val_start, ")"); + + // third, test nodes + uint32_t test_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTest)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", + new_node_id - test_start, ")"); + + // last, everything else + uint32_t other_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kOther)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, "(", + new_node_id - other_start, ")"); + GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); + + // remap features to match new node mapping, save to disk + // std::vector remapped_features(local_node_features_.size()); + //// do all works because can copy in parallel + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // std::memcpy( + // 
&(remapped_features[remap_node_id * node_feature_length_]), + // &((local_node_features_.data())[node_remapping_[remap_node_id] * + // node_feature_length_]), node_feature_length_ * sizeof(GNNFeature)); + // } + //); + //// sanity check + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // for (size_t i = 0; i < node_feature_length_; i++) { + // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * + // node_feature_length_ + i] == + // local_node_features_[node_remapping_[remap_node_id] + // * node_feature_length_ + i]); + // } + // } + //); + //// save to disk + // std::ofstream write_file_stream; + // std::string feature_file = input_directory_ + new_name + "-feats.bin"; + // galois::gPrint(feature_file, "\n"); + // write_file_stream.open(feature_file, std::ios::binary | std::ios::out); + // write_file_stream.write((char*)remapped_features.data(), sizeof(GNNFeature) + // * + // partitioned_graph_->size() + // * node_feature_length_); + // write_file_stream.close(); + + // std::ifstream file_stream; + // file_stream.open(feature_file, std::ios::binary | std::ios::in); + // file_stream.read((char*)remapped_features.data(), sizeof(GNNFloat) * + // partitioned_graph_->size() + // * node_feature_length_); + // file_stream.close(); + //// sanity check again + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // for (size_t i = 0; i < node_feature_length_; i++) { + // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * + // node_feature_length_ + i] == + // local_node_features_[node_remapping_[remap_node_id] + // * node_feature_length_ + i]); + // } + // } + //); + // remapped_features.clear(); + + // std::vector remapped_labels(local_ground_truth_labels_.size()); + //// save new labels order to disk (binary file) + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // remapped_labels[remap_node_id] = + // local_ground_truth_labels_[node_remapping_[remap_node_id]]; + // } + //); + + // std::string label_filename = input_directory_ + new_name + "-labels.bin"; + // std::ofstream label_write_stream; + // label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + // label_write_stream.write((char*)remapped_labels.data(), sizeof(GNNLabel) * + // partitioned_graph_->size()); + // label_write_stream.close(); + + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // remapped_labels[remap_node_id] = + // local_ground_truth_labels_[remap_node_id]; + // } + //); + // ReadLocalLabelsBin(new_name); + // galois::do_all( + // galois::iterate(size_t{0}, partitioned_graph_->size()), + // [&] (size_t remap_node_id) { + // GALOIS_LOG_ASSERT(local_ground_truth_labels_[remap_node_id] == + // remapped_labels[node_remapping_[remap_node_id]]); + // } + //); + + // save the mapping to a binary file for use by graph convert to deal with + // the gr + std::string label_filename = input_directory_ + new_name + "-mapping.bin"; + std::ofstream label_write_stream; + label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + label_write_stream.write((char*)node_remapping_.data(), + sizeof(uint32_t) * node_remapping_.size()); + label_write_stream.close(); +} diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 101540f4d5..6e12b13899 100644 --- a/libgnn/test/gnngraph-test.cpp +++ 
b/libgnn/test/gnngraph-test.cpp @@ -22,5 +22,16 @@ int main() { galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, true); + // below for when I want to check the remapper + // galois::graphs::GNNGraph remapper("ogbn-papers100M", + // galois::graphs::GNNPartitionScheme::kOEC, true); + // remapper.ContiguousRemap("ogbn-papers100M-remap"); + // galois::graphs::GNNGraph remapper("ogbn-papers100M-remap", + // galois::graphs::GNNPartitionScheme::kOEC, true); + + // galois::graphs::GNNGraph remapper("yelp", + // galois::graphs::GNNPartitionScheme::kOEC, true); + // remapper.ContiguousRemap("yelp-remap"); + return 0; } diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 553d96e1a2..3a822cf9c5 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -139,8 +139,8 @@ int main() { // since layer isn't 0 anymore, backward phase will actually return something dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); - const galois::PointerWithSize& - layer_1_backward_output = layer_1->CopyBackwardOutputFromGPU(); + const galois::PointerWithSize& layer_1_backward_output = + layer_1->CopyBackwardOutputFromGPU(); ////////////////////////////////////////////////////////////////////////////// // check that multiplies go as expected From fa608aecf7713741e5de142ad88ec6092a40394d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 14:10:20 -0500 Subject: [PATCH 540/660] GNN subgraph view implementation Adds another form of subgraph creation: views. Does not construct the explicit subgraph but only creates the LID-SID mappings. Issue is that this moves the overhead to the forward/backward aggregation where the program must now loop over a very large number of nodes + do mappings from LID to SID in the background; per minibatch this overhead adds up. --- libgnn/include/galois/graphs/GNNGraph.h | 121 ++++++++++++++---- libgnn/include/galois/graphs/GNNSubgraph.h | 7 +- libgnn/src/graphs/GNNGraph.cpp | 43 +++++-- libgnn/src/graphs/GNNSubgraph.cpp | 137 ++++++++++++--------- 4 files changed, 212 insertions(+), 96 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f970288718..154a4027ff 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -34,12 +34,15 @@ enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; class GNNGraph { public: using GNNDistGraph = galois::graphs::DistGraph; - using WholeGraph = galois::graphs::LC_CSR_Graph; using GraphNode = GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned using NodeIterator = boost::counting_iterator; using EdgeIterator = GNNDistGraph::edge_iterator; + // using GNNEdgeSortIterator = internal::EdgeSortIterator, + // galois::LargeArray>>; + GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label); //! Loads a graph and all relevant metadata (labels, features, masks, etc.) @@ -68,7 +71,7 @@ class GNNGraph { size_t size() const { return partitioned_graph_->size(); } //! Returns # of nodes in the *graph that is currently active*. size_t active_size() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->size(); } else { return subgraph_->size(); @@ -81,7 +84,7 @@ class GNNGraph { //! 
Node begin for all local nodes NodeIterator begin() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->allNodesRange().begin(); } else { return subgraph_->begin(); @@ -89,7 +92,7 @@ class GNNGraph { } //! Node end for all local nodes NodeIterator end() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->allNodesRange().end(); } else { return subgraph_->end(); @@ -97,7 +100,7 @@ class GNNGraph { } NodeIterator begin_owned() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->masterNodesRange().begin(); } else { return subgraph_->begin_owned(); @@ -105,7 +108,7 @@ class GNNGraph { } NodeIterator end_owned() const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->masterNodesRange().end(); } else { return subgraph_->end_owned(); @@ -126,32 +129,46 @@ class GNNGraph { // All following functions take a local node id EdgeIterator edge_begin(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edge_begin(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->edge_begin(ConvertToLID(n)); } else { return subgraph_->edge_begin(n); } }; EdgeIterator edge_end(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edge_end(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->edge_end(ConvertToLID(n)); } else { return subgraph_->edge_end(n); } }; GraphNode GetEdgeDest(EdgeIterator ei) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->getEdgeDst(ei); + } else if (use_subgraph_view_) { + // WARNING: this may return max of uint32 if the edge destination doesn't + // exist in the subgraph view + // get edge dest should NOT be called in that case + GraphNode rv = ConvertToSID(partitioned_graph_->getEdgeDst(ei)); + assert(rv != std::numeric_limits::max()); + return rv; } else { return subgraph_->GetEdgeDest(ei); } }; + galois::runtime::iterable< galois::NoDerefIterator> edges(GraphNode N) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edges(N); + } else if (use_subgraph_view_) { + return partitioned_graph_->edges(ConvertToLID(N)); } else { return subgraph_->edges(N); } @@ -166,14 +183,16 @@ class GNNGraph { } bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { + // view uses original graph edge iterators return edge_sample_status_[ei][layer_num]; } else { - GALOIS_LOG_FATAL("This shouldn't be called with subgraph"); + return subgraph_->OutEdgeSampled(ei, layer_num, *this); return false; } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { + // view uses original graph edge iterators return edge_sample_status_[*ei][layer_num]; } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); @@ -193,19 +212,32 @@ class GNNGraph { edge_sample_status_[*ei][layer_num] = 0; }; + // GNNEdgeSortIterator EdgeSortBegin(GraphNode n) { + // return GNNEdgeSortIterator(*edge_begin(n), + // partitioned_graph_->edge_dst_ptr_LA(), &edge_sample_status_); + //} + // GNNEdgeSortIterator EdgeSortEnd(GraphNode n) { + // return GNNEdgeSortIterator(*edge_begin(n), + // partitioned_graph_->edge_dst_ptr_LA(), &edge_sample_status_); + //} + 
////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// EdgeIterator in_edge_begin(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_begin(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edge_begin(ConvertToLID(n)); } else { return subgraph_->in_edge_begin(n); } } EdgeIterator in_edge_end(GraphNode n) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_end(n); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edge_end(ConvertToLID(n)); } else { return subgraph_->in_edge_end(n); } @@ -213,15 +245,22 @@ class GNNGraph { galois::runtime::iterable< galois::NoDerefIterator> in_edges(GraphNode N) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edges(N); + } else if (use_subgraph_view_) { + return partitioned_graph_->in_edges(ConvertToLID(N)); } else { return subgraph_->in_edges(N); } } GraphNode GetInEdgeDest(EdgeIterator ei) const { - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { + return partitioned_graph_->GetInEdgeDest(ei); + } else if (use_subgraph_view_) { return partitioned_graph_->GetInEdgeDest(ei); + GraphNode rv = ConvertToSID(partitioned_graph_->GetInEdgeDest(ei)); + assert(rv != std::numeric_limits::max()); + return rv; } else { return subgraph_->GetInEdgeDest(ei); } @@ -241,6 +280,7 @@ class GNNGraph { }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { + // view can use this fine + requires it return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] [layer_num]; } else { @@ -274,20 +314,28 @@ class GNNGraph { size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp); + size_t ConstructSampledSubgraph(size_t num_sampled_layers) { + return ConstructSampledSubgraph(num_sampled_layers, false); + }; //! Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(size_t num_sampled_layers); + size_t ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view); unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; } void EnableSubgraph() { use_subgraph_ = true; } - void DisableSubgraph() { use_subgraph_ = false; } + void EnableSubgraphView() { use_subgraph_view_ = true; } + void DisableSubgraph() { + use_subgraph_ = false; + use_subgraph_view_ = false; + } bool IsSubgraphOn() const { return use_subgraph_; } + bool IsSubgraphViewOn() const { return use_subgraph_view_; } //! Converts an id to an lid for the graph if subgraphs are in use uint32_t ConvertToLID(GraphNode sid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->SIDToLID(sid); } else { return sid; @@ -295,7 +343,7 @@ class GNNGraph { } //! Converts an LID to an SID if subgraphs are in use uint32_t ConvertToSID(GraphNode lid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->LIDToSID(lid); } else { return lid; @@ -303,7 +351,7 @@ class GNNGraph { } //! 
Converts SID to GID if subgraphs in use (else just return GID) uint32_t SIDToGID(GraphNode sid) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return GetGID(subgraph_->SIDToLID(sid)); } else { return GetGID(sid); @@ -312,13 +360,34 @@ class GNNGraph { //! Returns a pointer to the LID to SID map from the subgraph if subgraphs //! are in use galois::LargeArray* GetLIDToSIDPointer() { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { return subgraph_->GetLIDToSIDPointer(); } else { return nullptr; } } + // void SortAllInEdgesBySID() { + // // check it out for node 0 + // //for (auto iter : in_edges(0)) { + // // galois::gInfo("0 to ", GetInEdgeDest(*iter), " with in out edge map ", + // *InEdgeToOutEdge(iter), " SID ", + // subgraph_->LIDToSID(GetInEdgeDest(*iter))); + // //} + // //galois::gInfo("Starting sort"); + // galois::StatTimer t("SortBySID"); + // t.start(); + // partitioned_graph_->SortAllInEdgesBySID(*(subgraph_->GetLIDToSIDPointer())); + // t.stop(); + // galois::gInfo("sort took ", t.get()); + // //galois::gInfo("End Sort"); + // //for (auto iter : in_edges(0)) { + // // galois::gInfo("0 to ", GetInEdgeDest(*iter), " with in out edge map ", + // *InEdgeToOutEdge(iter), " SID ", + // subgraph_->LIDToSID(GetInEdgeDest(*iter))); + // //} + //} + ////////////////////////////////////////////////////////////////////////////// void SetupTrainBatcher(size_t train_batch_size) { if (train_batcher_) { @@ -364,7 +433,7 @@ class GNNGraph { //! Get degree norm of subgraph for particular layer (i.e. includes training) GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { size_t degree; if (!subgraph_is_train_) { // case because degrees in each layer differ @@ -373,6 +442,7 @@ class GNNGraph { } else { degree = global_train_degrees_[subgraph_->SIDToLID(n)]; } + if (degree) { return 1.0 / degree; } else { @@ -394,7 +464,7 @@ class GNNGraph { GNNFloat GetSingleClassLabel(const unsigned lid) const { assert(using_single_class_labels_); unsigned to_use = lid; - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { to_use = subgraph_->SIDToLID(lid); } @@ -424,7 +494,7 @@ class GNNGraph { local_node_features_.size()); } #endif - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { return PointerWithSize(local_node_features_); } else { return PointerWithSize(subgraph_->GetLocalFeatures().data(), @@ -440,7 +510,7 @@ class GNNGraph { // XXX maybe just map this all over to subgraph, though in that case // issue is that subgraph doesn't necessarily know about test/val unsigned to_use = lid; - if (use_subgraph_) { + if (use_subgraph_ || use_subgraph_view_) { to_use = subgraph_->SIDToLID(lid); } // re: phase checks in this if: ranges are not used for these @@ -653,6 +723,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; + bool use_subgraph_view_{false}; bool subgraph_is_train_{false}; std::unique_ptr train_batcher_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 29b4429e17..0a7f2670c7 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -17,6 +17,8 @@ class GNNSubgraph { //! 
for the sampled bits size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); + galois::gstl::Vector& GetLocalFeatures() { return subgraph_node_features_; } @@ -99,8 +101,9 @@ class GNNSubgraph { private: //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateLocalToSubgraphMapping(const GNNGraph& gnn_graph, - size_t num_sampled_layers); + void CreateSubgraphMapping(const GNNGraph& gnn_graph, + size_t num_sampled_layers); + //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! Creates edges diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0c1d3b4d8f..0a8a29ad0e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -204,7 +204,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, bool is_backward) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; - if (!use_subgraph_) { + if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { sync_substrate_ @@ -594,7 +594,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } valid_other_ = FindOtherMask(); - GALOIS_LOG_ASSERT(valid_other_ == 109513177); + GALOIS_LOG_ASSERT(valid_other_ <= 109513177); } else { size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", &global_training_mask_range_, @@ -707,8 +707,8 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); - galois::gDebug("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); + GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, + global_checked); return static_cast(global_correct) / static_cast(global_checked); @@ -850,7 +850,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; + bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); @@ -913,7 +915,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, size_t timestamp) { - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; galois::GAccumulator sampled; galois::GAccumulator total; @@ -982,7 +985,8 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bool inductive_subgraph, size_t timestamp) { assert(!subgraph_is_train_); - use_subgraph_ = false; + use_subgraph_ = false; + use_subgraph_view_ = false; galois::GAccumulator sampled; galois::GAccumulator total; @@ -1077,19 +1081,36 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, //! 
Construct the subgraph from sampled edges and corresponding nodes size_t -galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers) { +galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, + bool use_view) { // false first so that the build process can use functions to access the // real graph use_subgraph_ = false; + use_subgraph_view_ = false; gnn_sampled_out_degrees_ = &sampled_out_degrees_; + // first, sync the degres of the sampled edges across all hosts sync_substrate_ ->sync( "SubgraphDegree"); - size_t num_subgraph_nodes = - subgraph_->BuildSubgraph(*this, num_sampled_layers); + size_t num_subgraph_nodes; + // use_view = true; + if (!use_view) { + num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); + } else { + // a view only has lid<->sid mappings + num_subgraph_nodes = + subgraph_->BuildSubgraphView(*this, num_sampled_layers); + //SortAllInEdgesBySID(); + } + // after this, this graph is a subgraph - use_subgraph_ = true; + if (!use_view) { + use_subgraph_ = true; + } else { + use_subgraph_view_ = true; + } + return num_subgraph_nodes; } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 67d4b74fd0..f5bde956f2 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -5,7 +5,7 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); timer.start(); - CreateLocalToSubgraphMapping(gnn_graph, num_sampled_layers); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; } @@ -18,9 +18,19 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( return num_subgraph_nodes_; } -void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( +size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( + GNNGraph& gnn_graph, size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraphView", kRegionName); + timer.start(); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + NodeFeatureCreation(gnn_graph); + timer.stop(); + return num_subgraph_nodes_; +} + +void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( const GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("LIDToSIDMapping", kRegionName); + galois::StatTimer timer("SIDMapping", kRegionName); timer.start(); assert(gnn_graph.size() == lid_to_subgraph_id_.size()); @@ -28,6 +38,17 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + galois::GAccumulator subgraph_count; + subgraph_count.reset(); + galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsInSampledGraph(node_id)) { + subgraph_count += 1; + } + }); + num_subgraph_nodes_ = subgraph_count.reduce(); + subgraph_id_to_lid_.resize(num_subgraph_nodes_, 0); + // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping uint32_t current_sid = 0; @@ -39,13 +60,18 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + subgraph_id_to_lid_[current_sid] = local_node_id; 
lid_to_subgraph_id_[local_node_id] = current_sid++; } } // all nodes before this SID are master nodes *that matter* // NOTE: there is a very subtle distinction here implementation wise - // that needs to be resolved in slightly more detail than this + // that needs to be resolved in slightly more detail than this; + // there may be master nodes that are past this boundary that will + // not be covered by this begin_owned loop, which may cause problems down + // the line + // TODO(loc) see above subgraph_master_boundary_ = current_sid; for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); @@ -53,6 +79,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { // TODO should bound check the SID to max uint32_t // note: if SID is max uint32t, then it's not valid + subgraph_id_to_lid_[current_sid] = local_node_id; lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -66,6 +93,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); local_node_id++) { if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { + subgraph_id_to_lid_[current_sid] = local_node_id; lid_to_subgraph_id_[local_node_id] = current_sid++; } } @@ -73,7 +101,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateLocalToSubgraphMapping( i, " is ", current_sid); } - num_subgraph_nodes_ = current_sid; + GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); + // num_subgraph_nodes_ = current_sid; timer.stop(); } @@ -83,35 +112,30 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( galois::StatTimer timer("DegreeCounting", kRegionName); timer.start(); - subgraph_id_to_lid_.resize(num_subgraph_nodes_); local_subgraph_out_degrees_.resize(num_subgraph_nodes_); local_subgraph_in_degrees_.resize(num_subgraph_nodes_); galois::do_all( - galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsInSampledGraph(node_id)) { - uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - subgraph_id_to_lid_[subgraph_id] = node_id; - - uint32_t out_degrees = 0; - for (auto out_edge_iter : gnn_graph.edges(node_id)) { - if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { - out_degrees++; - } + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; } - local_subgraph_out_degrees_[subgraph_id] = out_degrees; + } + local_subgraph_out_degrees_[subgraph_id] = out_degrees; - uint32_t in_degrees = 0; - for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { - if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { - in_degrees++; - } + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; } - local_subgraph_in_degrees_[subgraph_id] = in_degrees; - // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", - // out_degrees, " in ", in_degrees); } + local_subgraph_in_degrees_[subgraph_id] = in_degrees; + // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", + // out_degrees, " in ", in_degrees); }, galois::steal()); @@ -147,43 +171,40 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( // save edges + save reference to layer sample status galois::do_all( - 
galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsInSampledGraph(node_id)) { - uint32_t subgraph_id = lid_to_subgraph_id_[node_id]; - assert(subgraph_id != std::numeric_limits::max()); - uint32_t out_location = 0; - uint32_t in_location = 0; - if (subgraph_id != 0) { - out_location = local_subgraph_out_degrees_[subgraph_id - 1]; - in_location = local_subgraph_in_degrees_[subgraph_id - 1]; - } + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + assert(subgraph_id != std::numeric_limits::max()); + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert(lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); + subedge_to_original_edge_[out_location] = *out_edge_iter; - for (auto out_edge_iter : gnn_graph.edges(node_id)) { - if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { - assert( - lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != - std::numeric_limits::max()); - subedge_to_original_edge_[out_location] = *out_edge_iter; - - underlying_graph_.constructEdge( - out_location++, - lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); - } + underlying_graph_.constructEdge( + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); } + } - for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { - if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { - in_subedge_to_original_edge_[in_location] = - *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); - underlying_graph_.ConstructInEdge( - in_location++, - lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); - } + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); } - assert(out_location == local_subgraph_out_degrees_[subgraph_id]); - assert(in_location == local_subgraph_in_degrees_[subgraph_id]); } + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, galois::steal()); timer.stop(); From 27826c21232b4d27a13d9720fcd716aac81f9497 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 16:19:51 -0500 Subject: [PATCH 541/660] Training subgraph resizes rows in GNN; degree norm 1) resize gnn layer row counts to reduce linear xform cost 2) training subgraph uses global degrees now since it will take all edges and not just training nodes --- libgnn/include/galois/graphs/GNNGraph.h | 4 ++- libgnn/src/GraphNeuralNetwork.cpp | 38 +++++++++++++++---------- 2 files changed, 26 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 154a4027ff..f78ab15bfc 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -440,7 +440,9 @@ class GNNGraph { degree = sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; } else { - degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + // XXX if inductive + // degree = 
global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(n)]; } if (degree) { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 9d45265afe..ce2a111af8 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -160,14 +160,16 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; - size_t train_subgraph_nodes = 0; + std::vector subgraph_layer_sizes; // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { // Setup the subgraph to only be the training graph size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + subgraph_layer_sizes.emplace_back(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -180,18 +182,15 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "Number of local nodes for train subgraph for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + // resizing + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, local_seed_node_count); + local_seed_node_count = current_sample_size; + subgraph_layer_sizes.emplace_back(local_seed_node_count); num_sampled_layers++; - // XXX resizing of layers } } - - // resize layer matrices - // XXX resizing of layers should be done above, not here - train_subgraph_nodes = graph_->ConstructSampledSubgraph(num_sampled_layers); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(train_subgraph_nodes); - } + graph_->ConstructSampledSubgraph(num_sampled_layers); } galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); @@ -203,10 +202,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); - // XXX resizing based on sampled per layer - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(train_subgraph_nodes); + size_t l_count = 0; + gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + (*back_iter) + ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + subgraph_layer_sizes[l_count]); + l_count++; + } } } @@ -354,7 +361,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); - // TODO only do this when necessary + // XXX test batching for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(graph_->size()); @@ -415,6 +422,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // check test accuracy + // XXX test batching galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); test_timer.start(); SetLayerPhases(galois::GNNPhase::kTest); From 
9a016eb97568a444b8976da459bbff130c88f509 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 16:34:37 -0500 Subject: [PATCH 542/660] row change optimization for full batch sampling --- libgnn/src/GraphNeuralNetwork.cpp | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ce2a111af8..c1b51e757c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -220,9 +220,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); - size_t num_sampled_layers = 0; // work backwards on GCN/SAGE layers @@ -240,16 +240,16 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "Number of local nodes for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, + local_seed_node_count); + local_seed_node_count = current_sample_size; num_sampled_layers++; } } // resize layer matrices - size_t num_subgraph_nodes = - graph_->ConstructSampledSubgraph(num_sampled_layers); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - (*layer)->ResizeRows(num_subgraph_nodes); - } + graph_->ConstructSampledSubgraph(num_sampled_layers); } if (!config_.train_minibatch_size()) { @@ -315,16 +315,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices - // size_t num_subgraph_nodes = graph_->ConstructSampledSubgraph(); graph_->ConstructSampledSubgraph(num_sampled_layers); // XXX resizes above only work for SAGE layers; will break if other // layers are tested - // for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - // layer++) { - // (*layer)->ResizeRows(num_subgraph_nodes); - //} - const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); @@ -336,6 +330,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { ": Train accuracy/F1 micro is ", train_accuracy, " time ", batch_timer.get(), "\n"); + // XXX mid batch test accuracy checking? + if (!global_work_left) { break; } From a39868cb25aceb14d43fb7d622e229a0a6b66df1 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 13 May 2021 19:06:03 -0500 Subject: [PATCH 543/660] Test batching Adds test batching capabilities to GNNs that allow test accuracy to be evaluated in chunks (reason is that maybe it's not possible to evaluate entire graph at once due to memory concerns). 
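A minimal sketch of the chunked evaluation this enables, under assumed, illustrative names (EvaluateChunk and ChunkedAccuracy below are not part of the libgnn API): each test minibatch contributes a (correct, checked) pair, and the reported accuracy is the ratio of the accumulated sums, so only one chunk's worth of predictions ever needs to be materialized.

    #include <algorithm>
    #include <cstddef>
    #include <utility>
    #include <vector>

    // One test chunk: count how many predictions in [begin, end) match labels.
    std::pair<std::size_t, std::size_t>
    EvaluateChunk(const std::vector<int>& preds, const std::vector<int>& labels,
                  std::size_t begin, std::size_t end) {
      std::size_t correct = 0;
      for (std::size_t i = begin; i < end; i++) {
        if (preds[i] == labels[i]) {
          correct++;
        }
      }
      return {correct, end - begin};
    }

    // Accuracy over the whole test set, one chunk at a time; only the running
    // (correct, checked) counters persist across chunks.
    float ChunkedAccuracy(const std::vector<int>& preds,
                          const std::vector<int>& labels,
                          std::size_t chunk_size) {
      if (chunk_size == 0) {
        return 0.0f;
      }
      std::size_t correct = 0;
      std::size_t checked = 0;
      for (std::size_t start = 0; start < preds.size(); start += chunk_size) {
        std::size_t stop = std::min(start + chunk_size, preds.size());
        auto part = EvaluateChunk(preds, labels, start, stop);
        correct += part.first;
        checked += part.second;
      }
      return checked ? static_cast<float>(correct) / checked : 0.0f;
    }

Accumulating counts rather than whole prediction matrices is what keeps peak memory bounded by a single chunk, which is the memory concern stated above.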
--- libgnn/include/galois/GraphNeuralNetwork.h | 4 + libgnn/include/galois/graphs/GNNGraph.h | 22 ++++ libgnn/src/GraphNeuralNetwork.cpp | 126 +++++++++++++++++++-- libgnn/src/graphs/GNNGraph.cpp | 54 +++++++-- lonestar/libgnnbench/src/Input.cpp | 6 + 5 files changed, 191 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fe1cb17477..712be6a8ec 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -93,6 +93,7 @@ class GraphNeuralNetworkConfig { bool do_sampling() const { return do_sampling_; } unsigned train_minibatch_size() const { return train_minibatch_size_; } + unsigned test_minibatch_size() const { return test_minibatch_size_; } //! Get the default layer config of layers in this GNN const GNNLayerConfig& default_layer_config() const { @@ -112,6 +113,7 @@ class GraphNeuralNetworkConfig { //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; unsigned train_minibatch_size_{0}; + unsigned test_minibatch_size_{0}; //! Fan out used for sampling (if sampling is enabled) std::vector fan_out_vector_; @@ -173,6 +175,8 @@ class GraphNeuralNetwork { //! Returns the output layer galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + float MinibatchedTesting(); + //! Do training for a specified # of epochs and return test accuracy at the //! end of it float Train(size_t num_epochs); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f78ab15bfc..c3bc396551 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -406,6 +406,25 @@ class GNNGraph { size_t PrepareNextTrainMinibatch(); //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + + ////////////////////////////////////////////////////////////////////////////// + + void SetupTestBatcher(size_t test_batch_size) { + if (test_batcher_) { + // clear before remake + test_batcher_.reset(); + } + test_batcher_ = std::make_unique( + local_testing_mask_, test_batch_size, *end_owned()); + local_minibatch_mask_.resize(partitioned_graph_->size()); + } + void ResetTestMinibatcher() { test_batcher_->ResetMinibatchState(); } + //! Setup the state for the next minibatch sampling call by using the + //! minibatcher to pick up the next set batch of nodes + size_t PrepareNextTestMinibatch(); + //! Returns true if there are still more minibatches in this graph + bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; + ////////////////////////////////////////////////////////////////////////////// GNNFloat GetGCNNormFactor(GraphNode lid) const { if (global_degrees_[lid]) { @@ -461,6 +480,9 @@ class GNNGraph { float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, bool sampling); + std::pair + GetBatchAccuracy(PointerWithSize predictions); + //! Returns the ground truth label of some local id assuming labels are single //! class labels. 
GNNFloat GetSingleClassLabel(const unsigned lid) const { diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index c1b51e757c..ab1d89e066 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -103,9 +103,8 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } } - // XXX test minibatch if (config_.do_sampling() || config_.use_train_subgraph_ || - config.train_minibatch_size()) { + config.train_minibatch_size() || config.test_minibatch_size()) { // output layer not included; it will never involve sampling graph_->InitializeSamplingData(num_graph_user_layers_, config_.use_train_subgraph_); @@ -114,7 +113,9 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( if (config_.train_minibatch_size()) { graph_->SetupTrainBatcher(config_.train_minibatch_size()); } - // XXX test minibatch size + if (config_.test_minibatch_size()) { + graph_->SetupTestBatcher(config_.test_minibatch_size()); + } // create the output layer GNNLayerDimensions output_dims = { @@ -150,13 +151,70 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } // flip sampling on layers - if (config_.do_sampling() || config_.train_minibatch_size()) { + if (config_.use_train_subgraph_ || config_.do_sampling() || + config_.train_minibatch_size()) { for (std::unique_ptr& ptr : gnn_layers_) { ptr->EnableSampling(); } } } +float galois::GraphNeuralNetwork::MinibatchedTesting() { + galois::gDebug("minibatched testing"); + graph_->ResetTestMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + uint32_t correct = 0; + uint32_t total = 0; + while (true) { + work_left_.reset(); + size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + // last layer input size/output rows becomes seed node size + gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + size_t current_sample_size; + current_sample_size = + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), + false, num_sampled_layers + 1); + // resize this layer, change seed node count + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, seed_node_count); + seed_node_count = current_sample_size; + num_sampled_layers++; + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + } + } + + // resize layer matrices + graph_->ConstructSampledSubgraph(num_sampled_layers); + + const PointerWithSize batch_pred = DoInference(); + std::pair correct_total = + graph_->GetBatchAccuracy(batch_pred); + + correct += correct_total.first; + total += correct_total.second; + + work_left_ += graph_->MoreTestMinibatches(); + char global_work_left = work_left_.reduce(); + if (!global_work_left) { + break; + } + } + + galois::gDebug("correct / total ", correct, " ", total); + + return (1.0 * correct) / (1.0 * total); +} + float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; @@ -357,7 +415,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // disable subgraph graph_->DisableSubgraph(); - // XXX test batching for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { 
(*layer)->ResizeRows(graph_->size()); @@ -383,11 +440,19 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_test) { epoch_test_timer.start(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize test_pred = DoInference(); - epoch_test_timer.stop(); + float test_acc; + + if (!config_.test_minibatch_size()) { + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + epoch_test_timer.stop(); + + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + epoch_test_timer.stop(); + } - float test_acc = GetGlobalAccuracy(test_pred); if (this_host == 0) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); const std::string test_name_acc = @@ -404,6 +469,35 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + + // TODO too much code dupe + // Resconstruct the train subgraph since it was replaced by test subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && + config_.test_minibatch_size() && do_test) { + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + // resizing + (*back_iter) + ->ResizeInputOutputRows(current_sample_size, + local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + graph_->ConstructSampledSubgraph(num_sampled_layers); + } } } @@ -420,10 +514,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // check test accuracy // XXX test batching galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + float global_accuracy; + test_timer.start(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize predictions = DoInference(); - float global_accuracy = GetGlobalAccuracy(predictions); + + if (!config_.test_minibatch_size()) { + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize predictions = DoInference(); + global_accuracy = GetGlobalAccuracy(predictions); + } else { + global_accuracy = MinibatchedTesting(); + } + test_timer.stop(); if (this_host == 0) { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0a8a29ad0e..fca2c78cfd 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -713,6 +713,36 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( return static_cast(global_correct) / static_cast(global_checked); } +std::pair galois::graphs::GNNGraph::GetBatchAccuracy( + PointerWithSize predictions) { + // check owned nodes' accuracy + assert((num_label_classes_ * size()) == predictions.size()); + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + 
[&](const unsigned node_id) { + if (IsValidForPhase(node_id, GNNPhase::kBatch)) { + total_checked_ += 1; + size_t predicted_label = galois::MaxIndex( + num_label_classes_, &(predictions[node_id * num_label_classes_])); + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return std::make_pair(global_correct, global_checked); +} float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( PointerWithSize predictions, GNNPhase phase, bool sampling) { @@ -918,10 +948,10 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, use_subgraph_ = false; use_subgraph_view_ = false; - galois::GAccumulator sampled; - galois::GAccumulator total; - sampled.reset(); - total.reset(); + // galois::GAccumulator sampled; + // galois::GAccumulator total; + // sampled.reset(); + // total.reset(); galois::do_all( galois::iterate(begin(), end()), @@ -930,7 +960,7 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(src_iter)) { // marks ALL edges of nodes that connect to train/other nodes for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - total += 1; + // total += 1; if (inductive_subgraph) { if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), GNNPhase::kTrain) && @@ -945,14 +975,15 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - sampled += 1; + // sampled += 1; } } }, galois::steal(), galois::loopname("ChooseAllEdges")); - galois::gPrint("Num sampled edges in inductive graph is ", sampled.reduce(), - " out of ", total.reduce(), "\n"); + // galois::gPrint("Num sampled edges in inductive graph is ", + // sampled.reduce(), + // " out of ", total.reduce(), "\n"); std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can @@ -1101,7 +1132,7 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, // a view only has lid<->sid mappings num_subgraph_nodes = subgraph_->BuildSubgraphView(*this, num_sampled_layers); - //SortAllInEdgesBySID(); + // SortAllInEdgesBySID(); } // after this, this graph is a subgraph @@ -1131,6 +1162,11 @@ size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { return SetupNeighborhoodSample(GNNPhase::kBatch); } +size_t galois::graphs::GNNGraph::PrepareNextTestMinibatch() { + test_batcher_->GetNextMinibatch(&local_minibatch_mask_); + return SetupNeighborhoodSample(GNNPhase::kBatch); +} + //////////////////////////////////////////////////////////////////////////////// #ifdef GALOIS_ENABLE_GPU diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d15adf2d9f..4b7717eac3 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -132,6 +132,11 @@ llvm::cl::opt cll::desc("Size of training minibatch (default 0)"), cll::init(0)); +llvm::cl::opt + test_minibatch_size("testMinibatchSize", + cll::desc("Size of test minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -306,6 +311,7 @@ std::unique_ptr InitializeGraphNeuralNetwork() { gnn_config.validation_interval_ = val_interval; 
gnn_config.test_interval_ = test_interval; gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; gnn_config.inductive_subgraph_ = inductive_subgraph; gnn_config.fan_out_vector_ = CreateFanOutVector(); From 8ff7cfc7e3a749190092944dd6b6dad9b3903a65 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 14 May 2021 22:38:22 -0500 Subject: [PATCH 544/660] Minibatch test interval For minibatching, allows testing at a particular step of minibatches so that you can evaluate accuracy in the middle of a minibatch epoch. --- libgnn/include/galois/GraphNeuralNetwork.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 42 +++++++++++++++++++++- lonestar/libgnnbench/src/Input.cpp | 20 +++++++---- 3 files changed, 55 insertions(+), 8 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 712be6a8ec..af1955d258 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -112,6 +112,7 @@ class GraphNeuralNetworkConfig { unsigned validation_interval_{0}; //! Interval to run testing set on network at; 0 = no run unsigned test_interval_{0}; + unsigned minibatch_test_interval_{10}; unsigned train_minibatch_size_{0}; unsigned test_minibatch_size_{0}; //! Fan out used for sampling (if sampling is enabled) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ab1d89e066..fbaad4ff4d 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -384,11 +384,51 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); + epoch_timer.stop(); galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, ": Train accuracy/F1 micro is ", train_accuracy, " time ", batch_timer.get(), "\n"); - // XXX mid batch test accuracy checking? + bool test_eval = + config_.minibatch_test_interval_ + ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 + : false; + + if (test_eval) { + float test_acc; + if (!config_.test_minibatch_size()) { + graph_->DisableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + (*layer)->ResizeRows(graph_->size()); + } + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1) + "Accuracy"; + galois::runtime::reportStat_Single("GraphNeuralNetwork", + test_name_acc, test_acc); + } + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + "GraphNeuralNetwork", + "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + } + + epoch_timer.start(); if (!global_work_left) { break; diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 4b7717eac3..5facfa95c5 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -137,6 +137,11 @@ llvm::cl::opt cll::desc("Size of test minibatch (default 0)"), cll::init(0)); +llvm::cl::opt minibatch_test_interval( + "minibatchTestInterval", + cll::desc("Size of test intervals for minibatch (default 0)"), + cll::init(0)); + llvm::cl::opt val_interval("valInterval", cll::desc("# of epochs to test validation set (default 0)"), @@ -307,13 +312,14 @@ std::unique_ptr InitializeGraphNeuralNetwork() { galois::GraphNeuralNetworkConfig gnn_config( num_layers, layer_types, layer_sizes_vector, output_layer_type, do_graph_sampling, layer_config); - gnn_config.use_train_subgraph_ = use_train_subgraph; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; - gnn_config.train_minibatch_size_ = train_minibatch_size; - gnn_config.test_minibatch_size_ = test_minibatch_size; - gnn_config.inductive_subgraph_ = inductive_subgraph; - gnn_config.fan_out_vector_ = CreateFanOutVector(); + gnn_config.use_train_subgraph_ = use_train_subgraph; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; + gnn_config.minibatch_test_interval_ = minibatch_test_interval; + gnn_config.inductive_subgraph_ = inductive_subgraph; + gnn_config.fan_out_vector_ = CreateFanOutVector(); // optimizer std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); From 6ab5a9e2298cb8bdde101d898d6edc751e6e2bf7 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 17 May 2021 22:53:37 -0500 Subject: [PATCH 545/660] MKL microbenchmark 3 variants: no galois, init galois shared, and init galois dist --- libgnn/test/CMakeLists.txt | 17 +++++++ libgnn/test/mkl_micro.cpp | 98 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+) create mode 100644 libgnn/test/mkl_micro.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 11c7ab78b8..3a7fec8729 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,3 +1,20 @@ +add_executable(mkl_micro mkl_micro.cpp) 
+target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) +target_include_directories(mkl_micro PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(mkl_micro ${INTEL_LIBS}) + +add_executable(mkl_micro_sgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_sgalois galois_gnn) +target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(mkl_micro_dgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_dgalois galois_gnn) +target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +################################################################################ + add_executable(gnngraph-test gnngraph-test.cpp) target_link_libraries(gnngraph-test galois_gnn) add_test(NAME gnngraph-test COMMAND gnngraph-test) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp new file mode 100644 index 0000000000..e784b2cde0 --- /dev/null +++ b/libgnn/test/mkl_micro.cpp @@ -0,0 +1,98 @@ +#include +#include +#include +#include + +#ifdef USE_SHARED_GALOIS +#include "galois/Galois.h" +#endif +#ifdef USE_DIST_GALOIS +#include "galois/DistGalois.h" +#endif + +// MKL wrapper +void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? output_columns : input_columns; + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + input_columns, 1.0, a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, output, output_columns); +} + +void CacheFlush(std::vector* matrix) { + for (size_t i = 0; i < matrix->size(); i++) { + (*matrix)[i] = i; + } +} + +int main(int argc, char* argv[]) { +#ifdef USE_SHARED_GALOIS + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); +#endif + +#ifdef USE_DIST_GALOIS + galois::DistMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Dist Mem with %u threads\n", + galois::getActiveThreads()); +#endif + + printf("%d %s\n", argc, argv[0]); + + // dimensions from test case + size_t a_dim = 12000000; + size_t b_dim = 128; + size_t c_dim = 16; + + // inputs + std::vector matrix_1(a_dim * b_dim); + std::vector matrix_2(a_dim * c_dim); + // output + std::vector matrix_3(b_dim * c_dim); + + size_t kBigSize = 1000000000; + std::vector very_big_matrix(kBigSize); + + // change reps here; maybe make it command line arg + for (size_t reps = 0; reps < 3; reps++) { + // reinit + srand(0); + for (size_t i = 0; i < matrix_1.size(); i++) { + matrix_1[i] = rand() / static_cast(RAND_MAX / 10); + } + srand(1); + for (size_t i = 0; i < matrix_2.size(); i++) { + matrix_2[i] = rand() / static_cast(RAND_MAX / 10); + } + + very_big_matrix.clear(); + very_big_matrix.resize(kBigSize); + // cache flush + CacheFlush(&very_big_matrix); + + printf("Rep %lu\n", reps); + + // transpose because it's the same as the problematic call in GNN + // TODO(loc) non transpose version + CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); + } + + return 0; +} From 
24551108c8fb2928dda4cab2b95f41fc5c8885b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 16:42:02 -0500 Subject: [PATCH 546/660] GNN subgraph PODResizableArray rather than vector --- libgnn/include/galois/graphs/GNNGraph.h | 2 +- libgnn/include/galois/graphs/GNNSubgraph.h | 14 +++++------ libgnn/src/graphs/GNNSubgraph.cpp | 28 +++++++++++++++++----- 3 files changed, 30 insertions(+), 14 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c3bc396551..f3689b00be 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -522,7 +522,7 @@ class GNNGraph { return PointerWithSize(local_node_features_); } else { return PointerWithSize(subgraph_->GetLocalFeatures().data(), - subgraph_->GetLocalFeatures().size()); + subgraph_->size() * node_feature_length_); } } diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 0a7f2670c7..6be5fb04fc 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -19,7 +19,7 @@ class GNNSubgraph { size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); - galois::gstl::Vector& GetLocalFeatures() { + galois::PODResizeableArray& GetLocalFeatures() { return subgraph_node_features_; } @@ -124,20 +124,20 @@ class GNNSubgraph { //! Features corresponding only to this subgraph; copied from main graph //! (in other words, redundant; would be nice if there was a way to //! fake contiguous memory - galois::gstl::Vector subgraph_node_features_; + galois::PODResizeableArray subgraph_node_features_; //! Dense array mapping local ids to subgraph id (not space efficient) galois::LargeArray lid_to_subgraph_id_; //! Map subgraph ids back to local graph ids //! gstl vector because this will get resized every epoch (LargeArray //! is for static) - galois::gstl::Vector subgraph_id_to_lid_; + galois::PODResizeableArray subgraph_id_to_lid_; // intermediate degrees used for edge construction - galois::gstl::Vector local_subgraph_out_degrees_; - galois::gstl::Vector local_subgraph_in_degrees_; + galois::PODResizeableArray local_subgraph_out_degrees_; + galois::PODResizeableArray local_subgraph_in_degrees_; //! Maps from subgraph out-edge id to original graph edge id (used to check if //! edge exists in particular layer) - galois::gstl::Vector subedge_to_original_edge_; + galois::PODResizeableArray subedge_to_original_edge_; //! Maps from subgraph in-edge id to original graph edge id (used to check if //! 
edge exists in particular layer) - galois::gstl::Vector in_subedge_to_original_edge_; + galois::PODResizeableArray in_subedge_to_original_edge_; }; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index f5bde956f2..332cd98072 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -47,7 +47,9 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } }); num_subgraph_nodes_ = subgraph_count.reduce(); - subgraph_id_to_lid_.resize(num_subgraph_nodes_, 0); + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); + } // TODO(loc) depending on overhead, can parallelize this with a prefix sum // serial loop over LIDs to construct lid -> subgraph id mapping @@ -112,8 +114,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( galois::StatTimer timer("DegreeCounting", kRegionName); timer.start(); - local_subgraph_out_degrees_.resize(num_subgraph_nodes_); - local_subgraph_in_degrees_.resize(num_subgraph_nodes_); + if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02); + } galois::do_all( galois::iterate(begin(), end()), @@ -155,10 +162,15 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( } // allocate then set node endpoints - num_subgraph_edges_ = local_subgraph_out_degrees_.back(); + num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; + + galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); + alloc_time.start(); underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); + alloc_time.stop(); + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { underlying_graph_.fixEndEdge( @@ -166,8 +178,12 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.FixEndInEdge( subgraph_id, local_subgraph_in_degrees_[subgraph_id]); }); - subedge_to_original_edge_.resize(num_subgraph_edges_); - in_subedge_to_original_edge_.resize(num_subgraph_edges_); + if (subedge_to_original_edge_.size() < num_subgraph_edges_) { + subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { + in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } // save edges + save reference to layer sample status galois::do_all( From 154fd9c358a679d5db8452b653a7a06100bb02db Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 17:10:32 -0500 Subject: [PATCH 547/660] MKL micro delete galois and OMP loop --- libgnn/test/CMakeLists.txt | 12 ++++++++++++ libgnn/test/mkl_micro.cpp | 30 ++++++++++++++++++++++++++++++ 2 files changed, 42 insertions(+) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 3a7fec8729..b9c1eea043 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -5,6 +5,14 @@ target_include_directories(mkl_micro PUBLIC ) target_link_libraries(mkl_micro ${INTEL_LIBS}) +add_executable(mkl_micro_omp mkl_micro.cpp) +target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_include_directories(mkl_micro_omp PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(mkl_micro_omp ${INTEL_LIBS}) +target_link_libraries(mkl_micro_omp -fopenmp) + 
add_executable(mkl_micro_sgalois mkl_micro.cpp) target_link_libraries(mkl_micro_sgalois galois_gnn) target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) @@ -13,6 +21,10 @@ add_executable(mkl_micro_dgalois mkl_micro.cpp) target_link_libraries(mkl_micro_dgalois galois_gnn) target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) +add_executable(mkl_micro_delete_galois mkl_micro.cpp) +target_link_libraries(mkl_micro_delete_galois galois_gnn) +target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) + ################################################################################ add_executable(gnngraph-test gnngraph-test.cpp) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index e784b2cde0..63a3f3f33b 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -9,6 +9,13 @@ #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" #endif +#ifdef USE_SHARED_GALOIS_DELETE +#include "galois/Galois.h" +#endif + +#ifdef USE_OMP +#include "omp.h" +#endif // MKL wrapper void CBlasSGEMM(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, @@ -42,6 +49,21 @@ int main(int argc, char* argv[]) { galois::getActiveThreads()); #endif +#ifdef USE_SHARED_GALOIS_DELETE + std::unique_ptr G; + G = std::make_unique(); + + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); + printf("Deleting galois\n"); + G.reset(); +#endif + #ifdef USE_DIST_GALOIS galois::DistMemSys G; if (argc != 2) { @@ -86,6 +108,14 @@ int main(int argc, char* argv[]) { // cache flush CacheFlush(&very_big_matrix); + // dummy OMP TBB loop +#ifdef USE_OMP +#pragma omp parallel + for (size_t i = 0; i < very_big_matrix.size(); i++) { + very_big_matrix[i] = i; + } +#endif + printf("Rep %lu\n", reps); // transpose because it's the same as the problematic call in GNN From ee42d7e9d9bedcc0ec8b7b892f36678e86bebfb3 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 18 May 2021 22:42:05 -0500 Subject: [PATCH 548/660] mkl micro: use Large arrays when galois active --- libgnn/test/mkl_micro.cpp | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 63a3f3f33b..ea9511df74 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -5,9 +5,11 @@ #ifdef USE_SHARED_GALOIS #include "galois/Galois.h" +#include "galois/LargeArray.h" #endif #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" +#include "galois/LargeArray.h" #endif #ifdef USE_SHARED_GALOIS_DELETE #include "galois/Galois.h" @@ -82,11 +84,23 @@ int main(int argc, char* argv[]) { size_t b_dim = 128; size_t c_dim = 16; +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) + printf("Using Galois large arrays\n"); + // inputs + galois::LargeArray matrix_1; + matrix_1.create(a_dim * b_dim); + galois::LargeArray matrix_2; + matrix_2.create(a_dim * c_dim); + // output + galois::LargeArray matrix_3; + matrix_3.create(b_dim * c_dim); +#else // inputs std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output std::vector matrix_3(b_dim * c_dim); +#endif size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); From 27feed3824eea88c792cf22e2a45c6702f91c821 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 19 May 2021 18:17:38 -0500 Subject: [PATCH 549/660] Timer disabling option in GNNs 1) Adds method to disable timers for 
GNNs; this is mostly going to be used to not time anything for Test phase since I don't want test time to get included in existing timers (all I care for is training time breakdown). 2) Small change to how newly sampled nodes are set in sampling: rather than use getOffsets on bitset, just loop over it myself and call test individually on each bit; saves materialization of vector. --- libgnn/include/galois/GraphNeuralNetwork.h | 16 +++ libgnn/include/galois/graphs/GNNGraph.h | 15 +++ libgnn/include/galois/graphs/GNNSubgraph.h | 12 +++ libgnn/include/galois/layers/GNNLayer.h | 14 ++- libgnn/include/galois/layers/SAGELayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 47 ++++++--- libgnn/src/graphs/GNNGraph.cpp | 111 +++++++++++++++------ libgnn/src/graphs/GNNSubgraph.cpp | 28 +++--- libgnn/src/layers/GNNLayer.cpp | 48 ++++----- libgnn/src/layers/SAGELayer.cpp | 52 ++++++---- libgnn/src/layers/SoftmaxLayer.cpp | 9 ++ 11 files changed, 248 insertions(+), 105 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index af1955d258..fc200e7baa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -200,6 +200,22 @@ class GraphNeuralNetwork { void GradientPropagation(); private: + static const constexpr char* kRegionName = "GraphNeuralNetwork"; + + void EnableTimers() { + galois::gDebug("Enabling timers"); + graph_->EnableTimers(); + for (auto& layer : gnn_layers_) + layer->EnableTimers(); + } + + void DisableTimers() { + galois::gDebug("Disabling timers"); + graph_->DisableTimers(); + for (auto& layer : gnn_layers_) + layer->DisableTimers(); + } + //! Underlying graph to train std::unique_ptr graph_; //! Optimizer object for weight updates diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index f3689b00be..6e2b211e00 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -613,6 +613,19 @@ class GNNGraph { void ContiguousRemap(const std::string& new_name); + void EnableTimers() { + use_timer_ = true; + if (subgraph_) { + subgraph_->EnableTimers(); + } + } + void DisableTimers() { + use_timer_ = false; + if (subgraph_) { + subgraph_->DisableTimers(); + } + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -776,6 +789,8 @@ class GNNGraph { DGAccumulator local_true_negative_; DGAccumulator local_false_positive_; DGAccumulator local_false_negative_; + + bool use_timer_{true}; }; } // namespace graphs diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 6be5fb04fc..81825e2ed1 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -97,8 +97,20 @@ class GNNSubgraph { galois::LargeArray* GetLIDToSIDPointer() { return &lid_to_subgraph_id_; } + void EnableTimers() { use_timer_ = true; } + void DisableTimers() { use_timer_ = false; } private: + bool use_timer_{true}; + void TimerStart(galois::StatTimer* t) { + if (use_timer_) + t->start(); + } + void TimerStop(galois::StatTimer* t) { + if (use_timer_) + t->stop(); + } + //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. 
void CreateSubgraphMapping(const GNNGraph& gnn_graph, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 4f5822d1b2..c835d05454 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -98,7 +98,7 @@ class GNNLayer { GNNLayerConfig()) {} virtual void ResizeRows(size_t new_row_count) { - layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.input_rows = new_row_count; layer_dimensions_.output_rows = new_row_count; // TODO(loc) output matrix should be resized if space becomes an issue, // else just use first S rows (S = subgraph size) @@ -231,6 +231,8 @@ class GNNLayer { base_gpu_object_.PrintBackwardOutput(p_backward_output_matrix_.size()); } #endif + void EnableTimers() { use_timer_ = true; } + void DisableTimers() { use_timer_ = false; } protected: //! Layer order (starts from 0); used in backward to shortcut output as layer @@ -287,6 +289,16 @@ class GNNLayer { ////////////////////////////////////////////////////////////////////////////// + bool use_timer_{true}; + void TimerStart(galois::StatTimer* t) { + if (use_timer_) + t->start(); + } + void TimerStop(galois::StatTimer* t) { + if (use_timer_) + t->stop(); + } + //! Init based from following paper //! http://proceedings.mlr.press/v9/glorot10a/glorot10a.pdf //! Since it is unclear what j and j+1 refer to in that paper, the things diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 3f12978663..0711862240 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -96,6 +96,7 @@ class SAGELayer : public GNNLayer { private: static const constexpr char* kRegionName = "SAGELayer"; + //! CPU aggregation void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index fbaad4ff4d..02be5edbf4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -216,6 +216,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } float galois::GraphNeuralNetwork::Train(size_t num_epochs) { + EnableTimers(); const size_t this_host = graph_->host_id(); float train_accuracy{0.f}; std::vector subgraph_layer_sizes; @@ -251,9 +252,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->ConstructSampledSubgraph(num_sampled_layers); } - galois::StatTimer epoch_timer("TrainingTime", "GraphNeuralNetwork"); - galois::StatTimer validation_timer("ValidationTime", "GraphNeuralNetwork"); - galois::StatTimer epoch_test_timer("TestTime", "GraphNeuralNetwork"); + galois::StatTimer epoch_timer("TrainingTime", kRegionName); + galois::StatTimer validation_timer("ValidationTime", kRegionName); + galois::StatTimer epoch_test_timer("TestTime", kRegionName); for (size_t epoch = 0; epoch < num_epochs; epoch++) { epoch_timer.start(); @@ -277,6 +278,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling if (config_.do_sampling() && !config_.train_minibatch_size()) { + galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + mb_timer.start(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", @@ -308,6 +312,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices 
graph_->ConstructSampledSubgraph(num_sampled_layers); + + mb_timer.stop(); } if (!config_.train_minibatch_size()) { @@ -325,9 +331,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // create mini batch graphs and loop until minibatches on all hosts done while (true) { + galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + mb_timer.start(); + const std::string btime_name("Epoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num)); - galois::StatTimer batch_timer(btime_name.c_str(), "GraphNeuralNetwork"); + galois::StatTimer batch_timer(btime_name.c_str(), kRegionName); batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); @@ -377,6 +386,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // XXX resizes above only work for SAGE layers; will break if other // layers are tested + mb_timer.stop(); + const PointerWithSize batch_pred = DoInference(); train_accuracy = GetGlobalAccuracy(batch_pred); GradientPropagation(); @@ -395,6 +406,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { : false; if (test_eval) { + DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { graph_->DisableSubgraph(); @@ -415,17 +427,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const std::string test_name_acc = "TestEpoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num - 1) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", - test_name_acc, test_acc); + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); } // report the training time elapsed at this point in time galois::runtime::reportStat_Single( - "GraphNeuralNetwork", + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + std::to_string(batch_num - 1), epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + EnableTimers(); } epoch_timer.start(); @@ -442,7 +455,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "TrainEpoch" + std::to_string(epoch) + "Accuracy"; galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); - galois::runtime::reportStat_Single("GraphNeuralNetwork", t_name_acc, + galois::runtime::reportStat_Single(kRegionName, t_name_acc, train_accuracy); } @@ -453,6 +466,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; if (do_validate || do_test) { + DisableTimers(); // disable subgraph graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); @@ -473,8 +487,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { "\n"); const std::string v_name_acc = "ValEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", v_name_acc, - val_acc); + galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); } } @@ -497,7 +510,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); const std::string test_name_acc = "TestEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single("GraphNeuralNetwork", test_name_acc, + galois::runtime::reportStat_Single(kRegionName, test_name_acc, test_acc); } } @@ -505,7 +518,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (do_validate || do_test) { // report the training time elapsed at this point in time galois::runtime::reportStat_Single( - "GraphNeuralNetwork", "ElapsedTrainTimeEpoch" + std::to_string(epoch), + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); @@ -538,11 +551,13 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } graph_->ConstructSampledSubgraph(num_sampled_layers); } + + EnableTimers(); } } uint64_t average_epoch_time = epoch_timer.get() / num_epochs; - galois::runtime::reportStat_Tavg("GraphNeuralNetwork", "AverageEpochTime", + galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); // disable subgraph graph_->DisableSubgraph(); @@ -553,7 +568,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // check test accuracy // XXX test batching - galois::StatTimer test_timer("FinalTestRun", "GraphNeuralNetwork"); + galois::StatTimer test_timer("FinalTestRun", kRegionName); float global_accuracy; test_timer.start(); @@ -570,8 +585,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (this_host == 0) { galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); - galois::runtime::reportStat_Single("GraphNeuralNetwork", - "FinalTestAccuracy", global_accuracy); + galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", + global_accuracy); } // return global_accuracy; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fca2c78cfd..b77d27eb7a 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -207,9 +207,13 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { - sync_substrate_ - ->sync( - "GraphAggregateSync"); + if (use_timer_) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync("Ignore"); + } } else { sync_substrate_->sync( @@ -220,8 +224,13 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); if (!is_backward) { - sync_substrate_->sync("GraphAggregateSync"); + if (use_timer_) { + sync_substrate_->sync("GraphAggregateSync"); + } else { + sync_substrate_->sync("Ignore"); + } } else { sync_substrate_->sync( @@ -248,13 +257,24 @@ void galois::graphs::GNNGraph::AggregateSyncGPU( 
cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), layer_number); + // XXX no timer if use_timer is off if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + if (use_timer_) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->sync( + "Ignore", gnn_matrix_to_sync_column_length_); + } } else if (gnn_matrix_to_sync_column_length_ == layer_output_mtx_column_size) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + if (use_timer_) { + sync_substrate_->sync( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->sync( + "Ignore", gnn_matrix_to_sync_column_length_); + } } else { GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" " match to the column size of the CUDA context"); @@ -924,10 +944,16 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sampled_degrees_.resize(partitioned_graph_->size()); bitset_sampled_degrees_.reset(); - // Write source = masters - sync_substrate_->sync( - "SampleSync"); - + // Seed nodes sync + if (use_timer_) { + sync_substrate_ + ->sync( + "SeedNodeSample"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } galois::GAccumulator local_seed_count; local_seed_count.reset(); // count # of seed nodes @@ -985,16 +1011,26 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, // sampled.reduce(), // " out of ", total.reduce(), "\n"); - std::vector new_nodes = bitset_sample_flag_.getOffsets(); // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( - galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, galois::loopname("NeighborhoodSampleSet")); - sync_substrate_ - ->sync( - "SampleSync"); + + if (use_timer_) { + sync_substrate_ + ->sync( + "SampleFlag"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } galois::GAccumulator local_sample_count; local_sample_count.reset(); @@ -1081,18 +1117,29 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", // sampled.reduce(), " out of ", total.reduce()); - std::vector new_nodes = bitset_sample_flag_.getOffsets(); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( - galois::iterate(new_nodes), - [&](uint32_t new_node_id) { SetSampledNode(new_node_id); }, + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, galois::loopname("NeighborhoodSampleSet")); - sync_substrate_ - ->sync( - "SampleSync"); + // why not read source? 
even if it doesn't need to sample anything, it needs + // to know that it's active so that subgraph construction can proceed + // correctly + if (use_timer_) { + sync_substrate_ + ->sync( + "SampleFlag"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } // count sampled node size galois::GAccumulator local_sample_count; @@ -1121,9 +1168,15 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, gnn_sampled_out_degrees_ = &sampled_out_degrees_; // first, sync the degres of the sampled edges across all hosts - sync_substrate_ - ->sync( - "SubgraphDegree"); + if (use_timer_) { + sync_substrate_ + ->sync( + "SubgraphDegree"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } size_t num_subgraph_nodes; // use_view = true; if (!use_view) { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 332cd98072..dcb5c0f2db 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -4,7 +4,7 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); - timer.start(); + TimerStart(&timer); CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; @@ -14,24 +14,24 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( NodeFeatureCreation(gnn_graph); // loop over each node, grab out/in edges, construct them in LC_CSR_CSC // no edge data, just topology - timer.stop(); + TimerStop(&timer); return num_subgraph_nodes_; } size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraphView", kRegionName); - timer.start(); + TimerStart(&timer); CreateSubgraphMapping(gnn_graph, num_sampled_layers); NodeFeatureCreation(gnn_graph); - timer.stop(); + TimerStop(&timer); return num_subgraph_nodes_; } void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( const GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("SIDMapping", kRegionName); - timer.start(); + TimerStart(&timer); assert(gnn_graph.size() == lid_to_subgraph_id_.size()); // clear all mappings @@ -105,14 +105,14 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); // num_subgraph_nodes_ = current_sid; - timer.stop(); + TimerStop(&timer); } // TODO optimize further? void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( const GNNGraph& gnn_graph) { galois::StatTimer timer("DegreeCounting", kRegionName); - timer.start(); + TimerStart(&timer); if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); @@ -146,14 +146,14 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( }, galois::steal()); - timer.stop(); + TimerStop(&timer); } // TODO optimize further? 
void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); - timer.start(); + TimerStart(&timer); // prefix sum over subgraph degrees from previous phase to get starting points for (size_t i = 1; i < num_subgraph_nodes_; i++) { @@ -165,11 +165,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); - alloc_time.start(); + TimerStart(&alloc_time); underlying_graph_.DeallocateOnly(); underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); underlying_graph_.CSCAllocate(); - alloc_time.stop(); + TimerStop(&alloc_time); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { @@ -223,13 +223,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, galois::steal()); - timer.stop(); + TimerStop(&timer); } void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( GNNGraph& gnn_graph) { galois::StatTimer timer("NodeFeatureCreation", kRegionName); - timer.start(); + TimerStart(&timer); size_t feat_length = gnn_graph.node_feature_length(); // assumes everything is already setup subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); @@ -241,5 +241,5 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), feat_length * sizeof(GNNFeature)); }); - timer.stop(); + TimerStop(&timer); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 1dabce8476..171ae5c05d 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -192,7 +192,7 @@ void galois::GNNLayer::DoDropout( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { galois::StatTimer timer("ForwardDropout", "GNNLayer"); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, @@ -203,14 +203,14 @@ void galois::GNNLayer::DoDropout( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::ReconstructDropoutMatrix( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); - timer.start(); + TimerStart(&timer); // reuse the dropout mask from a previous dropout call size_t num_elements = output_matrix->size(); GNNFloat scale = 1. / (1. - config_.dropout_rate); @@ -230,12 +230,12 @@ void galois::GNNLayer::ReconstructDropoutMatrix( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::DoDropoutDerivative() { galois::StatTimer timer("BackwardDropout", "GNNLayer"); - timer.start(); + TimerStart(&timer); assert(p_backward_output_matrix_.size() == dropout_mask_.size()); GNNFloat scale = 1. / (1. 
- config_.dropout_rate); @@ -258,12 +258,12 @@ void galois::GNNLayer::DoDropoutDerivative() { #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::Activation() { galois::StatTimer timer("ForwardActivation", "GNNLayer"); - timer.start(); + TimerStart(&timer); // TODO only does relu at the moment; should check user specified activation // and act accordingly @@ -277,27 +277,28 @@ void galois::GNNLayer::Activation() { } activation_memo_.reset(); - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - forward_output_matrix_[i] = 0; - } - }, - galois::loopname("ReLU")); + galois::do_all(galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + forward_output_matrix_[i] = 0; + } + }); #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::GNNLayer::ActivationDerivative( PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + TimerStart(&timer); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.ActivationDerivativeGPU(gradient->data(), @@ -321,11 +322,12 @@ void galois::GNNLayer::ActivationDerivative( #ifdef GALOIS_ENABLE_GPU } #endif + TimerStop(&timer); } void galois::GNNLayer::WeightGradientSyncSum() { galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); - t.start(); + TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_.size()); // TODO(loc) remove this limitation later; can just do a loop over the weight @@ -352,7 +354,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { #ifdef GALOIS_ENABLE_GPU } #endif - t.stop(); + TimerStop(&t); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 3f712df0f7..8e2470ffda 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -142,7 +142,7 @@ galois::SAGELayer::SAGELayer(size_t layer_num, void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); - t.start(); + TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); #ifdef GALOIS_ENABLE_GPU bool gpu_direct_enabled = false; @@ -168,13 +168,13 @@ void galois::SAGELayer::WeightGradientSyncSum2() { #ifdef GALOIS_ENABLE_GPU } #endif - t.stop(); + TimerStop(&t); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { galois::StatTimer timer("ForwardPhase", kRegionName); - timer.start(); + TimerStart(&timer); assert(input_embeddings.size() >= (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); @@ -227,7 +227,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_forward_output_matrix_.size() >= (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - timer.stop(); + TimerStop(&timer); return p_forward_output_matrix_; } @@ -236,7 +236,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer 
timer("BackwardPhase", kRegionName); - timer.start(); + TimerStart(&timer); assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); @@ -290,11 +290,15 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // input data (prev layer input or temp1) or gradient need mask // can mask gradient if layer == 0 // otherwise must mask other + + galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); + TimerStart(&concat_grad_timer); galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.output_rows, layer_dimensions_.output_columns, input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); + TimerStop(&concat_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -324,11 +328,14 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { #endif // agg data holds aggregated feature vectors from forward phase + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + TimerStart(&normal_grad_timer); galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.output_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); + TimerStop(&normal_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -372,11 +379,14 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { #endif // input col x input row * input row x output col + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + TimerStart(&normal_grad_timer); galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); + TimerStop(&normal_grad_timer); #ifdef GALOIS_ENABLE_GPU } #endif @@ -403,7 +413,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( DoDropoutDerivative(); } - timer.stop(); + TimerStop(&timer); return p_backward_output_matrix_; } @@ -421,14 +431,14 @@ void galois::SAGELayer::AggregateAll( [[maybe_unused]] galois::substrate::PerThreadStorage>* pts, bool is_backward) { - std::string agg_timer_name = "Aggregate"; + std::string agg_timer_name = "AggregateCompute"; if (!is_backward) { agg_timer_name += "Forward"; } else { agg_timer_name += "Backward"; } galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -445,10 +455,12 @@ void galois::SAGELayer::AggregateAll( #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, is_backward); + TimerStop(&timer); + // aggregate sync + graph_.AggregateSync(aggregate_output, column_length, is_backward); #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); } void galois::SAGELayer::AggregateAllCPU( @@ -557,17 +569,13 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("ConvolutionalAggregateAll")); - - // aggregate sync - graph_.AggregateSync(aggregate_output, column_length, is_backward); + galois::chunk_size<1>(), galois::steal()); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { galois::StatTimer timer("ForwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU // TODO self change // XXX(hochan) output rows @@ -590,13 +598,13 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, 
#ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::SelfFeatureUpdateEmbeddings( const GNNFloat* node_embeddings, GNNFloat* output) { galois::StatTimer timer("SelfForwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.SelfFeatureUpdateEmbeddingsGPU( @@ -612,13 +620,13 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); @@ -640,13 +648,13 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("SelfBackwardXForm", kRegionName); - timer.start(); + TimerStart(&timer); assert(p_layer_weights_.size() >= layer_dimensions_.input_columns * layer_dimensions_.output_columns); @@ -667,7 +675,7 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( #ifdef GALOIS_ENABLE_GPU } #endif - timer.stop(); + TimerStop(&timer); } void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 312bdab9ac..e7cd7b00d1 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -5,6 +5,9 @@ const galois::PointerWithSize galois::SoftmaxLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + TimerStart(&timer); + // note: p_backward == input_embeddings input_loss_.assign(input_loss_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; @@ -62,6 +65,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); #endif + TimerStop(&timer); return p_backward_output_matrix_; } @@ -81,6 +85,9 @@ galois::SoftmaxLayer::ForwardPhase( galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { + galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + TimerStart(&timer); + const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( @@ -114,6 +121,8 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { }, galois::steal(), galois::loopname("SoftmaxBackward")); + TimerStop(&timer); + return p_backward_output_matrix_; } From 6e008daafc81cac132ae28e90062008bda5897a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 20 May 2021 19:11:33 -0500 Subject: [PATCH 550/660] GNN subgraphs always use global norm factor Subgraph degree norm factor was buggy because I wasn't finding the incoming degrees and using them for the subgraph; this caused accuracy to get really weird depending on code route taken. For simplicity (and to speedup subgraph construction) only global degrees are used even for subgraphs. This will 100% affect time to accuracy due to possible overcompensation during training, but it saves a lot of micromanagement. 
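Roughly, the norm computation this change leaves behind boils down to the sketch below (illustration only, not part of the diff; it assumes a plain array of global out-degrees indexed by local node id, like the global_degrees_ member this repo already keeps):

    #include <cstddef>
    #include <vector>

    // degree norm using only the full-graph out-degree, even when a sampled
    // subgraph is active; a zero degree yields a zero norm
    float GlobalDegreeNorm(const std::vector<std::size_t>& global_degrees,
                           std::size_t local_id) {
      std::size_t degree = global_degrees[local_id];
      return degree != 0 ? 1.0f / static_cast<float>(degree) : 0.0f;
    }
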
--- libgnn/include/galois/MinibatchGenerator.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 43 ++++++++------ libgnn/src/GraphNeuralNetwork.cpp | 3 +- libgnn/src/graphs/GNNGraph.cpp | 67 +++++++++++----------- libgnn/src/layers/SAGELayer.cpp | 8 +-- libgnn/src/layers/SoftmaxLayer.cpp | 4 +- 6 files changed, 65 insertions(+), 62 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 11bce02848..8a5063ed1d 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -12,7 +12,7 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - master_bound_{master_bound} { + current_position_{0}, master_bound_{master_bound} { GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } void GetNextMinibatch(std::vector* batch_mask); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 6e2b211e00..971b00e676 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -451,27 +451,33 @@ class GNNGraph { } //! Get degree norm of subgraph for particular layer (i.e. includes training) - GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + // GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { + GNNFloat GetDegreeNorm(GraphNode n, size_t) const { if (use_subgraph_ || use_subgraph_view_) { - size_t degree; - if (!subgraph_is_train_) { - // case because degrees in each layer differ - degree = - sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; - } else { - // XXX if inductive - // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; - degree = global_degrees_[subgraph_->SIDToLID(n)]; - } - - if (degree) { - return 1.0 / degree; - } else { - return 0; - } + // TODO(loc) this is impresise: subgraph degrees differ from global + // degrees, but going to always use global degree -> not correct + return GetGlobalDegreeNorm(subgraph_->SIDToLID(n)); } else { return GetGlobalDegreeNorm(n); } + + // size_t degree; + // if (!subgraph_is_train_) { + // // case because degrees in each layer differ + // degree = + // sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + // } else { + // // XXX if inductive + // // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + // degree = global_degrees_[subgraph_->SIDToLID(n)]; + // } + // //degree = global_degrees_[subgraph_->SIDToLID(n)]; + + // if (degree) { + // return 1.0 / degree; + // } else { + // return 0; + // } } // Get accuracy: sampling is by default false @@ -708,7 +714,8 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - std::vector> sampled_out_degrees_; + // std::vector> sampled_out_degrees_; + // std::vector> sampled_in_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! 
if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 02be5edbf4..92af85b278 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -172,6 +172,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // last layer input size/output rows becomes seed node size gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -499,7 +500,6 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); epoch_test_timer.stop(); - test_acc = GetGlobalAccuracy(test_pred); } else { test_acc = MinibatchedTesting(); @@ -559,6 +559,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); + DisableTimers(); // disable subgraph graph_->DisableSubgraph(); // TODO only do this when necessary diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b77d27eb7a..a445e299a5 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -890,10 +890,10 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, // this is slightly problematic possibly, but each layer is its own // subgraph if (!is_inductive) { - sampled_out_degrees_.resize(num_layers); - for (galois::LargeArray& array : sampled_out_degrees_) { - array.create(partitioned_graph_->size()); - } + // sampled_out_degrees_.resize(num_layers); + // for (galois::LargeArray& array : sampled_out_degrees_) { + // array.create(partitioned_graph_->size()); + //} } else { subgraph_is_train_ = true; } @@ -906,7 +906,6 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); - // for now, if training node, it goes into seed node galois::do_all(galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { if (IsValidForPhase(*x, seed_phase)) { @@ -933,16 +932,16 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - if (!subgraph_is_train_) { - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); - }, - galois::chunk_size<1>()); - } - bitset_sampled_degrees_.resize(partitioned_graph_->size()); - bitset_sampled_degrees_.reset(); + // if (!subgraph_is_train_) { + // galois::do_all( + // galois::iterate(sampled_out_degrees_), + // [&](galois::LargeArray& array) { + // std::fill(array.begin(), array.end(), 0); + // }, + // galois::chunk_size<1>()); + //} + // bitset_sampled_degrees_.resize(partitioned_graph_->size()); + // bitset_sampled_degrees_.reset(); // Seed nodes sync if (use_timer_) { @@ -1093,7 +1092,7 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } } - // if here, it means edge accepted; set sampled on, mark source + // if here, it means edge accepted; set sampled on, mark // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); if (!IsInSampledGraph( @@ -1101,9 +1100,9 @@ size_t 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - bitset_sampled_degrees_.set(*src_iter); + // bitset_sampled_degrees_.set(*src_iter); // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; + // sampled_out_degrees_[sample_layer_num][*src_iter]++; sampled += 1; } } @@ -1163,29 +1162,29 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; - gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - // first, sync the degres of the sampled edges across all hosts - if (use_timer_) { - sync_substrate_ - ->sync( - "SubgraphDegree"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } + use_subgraph_ = false; + use_subgraph_view_ = false; + // gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + //// first, sync the degres of the sampled edges across all hosts + // if (use_timer_) { + // sync_substrate_ + // ->sync( + // "SubgraphDegree"); + //} else { + // sync_substrate_ + // ->sync( + // "Ignore"); + //} size_t num_subgraph_nodes; - // use_view = true; if (!use_view) { num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { // a view only has lid<->sid mappings num_subgraph_nodes = subgraph_->BuildSubgraphView(*this, num_sampled_layers); - // SortAllInEdgesBySID(); } // after this, this graph is a subgraph diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8e2470ffda..48a7da9b94 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -508,13 +508,7 @@ void galois::SAGELayer::AggregateAllCPU( size_t index_to_dst_feature = dst * column_length; if (!config_.disable_normalization) { - GNNFloat norm_scale; - if (!is_backward) { - norm_scale = source_norm; - } else { - norm_scale = - graph_.GetDegreeNorm(dst, graph_user_layer_number_); - } + GNNFloat norm_scale = source_norm; galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index e7cd7b00d1..ade63b9d1e 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -22,7 +22,9 @@ galois::SoftmaxLayer::ForwardPhaseCPU( galois::iterate(graph_.begin(), graph_.end()), [&](const unsigned i) { if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(i)) { + if ((layer_phase_ == GNNPhase::kTrain || + layer_phase_ == GNNPhase::kBatch) && + !graph_.IsInSampledGraph(i)) { // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); From 7f8779eb106ab4a639925ea2dc961e023bd9f591 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 15:17:25 -0500 Subject: [PATCH 551/660] Fixed rows for GNN/SAGE (not GCN) input/output rows can differ now, so intermediate matrices also have different sizes that must be used. This commit fixes that (and adds a argument to update embeddings and some other functions that deals with the rows). 
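The gist of the new bool argument on the update/derivative calls is just which row count feeds the matrix multiply, since input and output row counts can now differ; a minimal sketch (illustration only, not part of the diff; the names mirror the layer_dimensions_ fields used in the code):

    #include <cstddef>

    // pick the row count for the update SGEMM: after aggregation the matrix
    // already has output_rows rows, before aggregation it still has input_rows
    std::size_t RowsForUpdate(bool after_aggregation, std::size_t input_rows,
                              std::size_t output_rows) {
      return after_aggregation ? output_rows : input_rows;
    }
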
--- libgnn/include/galois/layers/GNNLayer.h | 11 ++- libgnn/include/galois/layers/SAGELayer.h | 6 +- libgnn/src/layers/GNNLayer.cpp | 21 ++++- libgnn/src/layers/SAGELayer.cpp | 97 ++++++++++++++---------- 4 files changed, 89 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index c835d05454..ac8f2c8f05 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -348,9 +348,16 @@ class GNNLayer { #endif //! Mask a input size'd matrix's rows that correspond to mirrors - void MaskInputNonMasters(PointerWithSize* input); + void MaskInputNonMasters(PointerWithSize* input) { + MaskInputNonMasters(input, std::numeric_limits::max()); + } + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); //! Mask a gradient size'd matrix's rows that correspond to mirrors - void MaskGradientNonMasters(PointerWithSize* gradients); + void MaskGradientNonMasters(PointerWithSize* input) { + MaskGradientNonMasters(input, std::numeric_limits::max()); + } + void MaskGradientNonMasters(PointerWithSize* gradients, + size_t max_rows); //! Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 0711862240..e127b78e73 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -118,12 +118,14 @@ class SAGELayer : public GNNLayer { bool is_backward); //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output, + bool after); //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output, + bool after); //! 
Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 171ae5c05d..07e839cb48 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -357,12 +357,21 @@ void galois::GNNLayer::WeightGradientSyncSum() { TimerStop(&t); } -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, + size_t max_rows) { assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.input_columns; assert((row_index * layer_dimensions_.input_rows) <= input->size()); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); @@ -383,11 +392,19 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input) { } void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient) { + PointerWithSize* gradient, size_t max_rows) { assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); size_t row_index = layer_dimensions_.output_columns; + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 48a7da9b94..5aa8e24a77 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -61,21 +61,22 @@ galois::SAGELayer::SAGELayer(size_t layer_num, second_weight_optimizer_ = std::make_unique(weight_size, 1); } - size_t num_input_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; + // TODO(loc) dropout uses input rows; this won't work if dropout is enabled + size_t num_in_temp_elements = + layer_dimensions_.output_rows * layer_dimensions_.input_columns; // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); + ", SAGE input temp var 1 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_input_elements); + gpu_object_.AllocateInTemp1(num_in_temp_elements); } else { #endif - in_temp_1_.resize(num_input_elements, 0); + in_temp_1_.resize(num_in_temp_elements, 0); #ifdef GALOIS_ENABLE_GPU } #endif @@ -86,33 +87,33 @@ galois::SAGELayer::SAGELayer(size_t layer_num, (config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 2 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); + ", 
SAGE input temp var 2 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_input_elements); + gpu_object_.AllocateInTemp2(num_in_temp_elements); } else { #endif - in_temp_2_.resize(num_input_elements, 0); + in_temp_2_.resize(num_in_temp_elements, 0); #ifdef GALOIS_ENABLE_GPU } #endif } - size_t num_output_elements = - layer_dimensions_.output_rows * layer_dimensions_.output_columns; + size_t num_out_temp = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; // only needed if out temp would be smaller than intemp if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); + ", SAGE output temp var ", num_out_temp, " (", + FloatElementsToGB(num_out_temp), " GB)"); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_elements); + gpu_object_.AllocateOutTemp(num_out_temp); } else { #endif - out_temp_.resize(num_output_elements, 0); + out_temp_.resize(num_out_temp, 0); #ifdef GALOIS_ENABLE_GPU } #endif @@ -122,10 +123,10 @@ galois::SAGELayer::SAGELayer(size_t layer_num, #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { // init pointers with size - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_in_temp_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_in_temp_elements); p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), num_output_elements); } else { @@ -202,11 +203,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); - UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); + UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { // update to aggregate // FW - UpdateEmbeddings(input_data, p_out_temp_.data()); + UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), @@ -272,11 +273,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (!sage_config_.disable_concat) { // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { - MaskInputNonMasters(&input_data); + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(input_gradient); + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); } #ifdef GALOIS_ENABLE_GPU @@ -313,7 +314,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // mask it, then use it // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { - MaskInputNonMasters(&agg_data); + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); } // if concat is disabled, then input grad isn't masked; therefore, mask 
// this to get the same effect @@ -345,11 +346,12 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() >= - layer_dimensions_.input_rows * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); // pintemp1 contains (AF)' // overwrites the dropout matrix that was in ptemp1 (needed for second // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), + true); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -361,11 +363,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // disable concat part is here because otherwise it would get done elsewhere // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { - MaskInputNonMasters(&input_data); + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_); + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); } // W' = F^T (FW)' @@ -395,7 +397,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // derivative for update // backout = F' UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); + p_backward_output_matrix_.data(), false); } } WeightGradientSyncSum(); @@ -567,7 +569,7 @@ void galois::SAGELayer::AggregateAllCPU( } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output) { + GNNFloat* output, bool after) { galois::StatTimer timer("ForwardXForm", kRegionName); TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU @@ -585,10 +587,17 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, layer_dimensions_.input_columns, " ", layer_dimensions_.output_columns); // CPU version is just a call into CBlas - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); + } else { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, + layer_dimensions_.input_columns, layer_dimensions_.output_columns, + node_embeddings, layer_weights_.data(), output); + } #ifdef GALOIS_ENABLE_GPU } #endif @@ -618,7 +627,8 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddings( } void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output) { + GNNFloat* output, + bool after) { galois::StatTimer timer("BackwardXForm", kRegionName); TimerStart(&timer); @@ -635,10 +645,17 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, // difference is Trans for B matrix (data) to get z by y (weights is y by z // normally); result is x by y // note input rows is used here due to transpose of aggregation - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - 
layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, + layer_dimensions_.output_columns, layer_dimensions_.input_columns, + gradients, layer_weights_.data(), output); + } else { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, + layer_dimensions_.output_columns, + layer_dimensions_.input_columns, gradients, + layer_weights_.data(), output); + } #ifdef GALOIS_ENABLE_GPU } #endif From a23cb1437039625dc285d27f94f6889731e72e9e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 18:43:55 -0500 Subject: [PATCH 552/660] Back to using subgraph degrees Subgraph degrees used again because accuracy when sampling suffers otherwise. Don't actually need in-degrees because out-degree was used in forward (meaning it's used in backward). Also, norm factor should **NEVER** be 0 after some more thought. Added an assertion checking for it in the debug build. The other fix this commit includes is that the "choose all" mode must be set appropriately when testing occurs (since test will always use all degrees). --- libgnn/include/galois/graphs/GNNGraph.h | 49 +++++++------ libgnn/include/galois/layers/GNNLayer.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 20 +++++- libgnn/src/graphs/GNNGraph.cpp | 92 ++++++++++--------------- libgnn/src/layers/SAGELayer.cpp | 8 ++- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 6 files changed, 87 insertions(+), 85 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 971b00e676..c40a3e20de 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -451,33 +451,26 @@ class GNNGraph { } //! Get degree norm of subgraph for particular layer (i.e. 
includes training) - // GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { - GNNFloat GetDegreeNorm(GraphNode n, size_t) const { + GNNFloat GetDegreeNorm(GraphNode n, size_t graph_user_layer_num) const { if (use_subgraph_ || use_subgraph_view_) { - // TODO(loc) this is impresise: subgraph degrees differ from global - // degrees, but going to always use global degree -> not correct - return GetGlobalDegreeNorm(subgraph_->SIDToLID(n)); + size_t degree; + if (!subgraph_choose_all_) { + // case because degrees in each layer differ + degree = + sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; + } else { + // XXX if inductive + // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(n)]; + } + if (degree) { + return 1.0 / degree; + } else { + return 0; + } } else { return GetGlobalDegreeNorm(n); } - - // size_t degree; - // if (!subgraph_is_train_) { - // // case because degrees in each layer differ - // degree = - // sampled_out_degrees_[graph_user_layer_num][subgraph_->SIDToLID(n)]; - // } else { - // // XXX if inductive - // // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; - // degree = global_degrees_[subgraph_->SIDToLID(n)]; - // } - // //degree = global_degrees_[subgraph_->SIDToLID(n)]; - - // if (degree) { - // return 1.0 / degree; - // } else { - // return 0; - // } } // Get accuracy: sampling is by default false @@ -632,6 +625,11 @@ class GNNGraph { } } + bool SubgraphChooseAllStatus() { return subgraph_choose_all_; } + void EnableSubgraphChooseAll() { subgraph_choose_all_ = true; } + void DisableSubgraphChooseAll() { subgraph_choose_all_ = false; } + void SetSubgraphChooseAll(bool a) { subgraph_choose_all_ = a; } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -714,8 +712,7 @@ class GNNGraph { std::unique_ptr subgraph_; // Degrees for sampled subgraph - // std::vector> sampled_out_degrees_; - // std::vector> sampled_in_degrees_; + std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer galois::LargeArray> edge_sample_status_; @@ -768,7 +765,7 @@ class GNNGraph { // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool use_subgraph_view_{false}; - bool subgraph_is_train_{false}; + bool subgraph_choose_all_{false}; std::unique_ptr train_batcher_; std::unique_ptr test_batcher_; diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index ac8f2c8f05..45a9c08893 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -184,6 +184,7 @@ class GNNLayer { //! Flip sampling switch on void EnableSampling() { config_.do_sampling = true; } + void DisableSampling() { config_.do_sampling = false; } bool IsSampledLayer() const { return config_.do_sampling; } //! Sets the graph user layer number; important for sampling as this index //! 
determines which index to use when checking for sampled edges diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 92af85b278..01974baaca 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -160,10 +160,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( } float galois::GraphNeuralNetwork::MinibatchedTesting() { - galois::gDebug("minibatched testing"); + galois::gDebug("Minibatched Testing"); graph_->ResetTestMinibatcher(); SetLayerPhases(galois::GNNPhase::kBatch); + bool choose_all_status = graph_->SubgraphChooseAllStatus(); + uint32_t correct = 0; uint32_t total = 0; while (true) { @@ -196,6 +198,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); + graph_->EnableSubgraphChooseAll(); const PointerWithSize batch_pred = DoInference(); std::pair correct_total = @@ -211,7 +214,13 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } } - galois::gDebug("correct / total ", correct, " ", total); + galois::gDebug("Minibatching Correct / Total ", correct, " ", total); + + if (choose_all_status) { + graph_->EnableSubgraphChooseAll(); + } else { + graph_->DisableSubgraphChooseAll(); + } return (1.0 * correct) / (1.0 * total); } @@ -410,14 +419,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { + bool f = graph_->SubgraphChooseAllStatus(); graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { (*layer)->ResizeRows(graph_->size()); } SetLayerPhases(galois::GNNPhase::kTest); + graph_->EnableSubgraphChooseAll(); const PointerWithSize test_pred = DoInference(); test_acc = GetGlobalAccuracy(test_pred); + graph_->SetSubgraphChooseAll(f); } else { test_acc = MinibatchedTesting(); } @@ -466,6 +478,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { bool do_test = config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); + if (do_validate || do_test) { DisableTimers(); // disable subgraph @@ -474,6 +488,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { layer++) { (*layer)->ResizeRows(graph_->size()); } + graph_->EnableSubgraphChooseAll(); } if (do_validate) { @@ -522,6 +537,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.get()); // revert to training phase for next epoch SetLayerPhases(galois::GNNPhase::kTrain); + graph_->SetSubgraphChooseAll(subgraph_choose_all_status); // TODO too much code dupe // Resconstruct the train subgraph since it was replaced by test subgraph diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index a445e299a5..0cb05e9a4f 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -881,21 +881,21 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( //////////////////////////////////////////////////////////////////////////////// void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, - bool is_inductive) { + bool choose_all) { subgraph_ = std::make_unique(partitioned_graph_->size()); sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); - // this is to hold the *global* degree of a sampled graph; yes, memory wise - // this is slightly problematic possibly, but each layer is its own - // subgraph - if (!is_inductive) { - // sampled_out_degrees_.resize(num_layers); - // for (galois::LargeArray& array : sampled_out_degrees_) { - // array.create(partitioned_graph_->size()); - //} + // this is to hold the degree of a sampled graph considering all hosts; yes, + // memory wise this is slightly problematic possibly, but each layer is its + // own subgraph + if (!choose_all) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } } else { - subgraph_is_train_ = true; + subgraph_choose_all_ = true; } } @@ -932,16 +932,17 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { edge_sample_status_[edge_id].end(), 0); }); // reset all degrees - // if (!subgraph_is_train_) { - // galois::do_all( - // galois::iterate(sampled_out_degrees_), - // [&](galois::LargeArray& array) { - // std::fill(array.begin(), array.end(), 0); - // }, - // galois::chunk_size<1>()); - //} - // bitset_sampled_degrees_.resize(partitioned_graph_->size()); - // bitset_sampled_degrees_.reset(); + if (!subgraph_choose_all_) { + galois::do_all( + galois::iterate(sampled_out_degrees_), + [&](galois::LargeArray& array) { + std::fill(array.begin(), array.end(), 0); + }, + galois::chunk_size<1>()); + } + + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + bitset_sampled_degrees_.reset(); // Seed nodes sync if (use_timer_) { @@ -1050,17 +1051,9 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp) { - assert(!subgraph_is_train_); use_subgraph_ = false; use_subgraph_view_ = false; - galois::GAccumulator sampled; - galois::GAccumulator total; - // galois::GAccumulator total_nodes; - sampled.reset(); - total.reset(); - // total_nodes.reset(); - galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { @@ -1079,7 +1072,6 @@ size_t 
galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // loop through edges, turn "on" edge with some probability for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - total += 1; if (sample_rng_.DoBernoulli(probability_of_reject)) { if (inductive_subgraph) { // only take if node is training node or a node not classified @@ -1100,22 +1092,15 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, bitset_sample_flag_.set( partitioned_graph_->getEdgeDst(edge_iter)); } - // bitset_sampled_degrees_.set(*src_iter); + bitset_sampled_degrees_.set(*src_iter); // degree increment - // sampled_out_degrees_[sample_layer_num][*src_iter]++; - sampled += 1; + sampled_out_degrees_[sample_layer_num][*src_iter]++; } } - // total_nodes += 1; } }, galois::steal(), galois::loopname("NeighborhoodSample")); - // galois::gInfo(host_prefix(), "sampled nodes for layer ", sample_layer_num, - // " is ", total_nodes.reduce()); - // galois::gInfo("Num sampled edges for layer ", sample_layer_num, " is ", - // sampled.reduce(), " out of ", total.reduce()); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( @@ -1162,22 +1147,21 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; - // gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - //// first, sync the degres of the sampled edges across all hosts - // if (use_timer_) { - // sync_substrate_ - // ->sync( - // "SubgraphDegree"); - //} else { - // sync_substrate_ - // ->sync( - // "Ignore"); - //} + use_subgraph_ = false; + use_subgraph_view_ = false; + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + // first, sync the degres of the sampled edges across all hosts + // read any because destinations need it to for reverse phase + if (use_timer_) { + sync_substrate_ + ->sync( + "SubgraphDegree"); + } else { + sync_substrate_ + ->sync( + "Ignore"); + } size_t num_subgraph_nodes; if (!use_view) { num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 5aa8e24a77..a342cfbe14 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -499,8 +499,9 @@ void galois::SAGELayer::AggregateAllCPU( if (layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch) { // XXX + // galois::gDebug("In here"); if (IsSampledLayer()) { - if (!graph_.IsEdgeSampled(e, layer_number_)) { + if (!graph_.IsEdgeSampled(e, graph_user_layer_number_)) { continue; } } @@ -511,6 +512,7 @@ void galois::SAGELayer::AggregateAllCPU( if (!config_.disable_normalization) { GNNFloat norm_scale = source_norm; + assert(norm_scale != 0); galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], @@ -532,7 +534,7 @@ void galois::SAGELayer::AggregateAllCPU( layer_phase_ == GNNPhase::kBatch) { // XXX if (IsSampledLayer()) { - if (!graph_.IsInEdgeSampled(e, layer_number_)) { + if (!graph_.IsInEdgeSampled(e, graph_user_layer_number_)) { continue; } } @@ -551,6 +553,8 @@ void galois::SAGELayer::AggregateAllCPU( GNNFloat norm_scale = graph_.GetDegreeNorm(dst, graph_user_layer_number_); + assert(norm_scale != 0); + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, diff --git 
a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index ade63b9d1e..bf9a376092 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -19,7 +19,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( #endif galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned i) { if (IsSampledLayer()) { if ((layer_phase_ == GNNPhase::kTrain || From 2fb3dac5be54bdcce223803a918bdbf2e6c18608 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 21 May 2021 23:33:01 -0500 Subject: [PATCH 553/660] GNN subgraph "choose_all" fix Choose all needs to be turned on if sample all edges use used (and vice versa). Letting init decide if this var is to be turned on was a bad idea and this commit fixes that by turning it on after the appropriate call. --- libgnn/src/graphs/GNNGraph.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 0cb05e9a4f..8af26898a1 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1044,6 +1044,7 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, } }); + EnableSubgraphChooseAll(); return local_sample_count.reduce(); } @@ -1138,6 +1139,7 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } }); + DisableSubgraphChooseAll(); return local_sample_count.reduce(); } From aba12dc555d17ee9c3f49b081ca493cec98f753e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 22 May 2021 17:17:54 -0500 Subject: [PATCH 554/660] Dynamic resizing of intermediate/output matrices Subgraphs don't need full space allocation. This commit is a first pass at dynamically resizing these matrices to only use the amount of space required. Will need cleaning. --- libgnn/include/galois/GraphNeuralNetwork.h | 3 + libgnn/include/galois/graphs/GNNGraph.h | 9 ++ libgnn/include/galois/layers/GNNLayer.h | 21 +++- libgnn/include/galois/layers/SAGELayer.h | 20 ++-- libgnn/include/galois/layers/SoftmaxLayer.h | 19 ++++ libgnn/src/GraphNeuralNetwork.cpp | 72 +++++++++++--- libgnn/src/graphs/GNNGraph.cpp | 4 +- libgnn/src/graphs/GNNSubgraph.cpp | 4 +- libgnn/src/layers/GNNLayer.cpp | 55 ++++++++-- libgnn/src/layers/SAGELayer.cpp | 105 +++++++++++++++++++- libgnn/src/layers/SoftmaxLayer.cpp | 10 +- 11 files changed, 277 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index fc200e7baa..3b5b268daa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -199,6 +199,9 @@ class GraphNeuralNetwork { //! most literature void GradientPropagation(); + //! Call whenever resize occurs to correct reuse of pointers for layers + void CorrectBackwardLinks(); + private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index c40a3e20de..775fd2af3a 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -585,6 +585,15 @@ class GNNGraph { assert(node_id < size()); return partitioned_graph_->getData(node_id); } + bool IsInSampledGraphSubgraph(size_t node_id) const { + // TODO(loc) GPU + assert(node_id < size()); + if (use_subgraph_) { + return partitioned_graph_->getData(ConvertToLID(node_id)); + } else { + return partitioned_graph_->getData(node_id); + } + } //! 
Calculate norm factor considering the entire graph void CalculateFullNormFactor(); diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 45a9c08893..e61d398a64 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -100,14 +100,26 @@ class GNNLayer { virtual void ResizeRows(size_t new_row_count) { layer_dimensions_.input_rows = new_row_count; layer_dimensions_.output_rows = new_row_count; - // TODO(loc) output matrix should be resized if space becomes an issue, - // else just use first S rows (S = subgraph size) + ResizeOutputMatrix(new_row_count); } + virtual void ResizeInputOutputRows(size_t input_row, size_t output_row) { layer_dimensions_.input_rows = input_row; layer_dimensions_.output_rows = output_row; - // TODO(loc) output matrix should be resized if space becomes an issue, - // else just use first S rows (S = subgraph size) + ResizeOutputMatrix(output_row); + } + + void ResizeOutputMatrix(size_t new_output_row); + + void UpdateBackwardOutput(PointerWithSize* backward_output_matrix) { + // XXX(hochan) gpu + if (layer_number_ != 0) { + assert(backward_output_matrix->size() >= + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_FATAL("Layer 0 should not need to update backward output"); + } + p_backward_output_matrix_ = *backward_output_matrix; } GNNPhase layer_phase() { return layer_phase_; } @@ -348,7 +360,6 @@ class GNNLayer { } #endif - //! Mask a input size'd matrix's rows that correspond to mirrors void MaskInputNonMasters(PointerWithSize* input) { MaskInputNonMasters(input, std::numeric_limits::max()); } diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index e127b78e73..581115a00e 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -44,14 +44,6 @@ class SAGELayer : public GNNLayer { : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig(), SAGELayerConfig()) {} - void ResizeRows(size_t new_row_count) { - galois::gDebug("Resizing SAGE layer for sampled graph from ", - layer_dimensions_.input_rows); - GNNLayer::ResizeRows(new_row_count); - galois::gDebug("To ", layer_dimensions_.input_rows); - // TODO(loc) resize input matrices if space is reason for doing this - } - void InitSelfWeightsTo1() { #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -136,6 +128,18 @@ class SAGELayer : public GNNLayer { //! Sync second set of weight gradients void WeightGradientSyncSum2(); + void ResizeRows(size_t new_row_count) { + GNNLayer::ResizeRows(new_row_count); + ResizeIntermediates(new_row_count, new_row_count); + } + + void ResizeInputOutputRows(size_t input_row, size_t output_row) { + GNNLayer::ResizeInputOutputRows(input_row, output_row); + ResizeIntermediates(input_row, output_row); + } + + void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows); + //! SAGE config params SAGELayerConfig sage_config_; //! 
Need own optimizer for the 2nd weight matrix diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 433d055f83..3878b29685 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -47,6 +47,25 @@ class SoftmaxLayer : public GNNLayer { BackwardPhase(PointerWithSize in_out, PointerWithSize* input_gradient) final; + void ResizeRows(size_t new_row_count) { + layer_dimensions_.input_rows = new_row_count; + layer_dimensions_.output_rows = new_row_count; + // no output resize + if (input_loss_.size() < new_row_count) { + input_loss_.resize(new_row_count * 1.02); + } + } + + void ResizeInputOutputRows(size_t in, size_t out) { + assert(in == out); + layer_dimensions_.input_rows = in; + layer_dimensions_.output_rows = out; + // no output resize + if (input_loss_.size() < in) { + input_loss_.resize(in * 1.02); + } + } + private: #ifdef GALOIS_ENABLE_GPU SoftmaxLayerGPU gpu_object_; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 01974baaca..b29ec3af88 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -43,12 +43,24 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( prev_layer_columns = graph_->node_feature_length(); } + // max dims GNNLayerDimensions layer_dims = {.input_rows = max_rows, .input_columns = prev_layer_columns, .output_columns = config_.intermediate_layer_size(i), .output_rows = max_rows}; + // test minibatch size: if it's not enabled, then currently the full + // graph is used (should really only subgraph the test nodes, though; + // that's a TODO) + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + galois::gInfo("Not allocating rows"); + // set to 0 here to make it allocate nothing + layer_dims.input_rows = 0; + layer_dims.output_rows = 0; + } + switch (layer_type) { case GNNLayerType::kGraphConvolutional: gnn_layers_.push_back(std::move(std::make_unique( @@ -126,6 +138,12 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( .output_columns = config_.output_layer_size(), .output_rows = max_rows}; + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + output_dims.input_rows = 0; + output_dims.output_rows = 0; + } + switch (config_.output_layer_type()) { case (GNNOutputLayerType::kSoftmax): gnn_layers_.push_back(std::move(std::make_unique( @@ -199,6 +217,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); graph_->EnableSubgraphChooseAll(); + CorrectBackwardLinks(); const PointerWithSize batch_pred = DoInference(); std::pair correct_total = @@ -260,6 +279,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); } galois::StatTimer epoch_timer("TrainingTime", kRegionName); @@ -284,6 +304,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { l_count++; } } + CorrectBackwardLinks(); } // beginning of epoch sampling @@ -322,7 +343,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); - + CorrectBackwardLinks(); mb_timer.stop(); } @@ -344,9 +365,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); mb_timer.start(); - const std::string 
btime_name("Epoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num)); - galois::StatTimer batch_timer(btime_name.c_str(), kRegionName); + galois::Timer batch_timer; batch_timer.start(); work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); @@ -393,6 +412,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // resize layer matrices graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -423,8 +443,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + // TODO nuclear resize (*layer)->ResizeRows(graph_->size()); } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); graph_->EnableSubgraphChooseAll(); const PointerWithSize test_pred = DoInference(); @@ -443,6 +465,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::runtime::reportStat_Single(kRegionName, test_name_acc, test_acc); } + // report the training time elapsed at this point in time galois::runtime::reportStat_Single( kRegionName, @@ -484,14 +507,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); // disable subgraph graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + } + + if (do_validate) { + // XXX induced subgraph here for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { + // nuclear resize (*layer)->ResizeRows(graph_->size()); } - graph_->EnableSubgraphChooseAll(); - } - if (do_validate) { + CorrectBackwardLinks(); validation_timer.start(); SetLayerPhases(galois::GNNPhase::kValidate); const PointerWithSize val_pred = DoInference(); @@ -512,6 +539,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { float test_acc; if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize test_pred = DoInference(); epoch_test_timer.stop(); @@ -566,6 +599,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectBackwardLinks(); } EnableTimers(); @@ -578,19 +612,21 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); // disable subgraph graph_->DisableSubgraph(); - // TODO only do this when necessary - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - (*layer)->ResizeRows(graph_->size()); - } + graph_->EnableSubgraphChooseAll(); // check test accuracy - // XXX test batching galois::StatTimer test_timer("FinalTestRun", kRegionName); float global_accuracy; test_timer.start(); if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); SetLayerPhases(galois::GNNPhase::kTest); const PointerWithSize predictions = DoInference(); global_accuracy = GetGlobalAccuracy(predictions); @@ -673,3 +709,15 @@ void galois::GraphNeuralNetwork::GradientPropagation() { gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } } + +void galois::GraphNeuralNetwork::CorrectBackwardLinks() { + // layer chain pointer + PointerWithSize prev_output_layer(nullptr, 0); 
+ for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { + // first layer is nullptr so can be ignored + if (layer_num != 0) { + gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); + } + prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); + } +} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8af26898a1..b7cb9596e0 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -697,7 +697,6 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( PointerWithSize predictions, GNNPhase phase, bool) { // check owned nodes' accuracy - assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); @@ -722,7 +721,7 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( } }, // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); + galois::steal()); size_t global_correct = num_correct_.reduce(); size_t global_checked = total_checked_.reduce(); @@ -736,7 +735,6 @@ float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( std::pair galois::graphs::GNNGraph::GetBatchAccuracy( PointerWithSize predictions) { // check owned nodes' accuracy - assert((num_label_classes_ * size()) == predictions.size()); num_correct_.reset(); total_checked_.reset(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index dcb5c0f2db..2493319904 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -144,7 +144,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", // out_degrees, " in ", in_degrees); }, - galois::steal()); + galois::loopname("DegreeCountingDoAll"), galois::steal()); TimerStop(&timer); } @@ -222,7 +222,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( assert(out_location == local_subgraph_out_degrees_[subgraph_id]); assert(in_location == local_subgraph_in_degrees_[subgraph_id]); }, - galois::steal()); + galois::loopname("EdgeCreationDoAll"), galois::steal()); TimerStop(&timer); } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 07e839cb48..4c828dbb19 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -103,6 +103,31 @@ galois::GNNLayer::GNNLayer(size_t layer_num, #endif } +void galois::GNNLayer::ResizeOutputMatrix(size_t new_output_row) { + size_t num_output_elements = + new_output_row * layer_dimensions_.output_columns; + + if (!config_.disable_output && + (forward_output_matrix_.size() < num_output_elements)) { + galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_, + ", forward output matrix to ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + // resize with a bit of a buffer to prevent possible future resizes + size_t buffer_size = (num_output_elements * 0.02); + forward_output_matrix_.resize(num_output_elements + buffer_size, 0); + } + + // XXX(hochan) GPU end +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + // reinitialize the PointerWithSize wrappers + p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif +} + void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) { float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + layer_dimensions_.input_columns); @@ 
-272,20 +297,23 @@ void galois::GNNLayer::Activation() { base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); } else { #endif - if (activation_memo_.size() == 0) { - activation_memo_.resize(forward_output_matrix_.size()); + if (activation_memo_.size() != p_forward_output_matrix_.size()) { + activation_memo_.resize(p_forward_output_matrix_.size()); } activation_memo_.reset(); + assert(activation_memo_.size() == p_forward_output_matrix_.size()); + assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= + p_forward_output_matrix_.size()); galois::do_all(galois::iterate(static_cast(0), layer_dimensions_.output_rows * layer_dimensions_.output_columns), [&](size_t i) { - if (forward_output_matrix_[i] > 0.0) { + if (p_forward_output_matrix_[i] > 0.0) { // do nothing, keep value; set the memo though activation_memo_.set(i); } else { - forward_output_matrix_[i] = 0; + p_forward_output_matrix_[i] = 0; } }); #ifdef GALOIS_ENABLE_GPU @@ -305,6 +333,8 @@ void galois::GNNLayer::ActivationDerivative( gradient->size()); } else { #endif + assert(gradient->size() >= + layer_dimensions_.output_rows * layer_dimensions_.output_columns); // TODO only does relu at the moment; should check user specified activation // and act accordingly // keep gradient if the original output was greater than 0 @@ -362,8 +392,6 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); - size_t row_index = layer_dimensions_.input_columns; - assert((row_index * layer_dimensions_.input_rows) <= input->size()); if (start_node > max_rows) { start_node = max_rows; @@ -372,6 +400,10 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, end_node = max_rows; } + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); @@ -396,7 +428,6 @@ void galois::GNNLayer::MaskGradientNonMasters( assert(*(graph_.begin_owned()) == 0); size_t start_node = *(graph_.end_owned()); size_t end_node = graph_.active_size(); - size_t row_index = layer_dimensions_.output_columns; if (start_node > max_rows) { start_node = max_rows; @@ -405,6 +436,16 @@ void galois::GNNLayer::MaskGradientNonMasters( end_node = max_rows; } + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradient->size()); + assert(end_node * row_index <= gradient->size()); + #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index a342cfbe14..70d85b853a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -141,6 +141,86 @@ galois::SAGELayer::SAGELayer(size_t layer_num, GALOIS_LOG_VERBOSE("SAGE layer initialized"); } +void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, + size_t new_output_rows) { + size_t num_in_temp_elements = + new_output_rows * layer_dimensions_.input_columns; + galois::gDebug("Layer num ", layer_number_, " ", in_temp_1_.size(), " and ", + num_in_temp_elements, " ", 
layer_dimensions_.input_columns, + " ", layer_dimensions_.output_columns); + + // if in temp is smaller than out temp, or if dropout exists + if (!config_.disable_dropout || config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + galois::gDebug("in first if"); + if (in_temp_1_.size() < num_in_temp_elements) { + galois::gDebug("in the resize"); + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE input temp var 1 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + } + } + + // only on in dropout case + if in temp is smaller than out temp + if (!config_.disable_dropout && + (config_.disable_aggregate_after_update || + layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { + if (in_temp_2_.size() < num_in_temp_elements) { + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE input temp var 2 ", num_in_temp_elements, " (", + FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_2_ = PointerWithSize(in_temp_2_); + } + } + + size_t num_output_temp_elements = + new_input_rows * layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!config_.disable_aggregate_after_update && + layer_dimensions_.input_columns > layer_dimensions_.output_columns) { + if (out_temp_.size() < num_output_temp_elements) { + galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, + ", SAGE output temp var ", num_output_temp_elements, " (", + FloatElementsToGB(num_output_temp_elements), " GB)"); + size_t buffer_size = (num_output_temp_elements * 0.02); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); + } else { +#endif + out_temp_.resize(num_output_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + p_out_temp_ = PointerWithSize(out_temp_); + } + } +} + void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); @@ -174,6 +254,10 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { + galois::gDebug( + "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", + layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -200,15 +284,28 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( // flip aggregate/update if dimensions favor it (do less work) if (config_.disable_aggregate_after_update || 
layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { + if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { + assert(p_in_temp_2_.size() >= + layer_dimensions_.output_rows * layer_dimensions_.input_columns); + } else { + assert(p_in_temp_1_.size() >= + layer_dimensions_.output_rows * layer_dimensions_.input_columns); + } // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); + assert(p_forward_output_matrix_.size() >= + layer_dimensions_.output_columns * layer_dimensions_.output_columns); UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { + assert(p_out_temp_.size() >= + layer_dimensions_.input_rows * layer_dimensions_.output_columns); // update to aggregate // FW UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) + assert(p_forward_output_matrix_.size() >= + layer_dimensions_.output_columns * layer_dimensions_.output_columns); AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); @@ -595,12 +692,12 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, galois::CBlasSGEMM( CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + node_embeddings, p_layer_weights_.data(), output); } else { galois::CBlasSGEMM( CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_.data(), output); + node_embeddings, p_layer_weights_.data(), output); } #ifdef GALOIS_ENABLE_GPU } @@ -653,12 +750,12 @@ void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, galois::CBlasSGEMM( CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, - gradients, layer_weights_.data(), output); + gradients, p_layer_weights_.data(), output); } else { galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, layer_dimensions_.output_columns, layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); + p_layer_weights_.data(), output); } #ifdef GALOIS_ENABLE_GPU } diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index bf9a376092..eb6e900413 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -24,7 +24,7 @@ galois::SoftmaxLayer::ForwardPhaseCPU( if (IsSampledLayer()) { if ((layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch) && - !graph_.IsInSampledGraph(i)) { + !graph_.IsInSampledGraphSubgraph(i)) { // XXX VectorZero(feature_length, &p_backward_output_matrix_[i * feature_length]); @@ -60,7 +60,8 @@ galois::SoftmaxLayer::ForwardPhaseCPU( }, // TODO chunk size? 
// steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("SoftmaxForward")); + // galois::steal(), galois::loopname("SoftmaxForward")); + galois::steal()); #ifndef NDEBUG GNNFloat reduced_loss = loss_accum.reduce(); size_t t = handled.reduce(); @@ -93,12 +94,13 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), + // galois::iterate(graph_.begin(), graph_.end()), + galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(node)) + !graph_.IsInSampledGraphSubgraph(node)) return; } From 3e22e29d87c2f545b5bd9afa9b8035ed06017c82 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 1 Jun 2021 17:43:17 -0500 Subject: [PATCH 555/660] QoL layers/size CLI specification User no longer has to specify layer name for every layer specified; just replicate one layer type + size. This makes it so you can't have heterogeneous layers/sizes, but that setting isn't explored much anyways. Also reports more CLI options in stats file. --- lonestar/libgnnbench/include/GNNBench/Start.h | 7 + lonestar/libgnnbench/src/Input.cpp | 169 +++++++++++------- lonestar/libgnnbench/src/Start.cpp | 22 ++- 3 files changed, 129 insertions(+), 69 deletions(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 75ec167f78..48507df80e 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -13,6 +13,11 @@ extern llvm::cl::opt num_threads; extern llvm::cl::opt num_epochs; +extern llvm::cl::opt layer_size; +extern llvm::cl::opt cl_layer_type; +extern llvm::cl::opt train_minibatch_size; +extern llvm::cl::opt test_minibatch_size; +extern llvm::cl::opt do_graph_sampling; #ifdef GALOIS_ENABLE_GPU std::string personality_str(DevicePersonality p); @@ -24,6 +29,8 @@ void heteroSetup(); }; #endif +const char* GNNLayerToString(galois::GNNLayerType s); + //////////////////////////////////////////////////////////////////////////////// // Init functions //////////////////////////////////////////////////////////////////////////////// diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 5facfa95c5..3ebee8adea 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -32,18 +32,26 @@ llvm::cl::opt num_layers( "Number of intermediate layers in the neural network (default 2))"), cll::init(2)); -llvm::cl::list layer_sizes( - "layerSizes", +// llvm::cl::list layer_sizes( +// "layerSizes", +// cll::desc( +// "Comma separated list of numbers specifying " +// "intermediate layer sizes (does not include output); default sizes are +// " "16 until last layer which is the size of the # of labels"), +// cll::CommaSeparated); + +llvm::cl::opt layer_size( + "layerSize", cll::desc( - "Comma separated list of numbers specifying " + "Number specifying " "intermediate layer sizes (does not include output); default sizes are " "16 until last layer which is the size of the # of labels"), - cll::CommaSeparated); + cll::init(16)); -llvm::cl::list cl_layer_types( - "layerTypes", - cll::desc("Comma separated list of layer types specifying " - "intermediate layers (does not include output)"), +llvm::cl::opt cl_layer_type( + "layerType", + cll::desc("Layer type 
specifying " + "intermediate layers (does not include output); default SAGE"), cll::values( clEnumValN(galois::GNNLayerType::kGraphConvolutional, "gcn", "Graph Convolutional Layer (default)"), @@ -51,7 +59,7 @@ llvm::cl::list cl_layer_types( "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), - cll::CommaSeparated); + cll::init(galois::GNNLayerType::kSAGE)); llvm::cl::list cl_fan_out_vector( "samplingFanOut", @@ -169,19 +177,22 @@ const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { //! Initializes the vector of layer sizes from command line args + graph std::vector CreateLayerTypesVector() { std::vector layer_types; - if (!cl_layer_types.size()) { - // default is all GCN layers - for (size_t i = 0; i < num_layers; i++) { - layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); - } - } else { - GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, - "Number layer types should be {} not {}", num_layers, - cl_layer_types.size()); - for (size_t i = 0; i < num_layers; i++) { - layer_types.emplace_back(cl_layer_types[i]); - } + for (size_t i = 0; i < num_layers; i++) { + layer_types.emplace_back(cl_layer_type); } + // if (!cl_layer_types.size()) { + // // default is all GCN layers + // for (size_t i = 0; i < num_layers; i++) { + // layer_types.emplace_back(galois::GNNLayerType::kGraphConvolutional); + // } + //} else { + // GALOIS_LOG_VASSERT(cl_layer_types.size() == num_layers, + // "Number layer types should be {} not {}", num_layers, + // cl_layer_types.size()); + // for (size_t i = 0; i < num_layers; i++) { + // layer_types.emplace_back(cl_layer_types[i]); + // } + //} return layer_types; } @@ -190,34 +201,41 @@ std::vector CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { // set layer sizes for intermdiate and output layers std::vector layer_sizes_vector; - if (layer_sizes.size()) { - GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - for (size_t i = 0; i < num_layers; i++) { - layer_sizes_vector.emplace_back(layer_sizes[i]); - } - // verify user satisfies last intermediate layer needing to have same size - // as # label classes - if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { - galois::gWarn( - "Size of last layer (", layer_sizes_vector.back(), - ") is not equal to # label classes: forcefully changing it to ", - gnn_graph->GetNumLabelClasses()); - layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); - layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); - } - GALOIS_LOG_ASSERT(layer_sizes_vector.back() == - gnn_graph->GetNumLabelClasses()); - } else { - // default 16 for everything until last 2 - for (size_t i = 0; i < num_layers - 1; i++) { - layer_sizes_vector.emplace_back(16); - } - // last 2 sizes must be equivalent to # label classes; this is the last - // intermediate layer - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // for (size_t i = 0; i < num_layers; i++) { + // layer_sizes_vector.emplace_back(layer_sizes[i]); + // } + // // verify user satisfies last intermediate layer needing to have same size + // // as # label classes + // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + // galois::gWarn( + // "Size of last layer (", layer_sizes_vector.back(), + // ") is not equal to # label classes: forcefully changing it to ", + // 
gnn_graph->GetNumLabelClasses()); + // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + // } + + // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + // gnn_graph->GetNumLabelClasses()); + //} else { + // // default 16 for everything until last 2 + // for (size_t i = 0; i < num_layers - 1; i++) { + // layer_sizes_vector.emplace_back(16); + // } + // // last 2 sizes must be equivalent to # label classes; this is the last + // // intermediate layer + // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + //} + + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(layer_size); } - + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); // TODO // for now only softmax layer which dictates the output size of the last // intermediate layer + size of the output layer @@ -245,29 +263,44 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { // optimizer sizes are based on intermediate layer sizes, input feats, and // # label classes - if (layer_sizes.size()) { - GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); - // assumption here is that if it reached this point then layer sizes were - // already sanity checked previously (esp. last layer) - for (size_t i = 1; i < num_layers; i++) { - opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); - } + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // // assumption here is that if it reached this point then layer sizes were + // // already sanity checked previously (esp. 
last layer) + // for (size_t i = 1; i < num_layers; i++) { + // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + // } + //} else { + // // everything is size 16 until last + // if (num_layers == 1) { + // // single layer requires a bit of special handling + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * + // gnn_graph->GetNumLabelClasses()); + // } else { + // // first + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + // for (size_t i = 1; i < num_layers - 1; i++) { + // opt_sizes.emplace_back(16 * 16); + // } + // // last + // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // } + //} + + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); } else { - // everything is size 16 until last - if (num_layers == 1) { - // single layer requires a bit of special handling - opt_sizes.emplace_back(gnn_graph->node_feature_length() * - gnn_graph->GetNumLabelClasses()); - } else { - // first - opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); - for (size_t i = 1; i < num_layers - 1; i++) { - opt_sizes.emplace_back(16 * 16); - } - // last - opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(layer_size * layer_size); } + // last + opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); } GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index aa059c60f6..9a7e747744 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -45,6 +45,18 @@ static void PrintVersion(llvm::raw_ostream& out) { out.flush(); } +const char* GNNLayerToString(galois::GNNLayerType s) { + switch (s) { + case galois::GNNLayerType::kSAGE: + return "sage"; + case galois::GNNLayerType::kGraphConvolutional: + return "gcn"; + default: + GALOIS_LOG_FATAL("Invalid gnn layer"); + return ""; + } +} + //////////////////////////////////////////////////////////////////////////////// void GNNBenchStart(int argc, char** argv, const char* app) { @@ -95,7 +107,15 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "Input", input_name); galois::runtime::reportParam("GNNBench", "PartitionScheme", GNNPartitionToString(partition_scheme)); - // XXX report the rest of the command line options + galois::runtime::reportParam("GNNBench", "HiddenLayerSize", layer_size); + galois::runtime::reportParam("GNNBench", "LayerType", + GNNLayerToString(cl_layer_type)); + galois::runtime::reportParam("GNNBench", "TrainingMinibatchSize", + train_minibatch_size); + galois::runtime::reportParam("GNNBench", "TestingMinibatchSize", + test_minibatch_size); + galois::runtime::reportParam("GNNBench", "IsGraphSampled", + do_graph_sampling); } char name[256]; From d4a8a362cdce1a69d0db7ebeb0c77015a3f3b6f9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 3 Jun 2021 18:35:18 -0500 Subject: [PATCH 556/660] GSTL vs std memory leak test Test written to isolate effects of memory leaks in GNN gluon communication. Basically, gstl vector must be allocated/freed by same parallel operator or memory will leak. 
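A minimal sketch of the safe pattern (not part of this test; Use() is a
hypothetical consumer): allocate and destroy the gstl vector inside the same
parallel operator, so the blocks are freed by the thread that allocated them:

    galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t i) {
      galois::gstl::Vector<int> tmp(16); // allocated by this thread
      Use(tmp);                          // hypothetical consumer
      // tmp destroyed here by the same thread that allocated it
    });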
--- libgnn/test/CMakeLists.txt | 3 +++ libgnn/test/gstl_test.cpp | 42 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) create mode 100644 libgnn/test/gstl_test.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index b9c1eea043..a1ea769105 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -25,6 +25,9 @@ add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) +add_executable(gstl_test gstl_test.cpp) +target_link_libraries(gstl_test galois_shmem) + ################################################################################ add_executable(gnngraph-test gnngraph-test.cpp) diff --git a/libgnn/test/gstl_test.cpp b/libgnn/test/gstl_test.cpp new file mode 100644 index 0000000000..ef89d96a8b --- /dev/null +++ b/libgnn/test/gstl_test.cpp @@ -0,0 +1,42 @@ +#include "galois/Galois.h" +#include "galois/gstl.h" + +int main(int argc, char* argv[]) { + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); + + // std vector has no leak issues + using VecType = galois::gstl::Vector; + // using VecType = std::vector; + + for (size_t i = 0; i < 1000000; i++) { + if (i % 10000 == 0) + galois::gPrint("Current is ", i, "\n"); + size_t how_many = 100000; + + std::vector carrier; + carrier.resize(how_many); + + galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t iter) { + // allocate some vector then do something with it + VecType dummy_vec(16); + for (unsigned j = 0; j < dummy_vec.size(); j++) { + dummy_vec[j] = j; + } + carrier[iter].swap(dummy_vec); + }); + + galois::do_all(galois::iterate(size_t{0}, how_many), [&](size_t iter) { + VecType to_swap; + carrier[iter].swap(to_swap); + }); + } + + return 0; +} From 05888c4be7c52c98227dd7f30e27ff5095a701b4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Jun 2021 20:03:20 -0500 Subject: [PATCH 557/660] wip 2dvec --- libgalois/include/galois/TwoDVector.h | 41 ++ .../include/galois/graphs/GluonSubstrate.h | 520 ++++++++++++++++-- .../galois/graphs/DegreeSyncStructures.h | 18 + .../graphs/GraphAggregationSyncStructures.h | 55 ++ 4 files changed, 597 insertions(+), 37 deletions(-) create mode 100644 libgalois/include/galois/TwoDVector.h diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h new file mode 100644 index 0000000000..ab3a7ff882 --- /dev/null +++ b/libgalois/include/galois/TwoDVector.h @@ -0,0 +1,41 @@ +#pragma once + +#include "gstl.h" +#include "PODResizeableArray.h" + +namespace galois { + +template +class TwoDVector { +public: + using value_type = T; + + void SetVecSize(size_t fixed_vector_size) { + fixed_vector_size_ = fixed_vector_size; + } + + //! Call this before using this else bad things will happen: initializes + //! 
the memory + fixed size metadata + void Create(size_t num_elements) { + num_elements_ = num_elements; + underlying_memory_.resize(num_elements_ * fixed_vector_size_); + } + void SetVector(size_t index, const galois::gstl::Vector& to_copy) { + // TODO(loc) for generality should work with any vector type, but for + // now just use gstl + assert(index < num_elements_); + assert(to_copy == fixed_vector_size_); + size_t array_index = index * fixed_vector_size_; + std::memcpy((void*)(&(underlying_memory_[array_index])), + (void*)to_copy.data(), + sizeof(T) * fixed_vector_size_); + } + + const PODResizeableArray& data() { return underlying_memory_; } +private: + size_t num_elements_{0}; + size_t fixed_vector_size_{0}; + PODResizeableArray underlying_memory_; +}; + +} diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index f102e3a4a1..6ccf35edec 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -29,6 +29,7 @@ #include #include +#include "galois/TwoDVector.h" #include "galois/runtime/GlobalObj.h" #include "galois/runtime/DistStats.h" #include "galois/runtime/SyncStructures.h" @@ -716,9 +717,38 @@ class GluonSubstrate : public galois::runtime::GlobalObject { struct is_vector_of_vec, A>> : public std::true_type {}; + template + struct IsVector : public std::false_type {}; + template + struct IsVector> : public std::true_type {}; + + template + struct Is2DVector : public std::false_type {}; + template + struct Is2DVector> : public std::true_type {}; + //////////////////////////////////////////////////////////////////////////////// // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// + + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async, + typename std::enable_if::value>::type* = nullptr> + void getSendBuffer(std::string loopName, unsigned x, + galois::runtime::SendBuffer& b, size_t elem_size) { + auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + + SyncExtract2D( + loopName, x, sharedNodes[x], b, elem_size); + + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string statSendBytes_str(syncTypeStr + "SendBytes_" + + get_run_identifier(loopName)); + + galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); + } + /** * Get data that is going to be sent for synchronization and returns * it in a send buffer. @@ -735,27 +765,17 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, - typename std::enable_if::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? 
mirrorNodes : masterNodes; if (BitsetFnTy::is_valid()) { - if (is_vector_of_vec::value) { - syncExtractFloatVecHack( - loopName, x, sharedNodes[x], b, elem_size); - } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - } + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); } else { - if (is_vector_of_vec::value) { - // TODO (loc) - GALOIS_LOG_FATAL("implement me"); - } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - } + syncExtract( + loopName, x, sharedNodes[x], b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -764,23 +784,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); } - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::type* = nullptr> - void getSendBuffer(std::string loopName, unsigned x, - galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; - - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); - - std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; - std::string statSendBytes_str(syncTypeStr + "SendBytesVector_" + - get_run_identifier(loopName)); - - galois::runtime::reportStat_Tsum(RNAME, statSendBytes_str, b.size()); - } /** * Given data to serialize in val_vec, serialize it into the send buffer @@ -883,6 +886,47 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // Calls data on the TwoDVector + template + void + SerializeMessage2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + if (!async) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm, two_d_vec.data()); + Tserialize.stop(); + } else { // onlyData + Tserialize.start(); + gSerialize(b, data_mode, two_d_vec.data()); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -1239,6 +1283,47 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + template + void ExtractSubset2D(const std::string& loopName, + const std::vector& indices, size_t size, + const galois::PODResizeableArray& offsets, + VecTy& two_d_vector, size_t start = 0) { + if (parallelize) { + std::string syncTypeStr = + (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "ExtractVal_" + loopName); + + galois::do_all( + galois::iterate(start, start + size), + [&](unsigned int n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + size_t lid = indices[offset]; + two_d_vector.SetVector(n - start, extractWrapper(lid)); + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + } else { // non-parallel version + for (unsigned n = start; n < start + size; ++n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + + size_t lid = indices[offset]; + two_d_vector.SetVector(n - start, extractWrapper(lid)); + } + } + } + + /** * Based on provided arguments, extracts the data that we are interested * in sending into val_vec. Same as above, except it has the vecIndex @@ -1455,6 +1540,23 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // 2D + template + void SetWrapper2D(size_t lid, typename FnTy::ValTy::value_type* pointer_to_data, + galois::DynamicBitSet& bit_set_compute) { + if (syncType == syncReduce) { + if (FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data)) { + if (bit_set_compute.size() != 0) + bit_set_compute.set(lid); + } + } else { + if (async) + FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data); + else + FnTy::setVal(lid, userGraph.getData(lid), pointer_to_data); + } + } + /** * VECTOR VARIANT. * @@ -1554,6 +1656,51 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // 2D; vecty is a PODResize + template + void SetSubset2D(const std::string& loopName, const IndicesVecTy& indices, + size_t size, + const galois::PODResizeableArray& offsets, + VecTy& val_vec, galois::DynamicBitSet& bit_set_compute, + size_t start = 0) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "SetVal_" + + get_run_identifier(loopName)); + + if (parallelize) { + galois::do_all( + galois::iterate(start, start + size), + [&](unsigned int n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + auto lid = indices[offset]; + SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + } else { + for (unsigned int n = start; n < start + size; ++n) { + unsigned int offset; + if (identity_offsets) + offset = n; + else + offset = offsets[n]; + auto lid = indices[offset]; + SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); + } + } + } + + /** * VECTOR BITSET VARIANT. * @@ -2140,6 +2287,128 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async> + void SyncExtract2D(std::string loopName, unsigned from_id, + std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { + uint32_t num = indices.size() * elem_size; + galois::DynamicBitSet& bit_set_comm = syncBitset; + static VecTy two_d_array; + two_d_array.SetVecSize(SyncFnTy::FeatVecSize()); + galois::PODResizeableArray& offsets = syncOffsets; + + ////////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string extract_timer_str(syncTypeStr + "Extract_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textract(extract_timer_str.c_str(), + RNAME); + std::string extract_alloc_timer_str(syncTypeStr + "ExtractAlloc_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractalloc( + extract_alloc_timer_str.c_str(), RNAME); + std::string extract_batch_timer_str(syncTypeStr + "ExtractBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Textractbatch( + extract_batch_timer_str.c_str(), RNAME); + ////////////////////////////////////////////////////////////////////////////// + + DataCommMode data_mode; + Textract.start(); + + if (num > 0) { + size_t bit_set_count = 0; + Textractalloc.start(); + b.reserve(getMaxSendBufferSize(num)); + Textractalloc.stop(); + + Textractbatch.start(); + bool batch_succeeded = extractBatchWrapper( + from_id, b, bit_set_count, data_mode); + Textractbatch.stop(); + + // GPUs have a batch function they can use; CPUs do not; therefore, + // CPUS always enter this if block + if (!batch_succeeded) { + Textractalloc.start(); + b.resize(0); + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + bit_set_comm.resize(num); + offsets.resize(num); + two_d_array.Create(num); + Textractalloc.stop(); + const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + getBitsetAndOffsets( + loopName, indices, bit_set_compute, bit_set_comm, offsets, + bit_set_count, data_mode); + + if (data_mode == onlyData) { + bit_set_count = indices.size(); + ExtractSubset2D( + loopName, indices, bit_set_count, offsets, two_d_array); + } else if (data_mode != + noData) { // bitsetData or offsetsData or gidsData + ExtractSubset2D( + loopName, indices, bit_set_count, offsets, two_d_array); + } + + SerializeMessage2D( + loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, + two_d_array, b); + } else { + // TODO(loc/hochan) GPU + //if (data_mode == noData) { + // b.resize(0); + // if (!async) { + // gSerialize(b, data_mode); + // } + //} else if (data_mode == gidsData) { + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + // sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else if (data_mode == offsetsData) { + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) + (bit_set_count * sizeof(unsigned int)) + + // sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else if (data_mode == bitsetData) { + // size_t bitset_alloc_size = ((num + 63) / 64) * sizeof(uint64_t); + // b.resize(sizeof(DataCommMode) + sizeof(bit_set_count) + + // sizeof(size_t) // bitset size + // + sizeof(size_t) // bitset vector size + // + bitset_alloc_size + sizeof(size_t) + + // (bit_set_count * sizeof(typename SyncFnTy::ValTy))); + //} else { // onlyData + // b.resize(sizeof(DataCommMode) + sizeof(size_t) + + // (num * sizeof(typename SyncFnTy::ValTy))); + //} + GALOIS_LOG_FATAL("Make sure this is implemented correctly"); + } + reportRedundantSize(loopName, syncTypeStr, num, bit_set_count, + bit_set_comm); + } else { + data_mode = noData; + b.resize(0); + if (!async) { + gSerialize(b, noData); + } + } + + Textract.stop(); + + std::string metadata_str(syncTypeStr + "MetadataMode_" + + std::to_string(data_mode) + "_" + + get_run_identifier(loopName)); + galois::runtime::reportStatCond_Single(RNAME, metadata_str, + 1); + } + + /** * Vector bitset variant. 
* @@ -2521,6 +2790,103 @@ class GluonSubstrate : public galois::runtime::GlobalObject { return retval; } + // TODO (loc) way too much code duplication + template < + SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, + bool async> + size_t SyncRecvApply2D(uint32_t from_id, + galois::runtime::RecvBuffer& buf, + std::string loopName) { + //////////////////////////////////////////////////////////////////////////// + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string set_timer_str(syncTypeStr + "Set_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tset(set_timer_str.c_str(), RNAME); + std::string set_batch_timer_str(syncTypeStr + "SetBatch_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tsetbatch( + set_batch_timer_str.c_str(), RNAME); + //////////////////////////////////////////////////////////////////////////// + + galois::DynamicBitSet& bit_set_comm = syncBitset; + //static VecTy two_d_vector; + galois::PODResizeableArray& offsets = syncOffsets; + + auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; + + Tset.start(); + + if (num > 0) { // only enter if we expect message from that host + DataCommMode data_mode; + // 1st deserialize gets data mode + galois::runtime::gDeserialize(buf, data_mode); + + if (data_mode != noData) { + Tsetbatch.start(); + bool batch_succeeded = + setBatchWrapper(from_id, buf, data_mode); + Tsetbatch.stop(); + + // cpu always enters this block + if (!batch_succeeded) { + size_t bit_set_count = num; + size_t buf_start = 0; + + using DeserialPOD = galois::PODResizeableArray; + DeserialPOD deserial_pod; + + // deserialize the rest of the data in the buffer depending on the + // data mode; arguments passed in here are mostly output vars + deserializeMessage(loopName, data_mode, num, buf, + bit_set_count, offsets, bit_set_comm, + buf_start, retval, deserial_pod); + + bit_set_comm.reserve(maxSharedSize); + offsets.reserve(maxSharedSize); + + galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); + + if (data_mode == bitsetData) { + size_t bit_set_count2; + getOffsetsFromBitset(loopName, bit_set_comm, offsets, + bit_set_count2); + assert(bit_set_count == bit_set_count2); + } + + if (data_mode == onlyData) { + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute); + } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute, buf_start); + } else if (data_mode == gidsData) { + SetSubset2D(loopName, offsets, bit_set_count, offsets, deserial_pod, + bit_set_compute); + } else { // bitsetData or offsetsData + SetSubset2D(loopName, sharedNodes[from_id], + bit_set_count, offsets, deserial_pod, + bit_set_compute); + } + } else { + // TODO(loc/hochan) + GALOIS_LOG_FATAL("Implement GPU"); + } + } + } + + Tset.stop(); + + return retval; + } + // TODO (loc) way too much code duplication template < SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, @@ -2820,6 +3186,39 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } #endif + template ::value>::type* = nullptr> + void syncNetRecv(std::string loopName) { + auto& net = galois::runtime::getSystemNetworkInterface(); + std::string wait_timer_str("Wait_" + get_run_identifier(loopName)); + galois::CondStatTimer Twait(wait_timer_str.c_str(), + RNAME); + + if (async) 
{ + GALOIS_LOG_FATAL("2d vector + async = unimplemented"); + } else { + for (unsigned x = 0; x < numHosts; ++x) { + if (x == id) + continue; + if (nothingToRecv(x, syncType, writeLocation, readLocation)) + continue; + + Twait.start(); + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + Twait.stop(); + + SyncRecvApply2D( + p->first, p->second, loopName); + } + incrementEvilPhase(); + } + } + /** * Determines if there is anything to receive from a host and receives/applies * the messages. @@ -2834,7 +3233,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template + typename VecTy, bool async, + typename std::enable_if::value>::type* = nullptr> void syncNetRecv(std::string loopName) { auto& net = galois::runtime::getSystemNetworkInterface(); std::string wait_timer_str("Wait_" + get_run_identifier(loopName)); @@ -3061,6 +3461,28 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Higher Level Sync Calls (broadcast/reduce, etc) //////////////////////////////////////////////////////////////////////////////// + // 2D vector + template ::value>::type* = nullptr> + void reduce(std::string loopName, size_t elem_size) { + std::string timer_str("Reduce_" + get_run_identifier(loopName)); + galois::CondStatTimer TsyncReduce(timer_str.c_str(), + RNAME); + + using T = typename ReduceFnTy::ValTy::value_type; + using VecTy = galois::TwoDVector; + + TsyncReduce.start(); + + syncSend(loopName, elem_size); + syncRecv(loopName); + + TsyncReduce.stop(); + } + /** * Does a reduction of data from mirror nodes to master nodes. * @@ -3072,8 +3494,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void reduce(std::string loopName, size_t elem_size) { + typename ReduceFnTy, typename BitsetFnTy, bool async, + typename std::enable_if::value>::type* = nullptr> + void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); @@ -3112,6 +3535,28 @@ class GluonSubstrate : public galois::runtime::GlobalObject { TsyncReduce.stop(); } + // 2d + template ::value>::type* = nullptr> + void broadcast(std::string loopname, size_t elem_size) { + std::string timer_str("Broadcast_" + get_run_identifier(loopname)); + galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), + RNAME); + + typedef typename BroadcastFnTy::ValTy::value_type T; + using VecTy = galois::TwoDVector; + + TsyncBroadcast.start(); + + syncSend(loopname, elem_size); + syncRecv(loopname); + + TsyncBroadcast.stop(); + } + /** * Does a broadcast of data from master to mirror nodes. 
* @@ -3123,8 +3568,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param loopName used to name timers for statistics */ template - inline void broadcast(std::string loopName, size_t elem_size) { + typename BroadcastFnTy, typename BitsetFnTy, bool async, + typename std::enable_if::value>::type* = nullptr> + void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), RNAME); diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 04c696f6ab..0ba0ad2bd9 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -60,6 +60,10 @@ struct InitialDegreeSync { struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_sampled_out_degrees_->size();; + } + //! return a vector of floats to sync static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); @@ -80,6 +84,13 @@ struct SubgraphDegreeSync { return true; } + static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { + for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; + } + return true; + } + //! No-op: readAny = overwritten anyways; can probably get away with no-op static void reset(uint32_t lid, char&) { for (galois::LargeArray& layer_degrees : @@ -96,6 +107,13 @@ struct SubgraphDegreeSync { } } + static void setVal(uint32_t lid, char&, ValTy::value_type* y) { + for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; + } + } + + // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 89ccc83324..17063fe8ec 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -68,6 +68,10 @@ struct SampleFlagBitset { struct GNNSumAggregate { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_matrix_to_sync_column_length_; + } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. @@ -96,6 +100,16 @@ struct GNNSumAggregate { return true; } + static bool reduce(uint32_t node_id, char&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] += + y[i]; + } + return true; + } + //! 
No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} // Reset is here in case anyone wants to bring it back @@ -116,6 +130,15 @@ struct GNNSumAggregate { } } + static void setVal(uint32_t node_id, char&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + y[i]; + } + } + + // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; @@ -135,6 +158,10 @@ struct GNNSumAggregate { struct GNNSampleSumAggregate { using ValTy = galois::gstl::Vector; + static size_t FeatVecSize() { + return gnn_matrix_to_sync_column_length_; + } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. @@ -175,6 +202,21 @@ struct GNNSampleSumAggregate { return true; } + static bool reduce(uint32_t node_id, char&, ValTy::value_type* y) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return false; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] += y[i]; + } + return true; + } + //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} @@ -193,6 +235,19 @@ struct GNNSampleSumAggregate { i] = y[i]; } } + static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_ + + i] = y[i]; + } + } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { From 8aa64e4d69bebf8f5e64e57f5d0f9608d59bfc5c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 4 Jun 2021 20:46:33 -0500 Subject: [PATCH 558/660] WIP part 2: cut off 0s --- libgalois/include/galois/TwoDVector.h | 1 + libgluon/include/galois/graphs/GluonSubstrate.h | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index ab3a7ff882..240e7750a5 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -32,6 +32,7 @@ class TwoDVector { } const PODResizeableArray& data() { return underlying_memory_; } + void resize(size_t s) { underlying_memory_.resize(s); } private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 6ccf35edec..37e6cca573 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -893,7 +893,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count, std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, - galois::runtime::SendBuffer& b) { + galois::runtime::SendBuffer& b, size_t feat_size) { std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + get_run_identifier(loopName)); @@ -908,16 +908,19 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } else if (data_mode == gidsData) { offsets.resize(bit_set_count); convertLIDToGID(loopName, indices, offsets); + two_d_vec.resize(bit_set_count * feat_size); Tserialize.start(); gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); Tserialize.stop(); } else if (data_mode == offsetsData) { offsets.resize(bit_set_count); + two_d_vec.resize(bit_set_count * feat_size); Tserialize.start(); gSerialize(b, data_mode, bit_set_count, offsets, two_d_vec.data()); Tserialize.stop(); } else if (data_mode == bitsetData) { Tserialize.start(); + two_d_vec.resize(bit_set_count * feat_size); gSerialize(b, data_mode, bit_set_count, bit_set_comm, two_d_vec.data()); Tserialize.stop(); } else { // onlyData @@ -2358,7 +2361,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { SerializeMessage2D( loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, - two_d_array, b); + two_d_array, b, SyncFnTy::FeatVecSize()); } else { // TODO(loc/hochan) GPU //if (data_mode == noData) { From 2de8ab57de2548f1b384f0df58dc43510963e90b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 5 Jun 2021 15:20:18 -0500 Subject: [PATCH 559/660] WIP #3: write directly to 2d vector --- libgalois/include/galois/TwoDVector.h | 2 ++ .../include/galois/graphs/GluonSubstrate.h | 17 ++++++++-- .../galois/graphs/DegreeSyncStructures.h | 13 ++++++- .../graphs/GraphAggregationSyncStructures.h | 34 +++++++++++++++---- 4 files changed, 56 insertions(+), 10 deletions(-) diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index 240e7750a5..1af9fba505 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -31,8 +31,10 @@ class TwoDVector { sizeof(T) * fixed_vector_size_); } + PODResizeableArray& edit_data() { return underlying_memory_; } const PODResizeableArray& data() { return underlying_memory_; } void resize(size_t s) { underlying_memory_.resize(s); } + size_t size() const { return underlying_memory_.size(); } private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 37e6cca573..73282a0644 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1199,6 +1199,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + + template + void ExtractWrapper2D(size_t lid, typename FnTy::ValTy::value_type* location_to_write) { + if (syncType == syncReduce) { + FnTy::ExtractDirect(lid, location_to_write); + char dummy = 0; + FnTy::reset(lid, dummy); + } else { + FnTy::ExtractDirect(lid, location_to_write); + } + } + /** * Extracts data at provided lid; uses vecIndex to get the correct element * from the vector. 
@@ -1306,7 +1318,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; size_t lid = indices[offset]; - two_d_vector.SetVector(n - start, extractWrapper(lid)); + + ExtractWrapper2D(lid, (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1321,7 +1334,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; size_t lid = indices[offset]; - two_d_vector.SetVector(n - start, extractWrapper(lid)); + ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - start) * FnTy::FeatVecSize()])); } } } diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 0ba0ad2bd9..44102a3807 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -1,4 +1,5 @@ #include "galois/GNNTypes.h" +//#include "galois/Logging.h" namespace galois { namespace graphs { @@ -64,7 +65,6 @@ struct SubgraphDegreeSync { return gnn_sampled_out_degrees_->size();; } - //! return a vector of floats to sync static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); size_t count = 0; @@ -76,6 +76,17 @@ struct SubgraphDegreeSync { return vec_to_send; } + static void ExtractDirect(uint32_t lid, typename ValTy::value_type* to_write) { + size_t count = 0; + for (galois::LargeArray& layer_degrees : + *gnn_sampled_out_degrees_) { + std::memcpy(&to_write[count], + &layer_degrees[lid], + sizeof(typename ValTy::value_type)); + count++; + } + } + static bool reduce(uint32_t lid, char&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 17063fe8ec..1270df5ff5 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -72,21 +72,30 @@ struct GNNSumAggregate { return gnn_matrix_to_sync_column_length_; } + //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_); for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX memcpy - extracted_vec[i] = - gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]; + extracted_vec.emplace_back( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i]); } // move constructor should kick in here to avoid return copy return extracted_vec; } + //! return a vector of floats to sync + static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + std::memcpy(to_write, + (char*)&(gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + } + //! reduction is addition in this case; add received vector to //! 
own vector static bool reduce(uint32_t node_id, char&, ValTy y) { @@ -138,7 +147,6 @@ struct GNNSumAggregate { } } - // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; @@ -167,7 +175,9 @@ struct GNNSampleSumAggregate { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); - ValTy extracted_vec(gnn_matrix_to_sync_column_length_, 0.0); + //ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return extracted_vec; @@ -175,15 +185,25 @@ struct GNNSampleSumAggregate { for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX memcpy - extracted_vec[i] = + extracted_vec.emplace_back( gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + - i]; + i]); } // move constructor should kick in here to avoid return copy return extracted_vec; } + static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + if ((*gnn_lid_to_sid_pointer_)[node_id] == + std::numeric_limits::max()) { + return; + } + std::memcpy(to_write, + (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id]* gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + } + //! reduction is addition in this case; add received vector to //! own vector static bool reduce(uint32_t node_id, char&, ValTy y) { From 3d33e96d3ad7dce8d86ada68cb466c07b5055639 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 7 Jun 2021 16:53:21 -0500 Subject: [PATCH 560/660] Directly serialize vectors to comm buffer --- libdist/include/galois/runtime/Serialize.h | 10 +- .../include/galois/graphs/GluonSubstrate.h | 321 +++++++++++++----- 2 files changed, 245 insertions(+), 86 deletions(-) diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index 8110c954e9..bfd25c3cf3 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -135,6 +135,9 @@ class SerializeBuffer { //! (as determined by offset) const uint8_t* data() const { return bufdata.data() + kHeaderSize; } uint8_t* data() { return bufdata.data() + kHeaderSize; } + uint8_t* DataAtOffset(size_t offset) { + return bufdata.data() + kHeaderSize + offset; + } //! Returns the size of the serialize buffer size_type size() const { return bufdata.size() - kHeaderSize; } @@ -1052,10 +1055,9 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) - -> decltype(std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) -> decltype( + std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 73282a0644..c60308ac93 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -595,6 +595,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::PODResizeableArray& offsets, size_t& bit_set_count, DataCommMode& data_mode) const { + // i.e. 
not set by user if (substrateDataMode != onlyData) { bitset_comm.reset(); std::string syncTypeStr = @@ -619,10 +620,60 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); // get the number of set bits and the offsets into the comm bitset + // i.e., the things thaneed to be grabbed getOffsetsFromBitset(loopName, bitset_comm, offsets, bit_set_count); } + // from the count of things that need to be grabbed determine the data mode + // to use + data_mode = + get_data_mode(bit_set_count, indices.size()); + } + + template + void GetBitsetAndOffsets2D(const std::string& loopName, + const std::vector& indices, + const galois::DynamicBitSet& bitset_compute, + galois::DynamicBitSet& bitset_comm, + galois::PODResizeableArray& offsets, + size_t& bit_set_count, + DataCommMode& data_mode) const { + // i.e. not set by user + if (substrateDataMode != onlyData) { + bitset_comm.reset(); + std::string syncTypeStr = + (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string doall_str(syncTypeStr + "Bitset_" + loopName); + + bitset_comm.reset(); + // determine which local nodes in the indices array need to be + // sychronized + galois::do_all( + galois::iterate(size_t{0}, indices.size()), + [&](size_t n) { + // assumes each lid is unique as test is not thread safe + size_t lid = indices[n]; + if (bitset_compute.test(lid)) { + bitset_comm.set(n); + } + }, +#if GALOIS_COMM_STATS + galois::loopname(get_run_identifier(doall_str).c_str()), +#endif + galois::no_stats()); + + // get the number of set bits and the offsets into the comm bitset + // i.e., the things thaneed to be grabbed + getOffsetsFromBitset(loopName, bitset_comm, offsets, + bit_set_count); + } + + // from the count of things that need to be grabbed determine the data mode + // to use + // NOTE: this function is imprecise as it doesn't get actual size of + // vectors but only the size of the wrapper itself, but doesn't matter + // for selection purposes data_mode = get_data_mode(bit_set_count, indices.size()); } @@ -654,6 +705,39 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + template + size_t GetMaxSendBufferSizeVecs(uint32_t numShared) { + if (substrateDataMode == gidsData) { + return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(unsigned int)) + sizeof(size_t) + + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else if (substrateDataMode == offsetsData) { + return sizeof(DataCommMode) + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(unsigned int)) + sizeof(size_t) + + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else if (substrateDataMode == bitsetData) { + size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t); + return sizeof(DataCommMode) + sizeof(size_t) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } else { // onlyData or noData (auto) + size_t bitset_alloc_size = ((numShared + 63) / 64) * sizeof(uint64_t); + return sizeof(DataCommMode) + sizeof(size_t) + + sizeof(size_t) // bitset size + + sizeof(size_t) // bitset vector size + + bitset_alloc_size + sizeof(size_t) + sizeof(size_t) + + (numShared * sizeof(typename SyncFnTy::ValTy::value_type) * + SyncFnTy::FeatVecSize()); + } + } + 
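// Illustrative sketch, not part of the original patch: the sizes computed in
// GetMaxSendBufferSizeVecs above assume the 2D sync path flattens one
// fixed-width feature vector per shared node, so node n's value j lives at
// offset n * SyncFnTy::FeatVecSize() + j. The TwoDVector helper introduced
// earlier in this patch series implements exactly that layout; a minimal
// usage example (num_nodes and the width of 16 are placeholders) could be:
//
//   galois::TwoDVector<float> features;
//   features.SetVecSize(16);             // fixed per-node feature width
//   features.Create(num_nodes);          // backing store of num_nodes * 16 floats
//   galois::gstl::Vector<float> row(16, 1.0f);
//   features.SetVector(0, row);          // memcpy of row into slot [0, 16)
//   const auto& flat = features.data();  // contiguous array handed to gSerialize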
//////////////////////////////////////////////////////////////////////////////// // Local to global ID conversion //////////////////////////////////////////////////////////////////////////////// @@ -731,10 +815,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Message prep functions (buffering, send buffer getting, etc.) //////////////////////////////////////////////////////////////////////////////// - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::value>::type* = nullptr> + template ::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; @@ -762,10 +845,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { * @param b OUTPUT: Buffer that will hold data to send * @param elem_size The inner-vector dimesnion of a vector of the vector */ - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async, - typename std::enable_if::value>::type* = nullptr> + template ::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; @@ -774,8 +856,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { syncExtract( loopName, x, sharedNodes[x], b, elem_size); } else { - syncExtract( - loopName, x, sharedNodes[x], b, elem_size); + syncExtract(loopName, x, sharedNodes[x], + b, elem_size); } std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -888,12 +970,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Calls data on the TwoDVector template - void - SerializeMessage2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, - galois::PODResizeableArray& offsets, - galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, - galois::runtime::SendBuffer& b, size_t feat_size) { + void SerializeMessage2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, + TwoDVecType& two_d_vec, + galois::runtime::SendBuffer& b, size_t feat_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessage_" + get_run_identifier(loopName)); @@ -930,6 +1012,45 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } + // Only serializes the prefix + template + void + SerializeMessagePrefix2D(std::string loopName, DataCommMode data_mode, + size_t bit_set_count, std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, + galois::runtime::SendBuffer& b) { + std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "SerializeMessagePrefix_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tserialize( + serialize_timer_str.c_str(), RNAME); + if (data_mode == noData) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } else if (data_mode == gidsData) { + offsets.resize(bit_set_count); + convertLIDToGID(loopName, indices, offsets); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets); + Tserialize.stop(); + } else if (data_mode == offsetsData) { + offsets.resize(bit_set_count); + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, offsets); + Tserialize.stop(); + } else if (data_mode == bitsetData) { + Tserialize.start(); + gSerialize(b, data_mode, bit_set_count, bit_set_comm); + Tserialize.stop(); + } else if (data_mode == onlyData) { + Tserialize.start(); + gSerialize(b, data_mode); + Tserialize.stop(); + } + } + /** * Given the data mode, deserialize the rest of a message in a Receive Buffer. * @@ -1199,9 +1320,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } - template - void ExtractWrapper2D(size_t lid, typename FnTy::ValTy::value_type* location_to_write) { + void ExtractWrapper2D(size_t lid, + typename FnTy::ValTy::value_type* location_to_write) { if (syncType == syncReduce) { FnTy::ExtractDirect(lid, location_to_write); char dummy = 0; @@ -1300,10 +1421,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template - void ExtractSubset2D(const std::string& loopName, - const std::vector& indices, size_t size, - const galois::PODResizeableArray& offsets, - VecTy& two_d_vector, size_t start = 0) { + void ExtractSubsetLazy2D( + const std::string& loopName, const std::vector& indices, + size_t size, const galois::PODResizeableArray& offsets, + galois::runtime::SendBuffer& send_buffer, size_t base_offset) { + size_t start = 0; if (parallelize) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -1317,9 +1439,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = n; else offset = offsets[n]; - size_t lid = indices[offset]; + size_t lid = indices[offset]; - ExtractWrapper2D(lid, (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); + ExtractWrapper2D( + lid, + (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( + base_offset)[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]))); + // ExtractWrapper2D(lid, + // (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1333,13 +1461,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; - size_t lid = indices[offset]; - ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - start) * FnTy::FeatVecSize()])); + size_t lid = indices[offset]; + + ExtractWrapper2D( + lid, (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( + base_offset)[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]))); + // ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n + // - start) * FnTy::FeatVecSize()])); } } } - /** * Based on provided arguments, extracts the data that we are interested * in sending into val_vec. 
Same as above, except it has the vecIndex @@ -1558,7 +1691,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2D template - void SetWrapper2D(size_t lid, typename FnTy::ValTy::value_type* pointer_to_data, + void SetWrapper2D(size_t lid, + typename FnTy::ValTy::value_type* pointer_to_data, galois::DynamicBitSet& bit_set_compute) { if (syncType == syncReduce) { if (FnTy::reduce(lid, userGraph.getData(lid), pointer_to_data)) { @@ -1695,8 +1829,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; auto lid = indices[offset]; - SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], - bit_set_compute); + SetWrapper2D( + lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + bit_set_compute); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1710,13 +1845,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { else offset = offsets[n]; auto lid = indices[offset]; - SetWrapper2D(lid, &val_vec[(n - start) * FnTy::FeatVecSize()], - bit_set_compute); + SetWrapper2D( + lid, &val_vec[(n - start) * FnTy::FeatVecSize()], bit_set_compute); } } } - /** * VECTOR BITSET VARIANT. * @@ -2303,16 +2437,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async> + template void SyncExtract2D(std::string loopName, unsigned from_id, std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; - static VecTy two_d_array; - two_d_array.SetVecSize(SyncFnTy::FeatVecSize()); galois::PODResizeableArray& offsets = syncOffsets; ////////////////////////////////////////////////////////////////////////////// @@ -2337,7 +2468,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (num > 0) { size_t bit_set_count = 0; Textractalloc.start(); - b.reserve(getMaxSendBufferSize(num)); + b.reserve(GetMaxSendBufferSizeVecs(num)); Textractalloc.stop(); Textractbatch.start(); @@ -2354,30 +2485,53 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offsets.reserve(maxSharedSize); bit_set_comm.resize(num); offsets.resize(num); - two_d_array.Create(num); Textractalloc.stop(); const galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); - getBitsetAndOffsets( + GetBitsetAndOffsets2D( loopName, indices, bit_set_compute, bit_set_comm, offsets, bit_set_count, data_mode); - if (data_mode == onlyData) { - bit_set_count = indices.size(); - ExtractSubset2D( - loopName, indices, bit_set_count, offsets, two_d_array); - } else if (data_mode != - noData) { // bitsetData or offsetsData or gidsData - ExtractSubset2D( - loopName, indices, bit_set_count, offsets, two_d_array); - } + // serialize the prefix for the buffer based on data type: the data + // itself gets serialized directly into the buffer later + SerializeMessagePrefix2D(loopName, data_mode, bit_set_count, + indices, offsets, bit_set_comm, b); - SerializeMessage2D( - loopName, data_mode, bit_set_count, indices, offsets, bit_set_comm, - two_d_array, b, SyncFnTy::FeatVecSize()); + if (data_mode != noData) { + size_t lazy_buffer_size = 0; + if (data_mode == gidsData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } else if (data_mode == offsetsData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } else if (data_mode == bitsetData) { + lazy_buffer_size = bit_set_count * SyncFnTy::FeatVecSize(); + } 
else if (data_mode == onlyData) { + lazy_buffer_size = num * SyncFnTy::FeatVecSize(); + } + + size_t base_offset = 0; + if (lazy_buffer_size > 0) { + auto lazy_buffer = gSerializeLazySeq( + b, lazy_buffer_size, + (galois::PODResizeableArray< + typename SyncFnTy::ValTy::value_type>*)nullptr); + base_offset = lazy_buffer.off; + } + + // serialize the actual data directly into the buffer with lazy + // serialization + if (data_mode == onlyData) { + bit_set_count = indices.size(); + ExtractSubsetLazy2D( + loopName, indices, bit_set_count, offsets, b, base_offset); + } else { // bitsetData or offsetsData or gidsData + ExtractSubsetLazy2D( + loopName, indices, bit_set_count, offsets, b, base_offset); + } + } } else { // TODO(loc/hochan) GPU - //if (data_mode == noData) { + // if (data_mode == noData) { // b.resize(0); // if (!async) { // gSerialize(b, data_mode); @@ -2424,7 +2578,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { 1); } - /** * Vector bitset variant. * @@ -2807,11 +2960,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } // TODO (loc) way too much code duplication - template < - SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, - bool async> - size_t SyncRecvApply2D(uint32_t from_id, - galois::runtime::RecvBuffer& buf, + template + size_t SyncRecvApply2D(uint32_t from_id, galois::runtime::RecvBuffer& buf, std::string loopName) { //////////////////////////////////////////////////////////////////////////// std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; @@ -2825,7 +2976,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////// galois::DynamicBitSet& bit_set_comm = syncBitset; - //static VecTy two_d_vector; + // static VecTy two_d_vector; galois::PODResizeableArray& offsets = syncOffsets; auto& sharedNodes = (syncType == syncReduce) ? 
masterNodes : mirrorNodes; @@ -2850,7 +3001,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count = num; size_t buf_start = 0; - using DeserialPOD = galois::PODResizeableArray; + using DeserialPOD = + galois::PODResizeableArray; DeserialPOD deserial_pod; // deserialize the rest of the data in the buffer depending on the @@ -2872,24 +3024,25 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } if (data_mode == onlyData) { - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute); } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute, buf_start); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute, buf_start); } else if (data_mode == gidsData) { - SetSubset2D(loopName, offsets, bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D(loopName, offsets, bit_set_count, + offsets, deserial_pod, + bit_set_compute); } else { // bitsetData or offsetsData - SetSubset2D(loopName, sharedNodes[from_id], - bit_set_count, offsets, deserial_pod, - bit_set_compute); + SetSubset2D( + loopName, sharedNodes[from_id], bit_set_count, offsets, + deserial_pod, bit_set_compute); } } else { // TODO(loc/hochan) @@ -3480,13 +3633,14 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2D vector template ::value>::type* = nullptr> + typename std::enable_if< + IsVector::value>::type* = nullptr> void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), RNAME); - using T = typename ReduceFnTy::ValTy::value_type; + using T = typename ReduceFnTy::ValTy::value_type; using VecTy = galois::TwoDVector; TsyncReduce.start(); @@ -3511,7 +3665,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template ::value>::type* = nullptr> + typename std::enable_if< + !IsVector::value>::type* = nullptr> void reduce(std::string loopName, size_t elem_size) { std::string timer_str("Reduce_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncReduce(timer_str.c_str(), @@ -3554,7 +3709,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // 2d template ::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void broadcast(std::string loopname, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopname)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), @@ -3585,7 +3741,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template ::value>::type* = nullptr> + typename std::enable_if::value>::type* = nullptr> void broadcast(std::string loopName, size_t elem_size) { std::string timer_str("Broadcast_" + get_run_identifier(loopName)); galois::CondStatTimer TsyncBroadcast(timer_str.c_str(), From 5b442736958b2deb08f8bc6563f4b15876b9db12 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 7 Jun 2021 18:19:09 -0500 Subject: [PATCH 561/660] Direct deserialization of GNN sync --- .../include/galois/graphs/GluonSubstrate.h | 105 ++++++++++++------ 1 file changed, 69 insertions(+), 36 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index c60308ac93..ae50e0e10f 
100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1114,6 +1114,44 @@ class GluonSubstrate : public galois::runtime::GlobalObject { Tdeserialize.stop(); } + template + void DeserializeMessagePrefix( + std::string loopName, DataCommMode data_mode, uint32_t num, + galois::runtime::RecvBuffer& buf, size_t& bit_set_count, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, size_t& buf_start, size_t& retval, + size_t& vec_size) { + std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; + std::string serialize_timer_str(syncTypeStr + "DeserializeMessage_" + + get_run_identifier(loopName)); + galois::CondStatTimer Tdeserialize( + serialize_timer_str.c_str(), RNAME); + Tdeserialize.start(); + + // get other metadata associated with message if mode isn't OnlyData + if (data_mode != onlyData) { + galois::runtime::gDeserialize(buf, bit_set_count); + + if (data_mode == gidsData) { + galois::runtime::gDeserialize(buf, offsets); + convertGIDToLID(loopName, offsets); + } else if (data_mode == offsetsData) { + galois::runtime::gDeserialize(buf, offsets); + } else if (data_mode == bitsetData) { + bit_set_comm.resize(num); + galois::runtime::gDeserialize(buf, bit_set_comm); + } else if (data_mode == dataSplit) { + galois::runtime::gDeserialize(buf, buf_start); + } else if (data_mode == dataSplitFirst) { + galois::runtime::gDeserialize(buf, retval); + } + } + // Grab data size but not data + galois::runtime::gDeserialize(buf, vec_size); + + Tdeserialize.stop(); + } + //////////////////////////////////////////////////////////////////////////////// // Other helper functions //////////////////////////////////////////////////////////////////////////////// @@ -1446,8 +1484,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( base_offset)[(n - start) * FnTy::FeatVecSize() * sizeof(typename FnTy::ValTy::value_type)]))); - // ExtractWrapper2D(lid, - // (&(two_d_vector.edit_data()[(n - start) * FnTy::FeatVecSize()]))); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier(doall_str).c_str()), @@ -1467,8 +1503,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { lid, (typename FnTy::ValTy::value_type*)(&(send_buffer.DataAtOffset( base_offset)[(n - start) * FnTy::FeatVecSize() * sizeof(typename FnTy::ValTy::value_type)]))); - // ExtractWrapper2D(lid, &((two_d_vector.edit_data())[(n - // - start) * FnTy::FeatVecSize()])); } } } @@ -1807,18 +1841,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } // 2D; vecty is a PODResize - template + template void SetSubset2D(const std::string& loopName, const IndicesVecTy& indices, size_t size, const galois::PODResizeableArray& offsets, - VecTy& val_vec, galois::DynamicBitSet& bit_set_compute, - size_t start = 0) { + galois::runtime::RecvBuffer& buf, + galois::DynamicBitSet& bit_set_compute, size_t start = 0) { std::string syncTypeStr = (syncType == syncReduce) ? 
"Reduce" : "Broadcast"; std::string doall_str(syncTypeStr + "SetVal_" + get_run_identifier(loopName)); - if (parallelize) { galois::do_all( galois::iterate(start, start + size), @@ -1830,7 +1862,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; auto lid = indices[offset]; SetWrapper2D( - lid, &val_vec[(n - start) * FnTy::FeatVecSize()], + lid, + (typename FnTy::ValTy::value_type*)&( + buf.data()[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)]), bit_set_compute); }, #if GALOIS_COMM_STATS @@ -1846,7 +1881,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { offset = offsets[n]; auto lid = indices[offset]; SetWrapper2D( - lid, &val_vec[(n - start) * FnTy::FeatVecSize()], bit_set_compute); + lid, + (typename FnTy::ValTy::value_type*)(&( + buf.data()[(n - start) * FnTy::FeatVecSize() * + sizeof(typename FnTy::ValTy::value_type)])), + bit_set_compute); } } } @@ -2976,7 +3015,6 @@ class GluonSubstrate : public galois::runtime::GlobalObject { //////////////////////////////////////////////////////////////////////////// galois::DynamicBitSet& bit_set_comm = syncBitset; - // static VecTy two_d_vector; galois::PODResizeableArray& offsets = syncOffsets; auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; @@ -3001,21 +3039,15 @@ class GluonSubstrate : public galois::runtime::GlobalObject { size_t bit_set_count = num; size_t buf_start = 0; - using DeserialPOD = - galois::PODResizeableArray; - DeserialPOD deserial_pod; - - // deserialize the rest of the data in the buffer depending on the - // data mode; arguments passed in here are mostly output vars - deserializeMessage(loopName, data_mode, num, buf, - bit_set_count, offsets, bit_set_comm, - buf_start, retval, deserial_pod); + size_t vec_size = 0; + DeserializeMessagePrefix( + loopName, data_mode, num, buf, bit_set_count, offsets, + bit_set_comm, buf_start, retval, vec_size); bit_set_comm.reserve(maxSharedSize); offsets.reserve(maxSharedSize); galois::DynamicBitSet& bit_set_compute = BitsetFnTy::get(); - if (data_mode == bitsetData) { size_t bit_set_count2; getOffsetsFromBitset(loopName, bit_set_comm, offsets, @@ -3023,26 +3055,27 @@ class GluonSubstrate : public galois::runtime::GlobalObject { assert(bit_set_count == bit_set_count2); } + // note for all these the deserialize buffer is extracted from + // directly rather than copying it over to another vector if (data_mode == onlyData) { SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute); + async, true, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute); } else if (data_mode == dataSplit || data_mode == dataSplitFirst) { SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute, buf_start); + async, true, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute, buf_start); } else if (data_mode == gidsData) { - SetSubset2D(loopName, offsets, bit_set_count, - offsets, deserial_pod, - bit_set_compute); + SetSubset2D(loopName, offsets, bit_set_count, offsets, buf, + bit_set_compute); } else { // bitsetData or offsetsData SetSubset2D( - loopName, sharedNodes[from_id], bit_set_count, offsets, - deserial_pod, bit_set_compute); + async, false, true>(loopName, sharedNodes[from_id], + bit_set_count, offsets, buf, + bit_set_compute); } } else { // TODO(loc/hochan) From 3a80d2483d8151f0c84b658d6e43a361271d7131 Mon Sep 17 00:00:00 2001 From: Loc 
Hoang Date: Wed, 9 Jun 2021 13:59:46 -0500 Subject: [PATCH 562/660] MKL microbenchmark additions and changes - Version that uses galois wrapper around single thread MKL calls - removal of unused things from original test --- libgnn/CMakeLists.txt | 11 ++ libgnn/test/CMakeLists.txt | 33 +++++- libgnn/test/mkl_micro.cpp | 26 +++-- libgnn/test/single_mkl_micro.cpp | 168 +++++++++++++++++++++++++++++++ 4 files changed, 222 insertions(+), 16 deletions(-) create mode 100644 libgnn/test/single_mkl_micro.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index c5d9ee6e7a..5bf32581d7 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -16,6 +16,7 @@ set(sources set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") +set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) @@ -28,6 +29,16 @@ target_include_directories(galois_gnn PUBLIC ${MKL_INCLUDE_DIRS} ) +add_library(galois_gnn_single STATIC ${sources}) +target_link_directories(galois_gnn_single PUBLIC ${MKL_LIBRARIES}) +target_link_libraries(galois_gnn_single galois_shmem) +target_link_libraries(galois_gnn_single ${SINGLE_INTEL_LIBS}) +target_link_libraries(galois_gnn_single galois_dist_async galois_cusp galois_gluon galois_support) +target_include_directories(galois_gnn_single PUBLIC + ${CMAKE_CURRENT_SOURCE_DIR}/include + ${MKL_INCLUDE_DIRS} +) + set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) add_subdirectory(test) diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index a1ea769105..98b1d01e3e 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,3 +1,5 @@ +find_package(OpenMP) + add_executable(mkl_micro mkl_micro.cpp) target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) target_include_directories(mkl_micro PUBLIC @@ -10,8 +12,8 @@ target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) target_include_directories(mkl_micro_omp PUBLIC ${MKL_INCLUDE_DIRS} ) -target_link_libraries(mkl_micro_omp ${INTEL_LIBS}) -target_link_libraries(mkl_micro_omp -fopenmp) +target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) add_executable(mkl_micro_sgalois mkl_micro.cpp) target_link_libraries(mkl_micro_sgalois galois_gnn) @@ -25,6 +27,33 @@ add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) +################################################################################ + +#add_executable(single_mkl_micro single_mkl_micro.cpp) +#target_link_directories(single_mkl_micro PUBLIC ${MKL_LIBRARIES}) +#target_include_directories(single_mkl_micro PUBLIC +# ${MKL_INCLUDE_DIRS} +#) +#target_link_libraries(single_mkl_micro ${SINGLE_INTEL_LIBS}) + +add_executable(single_mkl_micro_omp single_mkl_micro.cpp) +target_link_directories(single_mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_include_directories(single_mkl_micro_omp PUBLIC + ${MKL_INCLUDE_DIRS} +) +target_link_libraries(single_mkl_micro_omp ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) + +add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_sgalois galois_gnn_single) +target_compile_definitions(single_mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + 
+add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_dgalois galois_gnn_single) +target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +################################################################################ + add_executable(gstl_test gstl_test.cpp) target_link_libraries(gstl_test galois_shmem) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index ea9511df74..10867a8c63 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -1,6 +1,7 @@ #include #include #include +#include #include #ifdef USE_SHARED_GALOIS @@ -84,29 +85,18 @@ int main(int argc, char* argv[]) { size_t b_dim = 128; size_t c_dim = 16; -#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - printf("Using Galois large arrays\n"); - // inputs - galois::LargeArray matrix_1; - matrix_1.create(a_dim * b_dim); - galois::LargeArray matrix_2; - matrix_2.create(a_dim * c_dim); - // output - galois::LargeArray matrix_3; - matrix_3.create(b_dim * c_dim); -#else // inputs std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output + //std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); -#endif size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); // change reps here; maybe make it command line arg - for (size_t reps = 0; reps < 3; reps++) { + for (size_t reps = 0; reps < 5; reps++) { // reinit srand(0); for (size_t i = 0; i < matrix_1.size(); i++) { @@ -124,7 +114,7 @@ int main(int argc, char* argv[]) { // dummy OMP TBB loop #ifdef USE_OMP -#pragma omp parallel +#pragma omp parallel for for (size_t i = 0; i < very_big_matrix.size(); i++) { very_big_matrix[i] = i; } @@ -132,10 +122,18 @@ int main(int argc, char* argv[]) { printf("Rep %lu\n", reps); + auto start = std::chrono::high_resolution_clock::now(); // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version + //CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); + auto stop = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); + printf("Run duration is %lf ms\n", duration.count() / 1000.0); } return 0; diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp new file mode 100644 index 0000000000..ecbf9da6fd --- /dev/null +++ b/libgnn/test/single_mkl_micro.cpp @@ -0,0 +1,168 @@ +#include +#include +#include +#include +#include + +#ifdef USE_SHARED_GALOIS +#include "galois/Galois.h" +#include "galois/LargeArray.h" +#endif +#ifdef USE_DIST_GALOIS +#include "galois/DistGalois.h" +#include "galois/LargeArray.h" +#endif + +#ifdef USE_OMP +#include "omp.h" +#endif + +// MKL wrapper +#ifdef USE_OMP +void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? 
output_columns : input_columns; + + #pragma omp parallel for + for (int i = 0; i < omp_get_num_threads(); i++) { + unsigned chunk_size = input_rows / omp_get_num_threads(); + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (omp_get_num_threads() - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } +} +#endif + +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) +void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, + size_t input_rows, size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { + // set lead dimension based on cblas spec w.r.t. transpose setting + size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; + size_t lead_dim_b = + (trans_b == CblasNoTrans) ? output_columns : input_columns; + + galois::on_each( + [&] (size_t i, size_t num_threads) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } + ); +} +#endif + + +void CacheFlush(std::vector* matrix) { + for (size_t i = 0; i < matrix->size(); i++) { + (*matrix)[i] = i; + } +} + +int main(int argc, char* argv[]) { +#ifdef USE_SHARED_GALOIS + galois::SharedMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Shared Mem with %u threads\n", + galois::getActiveThreads()); +#endif + +#ifdef USE_DIST_GALOIS + galois::DistMemSys G; + if (argc != 2) { + printf("Thread arg not specified\n"); + exit(1); + } + galois::setActiveThreads(std::stoi(argv[1])); + printf("Initialized Galois Dist Mem with %u threads\n", + galois::getActiveThreads()); +#endif + + printf("%d %s\n", argc, argv[0]); + + // dimensions from test case + size_t a_dim = 12000000; + size_t b_dim = 128; + size_t c_dim = 16; + + // inputs + std::vector matrix_1(a_dim * b_dim); + std::vector matrix_2(a_dim * c_dim); + // output + std::vector matrix_3(a_dim * c_dim); + + size_t kBigSize = 1000000000; + std::vector very_big_matrix(kBigSize); + + // change reps here; maybe make it command line arg + for (size_t reps = 0; reps < 5; reps++) { + // reinit + srand(0); + for (size_t i = 0; i < matrix_1.size(); i++) { + matrix_1[i] = rand() / static_cast(RAND_MAX / 10); + } + srand(1); + for (size_t i = 0; i < matrix_2.size(); i++) { + matrix_2[i] = rand() / static_cast(RAND_MAX / 10); + } + + very_big_matrix.clear(); + very_big_matrix.resize(kBigSize); + // cache flush + CacheFlush(&very_big_matrix); + + printf("Rep %lu\n", reps); + + auto start = std::chrono::high_resolution_clock::now(); + // transpose because it's the same as the problematic call in GNN + // TODO(loc) non transpose version +#ifdef USE_OMP + 
CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); +#endif +#if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) + CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + matrix_2.data(), matrix_3.data()); +#endif + //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + auto stop = std::chrono::high_resolution_clock::now(); + + auto duration = std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); + printf("Run duration is %lf ms\n", duration.count() / 1000.0); + } + + return 0; +} From 7ef6e893b0ea6ba6872ff09e8aa2c9198485e95b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 9 Jun 2021 19:12:07 -0500 Subject: [PATCH 563/660] mkl micro: transpose for single thread mkl --- libgnn/test/mkl_micro.cpp | 3 ++ libgnn/test/single_mkl_micro.cpp | 74 ++++++++++++++++++++++++-------- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 10867a8c63..73b3a08893 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -82,6 +82,7 @@ int main(int argc, char* argv[]) { // dimensions from test case size_t a_dim = 12000000; + //size_t a_dim = 120000; size_t b_dim = 128; size_t c_dim = 16; @@ -129,6 +130,8 @@ int main(int argc, char* argv[]) { // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); + //CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); auto duration = std::chrono::time_point_cast(stop) - diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp index ecbf9da6fd..7111b1b057 100644 --- a/libgnn/test/single_mkl_micro.cpp +++ b/libgnn/test/single_mkl_micro.cpp @@ -6,11 +6,11 @@ #ifdef USE_SHARED_GALOIS #include "galois/Galois.h" -#include "galois/LargeArray.h" +#include "galois/PODResizeableArray.h" #endif #ifdef USE_DIST_GALOIS #include "galois/DistGalois.h" -#include "galois/LargeArray.h" +#include "galois/PODResizeableArray.h" #endif #ifdef USE_OMP @@ -57,25 +57,60 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; + static std::vector> temps; + if (trans_a == CblasTrans) { + temps.resize(galois::getActiveThreads()); + } + galois::on_each( [&] (size_t i, size_t num_threads) { - unsigned chunk_size = input_rows / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_rows; + if (trans_a != CblasTrans) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; + } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 
1.0 : 0.0, my_output, output_columns); + } else { + galois::PODResizeableArray& my_pod = temps[i]; + my_pod.resize(input_rows * output_columns); + + unsigned chunk_size = input_columns / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_columns; + } + unsigned b_rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_rows); + const float* my_b = b + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, + false ? 1.0 : 0.0, my_pod.data(), output_columns); + } } - unsigned rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); + ); - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, - input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, - false ? 1.0 : 0.0, my_output, output_columns); + if (trans_a == CblasTrans) { + printf("Manual summation\n"); + for (galois::PODResizeableArray& temp_out : temps) { + for (unsigned i = 0; i < temp_out.size(); i++) { + output[i] += temp_out[i]; + } } - ); + } } #endif @@ -120,7 +155,8 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - std::vector matrix_3(a_dim * c_dim); + //std::vector matrix_3(a_dim * c_dim); + std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; std::vector very_big_matrix(kBigSize); @@ -152,7 +188,9 @@ int main(int argc, char* argv[]) { matrix_2.data(), matrix_3.data()); #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + //CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), From fbdf83383ca079155234e80f329018bb7adff8a5 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Jun 2021 19:50:39 -0500 Subject: [PATCH 564/660] SampledAny opt, fix end-of-execution free overhead 1) SampledAny was very expensive apparently (looping through boolean vec for every edge); sensible because probably locality issues. Avoids this by using a bitset to mark if an edge has ever been sampled. Improves perf significantly from what I can tell, but needs more testing. 2) End of execution free LargeArray of std::vectors was insanely expensive. This fixes it by using gstl::Vector instead. --- libgnn/include/galois/graphs/GNNGraph.h | 19 +++++++------------ libgnn/src/graphs/GNNGraph.cpp | 2 ++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 775fd2af3a..723249fe2f 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -175,11 +175,7 @@ class GNNGraph { } bool IsEdgeSampledAny(EdgeIterator ei) const { - for (bool b : edge_sample_status_[*ei]) { - if (b) - return true; - } - return false; + return sampled_edges_.test(*ei); } bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { @@ -205,6 +201,7 @@ class GNNGraph { //! 
Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { + sampled_edges_.set(*ei); edge_sample_status_[*ei][layer_num] = 1; }; //! Set the flag on the edge to 0; makes it not sampled @@ -224,6 +221,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // in edges ////////////////////////////////////////////////////////////////////////////// + EdgeIterator in_edge_begin(GraphNode n) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edge_begin(n); @@ -271,12 +269,7 @@ class GNNGraph { } bool IsInEdgeSampledAny(EdgeIterator ei) const { - for (bool b : - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)]) { - if (b) - return true; - } - return false; + return sampled_edges_.test(partitioned_graph_->InEdgeToOutEdge(ei)); }; bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { @@ -724,7 +717,7 @@ class GNNGraph { std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer - galois::LargeArray> edge_sample_status_; + galois::LargeArray> edge_sample_status_; // TODO use a char maybe? unlikely anyone will go over 2^8 layers... //! What timestep a node was added to sampled set; used to determine //! size of subgraph at each layer @@ -732,6 +725,8 @@ class GNNGraph { //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; + //! If edge is sampled at any point, mark this + galois::DynamicBitSet sampled_edges_; ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b7cb9596e0..c1afe6e6c4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -884,6 +884,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + sampled_edges_.resize(partitioned_graph_->sizeEdges()); // this is to hold the degree of a sampled graph considering all hosts; yes, // memory wise this is slightly problematic possibly, but each layer is its // own subgraph @@ -929,6 +930,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(edge_sample_status_[edge_id].begin(), edge_sample_status_[edge_id].end(), 0); }); + sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { galois::do_all( From 9146c0c47630ebe92f997f91d5e021d6a1a51050 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 15 Jun 2021 21:40:50 -0500 Subject: [PATCH 565/660] DynamicBitsets for each layer's edge samples Rather than a vector for each edge, have a bitset for each layer for edge sampling marking; more locality when checking edges of one particular layer rather than jumping around many vectors. 
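[Note, not part of the patch] To make the locality argument in this commit message concrete, here is a minimal standalone sketch, not the library code: std::vector<uint8_t> stands in for galois::DynamicBitSet, the sizes are illustrative, and the struct and method names are made up. It contrasts the edge-major layout being replaced with the layer-major layout this patch adopts.

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Edge-major (old): one small vector per edge; checking one layer across
    // many edges touches a separate heap allocation per edge.
    struct EdgeMajorSampleStatus {
      std::vector<std::vector<uint8_t>> per_edge; // [edge][layer]
      void init(size_t num_edges, size_t num_layers) {
        per_edge.assign(num_edges, std::vector<uint8_t>(num_layers, 0));
      }
      bool test(size_t edge, size_t layer) const { return per_edge[edge][layer]; }
      void set(size_t edge, size_t layer) { per_edge[edge][layer] = 1; }
    };

    // Layer-major (new): one flat bitset per layer; scanning all edges of a
    // single layer is a contiguous sweep, which is the common access pattern
    // when one aggregation layer checks its own edge flags.
    struct LayerMajorSampleStatus {
      std::vector<std::vector<uint8_t>> per_layer; // [layer][edge]
      void init(size_t num_edges, size_t num_layers) {
        per_layer.assign(num_layers, std::vector<uint8_t>(num_edges, 0));
      }
      bool test(size_t edge, size_t layer) const { return per_layer[layer][edge]; }
      void set(size_t edge, size_t layer) { per_layer[layer][edge] = 1; }
    };

    int main() {
      LayerMajorSampleStatus status;
      status.init(/*num_edges=*/1000, /*num_layers=*/3);
      status.set(42, 1);
      return status.test(42, 1) ? 0 : 1;
    }

With the layer-major form, clearing or checking one layer's flags for every edge is a single contiguous pass instead of one pointer chase per edge, which is what the diff below switches edge_sample_status_ to.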
--- libgnn/include/galois/graphs/GNNGraph.h | 22 ++++++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 14 ++++++++------ 2 files changed, 20 insertions(+), 16 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 723249fe2f..e50d0197d4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -180,7 +180,7 @@ class GNNGraph { bool IsEdgeSampled(uint32_t ei, size_t layer_num) const { if (!use_subgraph_) { // view uses original graph edge iterators - return edge_sample_status_[ei][layer_num]; + return edge_sample_status_[layer_num].test(ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); return false; @@ -189,24 +189,24 @@ class GNNGraph { bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { // view uses original graph edge iterators - return edge_sample_status_[*ei][layer_num]; + return edge_sample_status_[layer_num].test(*ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); } }; //! Always use original graph's edge iterator here bool IsEdgeSampledOriginalGraph(EdgeIterator ei, size_t layer_num) const { - return edge_sample_status_[*ei][layer_num]; + return edge_sample_status_[layer_num].test(*ei); }; //! Set the flag on the edge to 1; makes it sampled void MakeEdgeSampled(EdgeIterator ei, size_t layer_num) { sampled_edges_.set(*ei); - edge_sample_status_[*ei][layer_num] = 1; + edge_sample_status_[layer_num].set(*ei); }; //! Set the flag on the edge to 0; makes it not sampled void MakeEdgeUnsampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[*ei][layer_num] = 0; + edge_sample_status_[layer_num].reset(*ei, *ei); }; // GNNEdgeSortIterator EdgeSortBegin(GraphNode n) { @@ -274,8 +274,8 @@ class GNNGraph { bool IsInEdgeSampled(EdgeIterator ei, size_t layer_num) const { if (!use_subgraph_) { // view can use this fine + requires it - return edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)] - [layer_num]; + return edge_sample_status_[layer_num].test( + partitioned_graph_->InEdgeToOutEdge(ei)); } else { return subgraph_->InEdgeSampled(ei, layer_num, *this); } @@ -283,11 +283,13 @@ class GNNGraph { //! Set the flag on the edge to 1; makes it sampled void MakeInEdgeSampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 1; + edge_sample_status_[layer_num].set(partitioned_graph_->InEdgeToOutEdge(ei)); }; //! Set the flag on the edge to 0; makes it not sampled void MakeInEdgeUnsampled(EdgeIterator ei, size_t layer_num) { - edge_sample_status_[partitioned_graph_->InEdgeToOutEdge(ei)][layer_num] = 0; + edge_sample_status_[layer_num].reset( + partitioned_graph_->InEdgeToOutEdge(ei), + partitioned_graph_->InEdgeToOutEdge(ei)); }; ////////////////////////////////////////////////////////////////////////////// @@ -717,7 +719,7 @@ class GNNGraph { std::vector> sampled_out_degrees_; //! Sample data on edges: each edge gets a small bitset to mark //! if it's been sampled for a particular layer - galois::LargeArray> edge_sample_status_; + std::vector edge_sample_status_; // TODO use a char maybe? unlikely anyone will go over 2^8 layers... //! What timestep a node was added to sampled set; used to determine //! 
size of subgraph at each layer diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c1afe6e6c4..b10ea8d26e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -883,7 +883,10 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, subgraph_ = std::make_unique(partitioned_graph_->size()); sample_node_timestamps_.create(partitioned_graph_->size(), std::numeric_limits::max()); - edge_sample_status_.create(partitioned_graph_->sizeEdges(), num_layers, 0); + edge_sample_status_.resize(num_layers); + for (size_t i = 0; i < num_layers; i++) { + edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); + } sampled_edges_.resize(partitioned_graph_->sizeEdges()); // this is to hold the degree of a sampled graph considering all hosts; yes, // memory wise this is slightly problematic possibly, but each layer is its @@ -925,11 +928,10 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); // clear all sampled edges - galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->sizeEdges()), - [&](size_t edge_id) { - std::fill(edge_sample_status_[edge_id].begin(), - edge_sample_status_[edge_id].end(), 0); - }); + galois::do_all( + galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), + [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }); + sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { From 23d449b2ed1f10bcbe6a122572fcec05dccf7aa9 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 17 Jun 2021 14:26:11 -0500 Subject: [PATCH 566/660] GNN: Optimize SID map, fix subgraph master masking 1) SID mapping made many passes over the graph; this is inefficient. Optimize by counting how many nodes will make first appearance in each layer and make only a single pass over the graph to get SID mappings. Further improvement could be obtained by making this run in parallel (doable with another prefix sum on top of existing one for work assigned to each thread possibly), but will go there as necessary. 2) Master masking in SAGE/GCN layers for correctness was buggy in subgraph case as it is possible that masters would appear beyond the layer 0 prefix used by subgraphs. Avoid this by using a bitset which indicates masters that are not in this prefix so that they are not masked out accidentally. This fix seemingly improves time to accuracy as well as time in general because less 0s are being written. 
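[Note, not part of the patch] Item 1) above is essentially a counting sort keyed by the layer in which a node first appears: count per-layer first appearances, prefix-sum the counts into per-layer starting offsets, then hand out subgraph IDs (SIDs) from each layer's bucket in one sweep. The sketch below shows only that core idea with plain std containers and made-up data; it omits the master/mirror split and the non-layer-zero-master bitset that the real CreateSubgraphMapping in this patch also handles.

    #include <cstdint>
    #include <limits>
    #include <vector>

    // Assign contiguous subgraph IDs grouped by the layer in which a node
    // first appears, using a single pass over the node timestamps.
    std::vector<uint32_t>
    AssignSubgraphIds(const std::vector<uint32_t>& timestamps, size_t num_layers) {
      const uint32_t kUnsampled = std::numeric_limits<uint32_t>::max();
      // 1) count nodes whose first appearance is in each layer
      std::vector<uint32_t> layer_counts(num_layers, 0);
      for (uint32_t t : timestamps)
        if (t != kUnsampled) ++layer_counts[t];
      // 2) exclusive prefix sum -> next free SID for each layer's bucket
      std::vector<uint32_t> next_sid(num_layers, 0);
      uint32_t running = 0;
      for (size_t l = 0; l < num_layers; ++l) {
        next_sid[l] = running;
        running += layer_counts[l];
      }
      // 3) single pass assigning SIDs from the appropriate bucket
      std::vector<uint32_t> lid_to_sid(timestamps.size(), kUnsampled);
      for (size_t lid = 0; lid < timestamps.size(); ++lid)
        if (timestamps[lid] != kUnsampled)
          lid_to_sid[lid] = next_sid[timestamps[lid]]++;
      return lid_to_sid;
    }

    int main() {
      // timestamp = layer of first appearance; max() means never sampled
      std::vector<uint32_t> ts = {0, 2, 1, 0,
                                  std::numeric_limits<uint32_t>::max(), 1};
      auto sids = AssignSubgraphIds(ts, /*num_layers=*/3);
      // layer-0 nodes get SIDs 0..1, layer-1 nodes 2..3, the layer-2 node gets 4
      return (sids[0] == 0 && sids[3] == 1 && sids[2] == 2) ? 0 : 1;
    }

The old mapping made one pass over the graph per sampled layer; the prefix-sum form visits each node once regardless of how many layers were sampled, which is where the speedup claimed in the commit message comes from.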
--- .../galois/graphs/DegreeSyncStructures.h | 15 +-- libgnn/include/galois/graphs/GNNGraph.h | 22 ++++- libgnn/include/galois/graphs/GNNSubgraph.h | 4 +- .../graphs/GraphAggregationSyncStructures.h | 33 ++++--- libgnn/include/galois/layers/GNNLayer.h | 5 + libgnn/src/graphs/GNNGraph.cpp | 49 ++++++++++ libgnn/src/graphs/GNNSubgraph.cpp | 97 +++++++++++-------- libgnn/src/layers/GNNLayer.cpp | 88 +++++++++++++++++ libgnn/src/layers/SAGELayer.cpp | 37 +++++-- 9 files changed, 275 insertions(+), 75 deletions(-) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 44102a3807..91a94d64ac 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -62,7 +62,7 @@ struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; static size_t FeatVecSize() { - return gnn_sampled_out_degrees_->size();; + return gnn_sampled_out_degrees_->size(); } static ValTy extract(uint32_t lid, char&) { @@ -76,12 +76,12 @@ struct SubgraphDegreeSync { return vec_to_send; } - static void ExtractDirect(uint32_t lid, typename ValTy::value_type* to_write) { + static void ExtractDirect(uint32_t lid, + typename ValTy::value_type* to_write) { size_t count = 0; for (galois::LargeArray& layer_degrees : *gnn_sampled_out_degrees_) { - std::memcpy(&to_write[count], - &layer_degrees[lid], + std::memcpy(&to_write[count], &layer_degrees[lid], sizeof(typename ValTy::value_type)); count++; } @@ -96,7 +96,8 @@ struct SubgraphDegreeSync { } static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { - for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + for (size_t degree_index = 0; + degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; } return true; @@ -119,12 +120,12 @@ struct SubgraphDegreeSync { } static void setVal(uint32_t lid, char&, ValTy::value_type* y) { - for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { + for (size_t degree_index = 0; + degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; } } - // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { return false; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index e50d0197d4..9c2e6061bf 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -634,6 +634,16 @@ class GNNGraph { void DisableSubgraphChooseAll() { subgraph_choose_all_ = false; } void SetSubgraphChooseAll(bool a) { subgraph_choose_all_ = a; } + std::vector& GetMasterOffsets() { return sample_master_offsets_; } + std::vector& GetMirrorOffsets() { return sample_mirror_offsets_; } + + galois::DynamicBitSet& GetNonLayerZeroMasters() { + return non_layer_zero_masters_; + } + const galois::DynamicBitSet& GetNonLayerZeroMasters() const { + return non_layer_zero_masters_; + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -724,6 +734,17 @@ class GNNGraph { //! What timestep a node was added to sampled set; used to determine //! size of subgraph at each layer galois::LargeArray sample_node_timestamps_; + //! Count of how many masters are in each layer in a sampled subgraph. + std::vector sample_master_offsets_; + //! 
Count of how many mirrors are in each layer in a sampled subgraph. + std::vector sample_mirror_offsets_; + //! In a subgraph, all layer 0 masters are made the prefix of SIDs; other + //! masters that are not layer 0 will be scattered elsewhere. This bitset + //! tracks which of those SIDs are the masters. + //! This is required for master masking in certain layers in distributed + //! execution to avoid recomputation of certain gradients. + galois::DynamicBitSet non_layer_zero_masters_; + //! Indicates newly sampled nodes (for distributed synchronization of sampling //! status galois::DynamicBitSet new_sampled_nodes_; @@ -768,7 +789,6 @@ class GNNGraph { std::vector global_degrees_; std::vector global_train_degrees_; - // TODO vars for subgraphs as necessary bool use_subgraph_{false}; bool use_subgraph_view_{false}; bool subgraph_choose_all_{false}; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 81825e2ed1..ddd4c8d277 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -111,10 +111,10 @@ class GNNSubgraph { t->stop(); } + // TODO signature cleanup //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. Should be done every epoch when sampled graph changes. - void CreateSubgraphMapping(const GNNGraph& gnn_graph, - size_t num_sampled_layers); + void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 1270df5ff5..7501a7c23d 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -68,10 +68,7 @@ struct SampleFlagBitset { struct GNNSumAggregate { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_matrix_to_sync_column_length_; - } - + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { @@ -90,10 +87,13 @@ struct GNNSumAggregate { } //! return a vector of floats to sync - static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { - std::memcpy(to_write, - (char*)&(gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), - gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { + std::memcpy( + to_write, + (char*)&( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); } //! reduction is addition in this case; add received vector to @@ -166,16 +166,14 @@ struct GNNSumAggregate { struct GNNSampleSumAggregate { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_matrix_to_sync_column_length_; - } + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync static ValTy extract(uint32_t node_id, char&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
// assert(device_personality == DevicePersonality::CPU); - //ValTy extracted_vec(gnn_matrix_to_sync_column_length_); + // ValTy extracted_vec(gnn_matrix_to_sync_column_length_); ValTy extracted_vec; extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == @@ -194,14 +192,17 @@ struct GNNSampleSumAggregate { return extracted_vec; } - static void ExtractDirect(uint32_t node_id, typename ValTy::value_type* to_write) { + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return; } - std::memcpy(to_write, - (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id]* gnn_matrix_to_sync_column_length_]), - gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); + std::memcpy( + to_write, + (char*)&(gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * + gnn_matrix_to_sync_column_length_]), + gnn_matrix_to_sync_column_length_ * sizeof(typename ValTy::value_type)); } //! reduction is addition in this case; add received vector to diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index e61d398a64..786a973230 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -364,12 +364,17 @@ class GNNLayer { MaskInputNonMasters(input, std::numeric_limits::max()); } void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows, + const galois::DynamicBitSet&); + //! Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* input) { MaskGradientNonMasters(input, std::numeric_limits::max()); } void MaskGradientNonMasters(PointerWithSize* gradients, size_t max_rows); + void MaskGradientNonMasters(PointerWithSize* gradients, + size_t max_rows, const galois::DynamicBitSet&); //! 
Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b10ea8d26e..fdd2d6e1dc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -899,6 +899,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } else { subgraph_choose_all_ = true; } + + sample_master_offsets_.resize(num_layers + 1, 0); + sample_mirror_offsets_.resize(num_layers + 1, 0); } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { @@ -927,6 +930,9 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear node timestamps std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); + std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); + std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), @@ -958,15 +964,29 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } galois::GAccumulator local_seed_count; local_seed_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } + local_seed_count += 1; // 0 = seed node sample_node_timestamps_[*x] = 0; } }); + sample_master_offsets_[0] = master_offset.reduce(); + sample_mirror_offsets_[0] = mirror_offset.reduce(); + return local_seed_count.reduce(); } @@ -1036,15 +1056,29 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, galois::GAccumulator local_sample_count; local_sample_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } sample_node_timestamps_[*x] = timestamp; } } }); + assert(sample_master_offsets_.size() > timestamp); + assert(sample_mirror_offsets_.size() > timestamp); + sample_master_offsets_[timestamp] = master_offset.reduce(); + sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); EnableSubgraphChooseAll(); return local_sample_count.reduce(); @@ -1131,15 +1165,30 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // count sampled node size galois::GAccumulator local_sample_count; local_sample_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { + local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } sample_node_timestamps_[*x] = timestamp; } } }); + assert(sample_master_offsets_.size() > timestamp); + 
assert(sample_mirror_offsets_.size() > timestamp); + sample_master_offsets_[timestamp] = master_offset.reduce(); + sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); DisableSubgraphChooseAll(); return local_sample_count.reduce(); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 2493319904..360586b7df 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -28,8 +28,9 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( return num_subgraph_nodes_; } +// TODO signature cleanup void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( - const GNNGraph& gnn_graph, size_t num_sampled_layers) { + GNNGraph& gnn_graph, size_t) { galois::StatTimer timer("SIDMapping", kRegionName); TimerStart(&timer); @@ -51,60 +52,73 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); } - // TODO(loc) depending on overhead, can parallelize this with a prefix sum - // serial loop over LIDs to construct lid -> subgraph id mapping - uint32_t current_sid = 0; + galois::DynamicBitSet& non_layer_zero_masters = + gnn_graph.GetNonLayerZeroMasters(); + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - // split into 2 parts: masters, then mirrors + // init the bitset as necessary + if (non_layer_zero_masters.size() < num_subgraph_nodes_) { + non_layer_zero_masters.resize(num_subgraph_nodes_); + } else { + non_layer_zero_masters.reset(); + } + + // compute offsets for each layer + uint32_t layer_zero_offset = 0; + galois::PODResizeableArray layer_offsets; + layer_offsets.resize(master_offsets.size() - 1); + for (unsigned i = 0; i < layer_offsets.size(); i++) { + layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; + if (i > 0) { + // prefix summing + layer_offsets[i] += layer_offsets[i - 1]; + } + } + + // split into 2 parts: masters, then everything else size_t last_owned_node = *(gnn_graph.end_owned()); + galois::gInfo(last_owned_node); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { - // TODO should bound check the SID to max uint32_t - // note: if SID is max uint32t, then it's not valid - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + // master that won't be in prefix needs to be marked + non_layer_zero_masters.set(sid_to_use); + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } - // all nodes before this SID are master nodes *that matter* - // NOTE: there is a very subtle distinction here implementation wise - // that needs to be resolved in slightly more detail than this; - // there may be master nodes that are past this boundary that will - // not be covered by this begin_owned loop, which may cause problems down + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down // the line - // TODO(loc) see above - subgraph_master_boundary_ = 
current_sid; + subgraph_master_boundary_ = master_offsets[0]; + // everything else; none of these are master nodes for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == 0) { - // TODO should bound check the SID to max uint32_t - // note: if SID is max uint32t, then it's not valid - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; - } - } - galois::gDebug( - "Number of sampled nodes for subgraph construction layer 0 is ", - current_sid); - - // XXX each sampled layer can be queried in parallel (think prefix sum); do - // this if this becomes a bottleneck - for (size_t i = 1; i < num_sampled_layers + 1; i++) { - for (size_t local_node_id = 0; local_node_id < gnn_graph.size(); - local_node_id++) { - if (gnn_graph.SampleNodeTimestamp(local_node_id) == i) { - subgraph_id_to_lid_[current_sid] = local_node_id; - lid_to_subgraph_id_[local_node_id] = current_sid++; + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + } else { + sid_to_use = layer_zero_offset++; } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - galois::gDebug("Number of sampled nodes for subgraph construction, layer ", - i, " is ", current_sid); } - GALOIS_LOG_ASSERT(num_subgraph_nodes_ == current_sid); - // num_subgraph_nodes_ = current_sid; + GALOIS_LOG_ASSERT(layer_offsets.back() == num_subgraph_nodes_); TimerStop(&timer); } @@ -141,8 +155,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting( } } local_subgraph_in_degrees_[subgraph_id] = in_degrees; - // galois::gDebug("Local ID ", node_id, " SID ", subgraph_id, " out ", - // out_degrees, " in ", in_degrees); }, galois::loopname("DegreeCountingDoAll"), galois::steal()); @@ -231,7 +243,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation( galois::StatTimer timer("NodeFeatureCreation", kRegionName); TimerStart(&timer); size_t feat_length = gnn_graph.node_feature_length(); - // assumes everything is already setup subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 4c828dbb19..e4f14d7408 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -423,6 +423,45 @@ void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, #endif } +void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, + size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + if (!bs.test(non_master)) { + // TODO(loc) use a std 
function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} + void galois::GNNLayer::MaskGradientNonMasters( PointerWithSize* gradient, size_t max_rows) { assert(*(graph_.begin_owned()) == 0); @@ -465,3 +504,52 @@ void galois::GNNLayer::MaskGradientNonMasters( } #endif } + +void galois::GNNLayer::MaskGradientNonMasters( + PointerWithSize* gradient, size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradient->size()); + assert(end_node * row_index <= gradient->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, + row_index); + } else { +#endif + // galois::gInfo(start_node, " to ", end_node); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // if something is not a master, kill it + if (!bs.test(non_master)) { + // galois::gInfo("don't keep ", non_master); + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradient)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif +} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 70d85b853a..169dbe7ea3 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -206,7 +206,7 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, ", SAGE output temp var ", num_output_temp_elements, " (", FloatElementsToGB(num_output_temp_elements), " GB)"); - size_t buffer_size = (num_output_temp_elements * 0.02); + size_t buffer_size = (num_output_temp_elements * 0.02); #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); @@ -370,11 +370,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( if (!sage_config_.disable_concat) { // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); + if (graph_.IsSubgraphOn()) { + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); + } } #ifdef GALOIS_ENABLE_GPU @@ -411,7 +421,12 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // mask it, 
then use it // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 || sage_config_.disable_concat) { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); + } } // if concat is disabled, then input grad isn't masked; therefore, mask // this to get the same effect @@ -460,11 +475,21 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // disable concat part is here because otherwise it would get done elsewhere // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0 && sage_config_.disable_concat) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); + if (graph_.IsSubgraphOn()) { + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows, + graph_.GetNonLayerZeroMasters()); + } else { + MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); + } } // W' = F^T (FW)' From b8d89a138499698fe308b28b8ac8a729fd357595 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 25 Jun 2021 16:55:17 -0500 Subject: [PATCH 567/660] prints for gnnsubgraph --- libgnn/src/graphs/GNNSubgraph.cpp | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 360586b7df..6faa5ad419 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -183,8 +183,27 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); + galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", num_subgraph_edges_); + + galois::DGAccumulator empty_masters; + galois::DGAccumulator empty_mirrors; + empty_masters.reset(); + empty_mirrors.reset(); + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { + if (local_subgraph_out_degrees_[subgraph_id] == 0 && + local_subgraph_in_degrees_[subgraph_id] == 0) { + if (subgraph_id < subgraph_master_boundary_) { + empty_masters += 1; + } else { + if (gnn_graph.GetNonLayerZeroMasters().test(subgraph_id)) { + empty_masters += 1; + } else { + empty_mirrors += 1; + } + } + } underlying_graph_.fixEndEdge( subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( @@ -196,6 +215,10 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); } + uint32_t emaster = empty_masters.reduce(); + uint32_t emirror = empty_mirrors.reduce(); + galois::gInfo("empty masters percent is ", emaster / (float)num_subgraph_nodes_, " ", emaster); + galois::gInfo("empty mirrors percent is ", emirror / (float)num_subgraph_nodes_, " ", emirror); // save edges + save reference to layer sample status galois::do_all( From 2ac6505fb78e5405430b5ce4232661113a1cc197 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 
25 Jun 2021 17:17:38 -0500 Subject: [PATCH 568/660] Fixed empty master/mirror counting in subgraph --- libgnn/src/graphs/GNNSubgraph.cpp | 61 ++++++++++++++++++------------- 1 file changed, 36 insertions(+), 25 deletions(-) diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 6faa5ad419..c85da2a957 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -166,11 +166,42 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); TimerStart(&timer); + galois::DGAccumulator empty_masters; + galois::DGAccumulator empty_mirrors; + empty_masters.reset(); + empty_mirrors.reset(); + galois::DGAccumulator total_sn; + total_sn.reset(); + total_sn += num_subgraph_nodes_; + size_t global_sub_size = total_sn.reduce(); // prefix sum over subgraph degrees from previous phase to get starting points - for (size_t i = 1; i < num_subgraph_nodes_; i++) { - local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; - local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + for (size_t i = 0; i < num_subgraph_nodes_; i++) { + if (local_subgraph_out_degrees_[i] == 0 && + local_subgraph_in_degrees_[i] == 0) { + if (i < subgraph_master_boundary_) { + empty_masters += 1; + } else { + if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + empty_masters += 1; + } else { + empty_mirrors += 1; + } + } + } + if (i != 0) { + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + } + } + + uint32_t emaster = empty_masters.reduce(); + uint32_t emirror = empty_mirrors.reduce(); + if (gnn_graph.host_id() == 0) { + galois::gInfo("empty masters percent is ", emaster / (float)global_sub_size, + " ", emaster, " ", global_sub_size); + galois::gInfo("empty mirrors percent is ", emirror / (float)global_sub_size, + " ", emirror, " ", global_sub_size); } // allocate then set node endpoints @@ -183,27 +214,11 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); - galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", num_subgraph_edges_); - - galois::DGAccumulator empty_masters; - galois::DGAccumulator empty_mirrors; - empty_masters.reset(); - empty_mirrors.reset(); + galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", + num_subgraph_edges_); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { - if (local_subgraph_out_degrees_[subgraph_id] == 0 && - local_subgraph_in_degrees_[subgraph_id] == 0) { - if (subgraph_id < subgraph_master_boundary_) { - empty_masters += 1; - } else { - if (gnn_graph.GetNonLayerZeroMasters().test(subgraph_id)) { - empty_masters += 1; - } else { - empty_mirrors += 1; - } - } - } underlying_graph_.fixEndEdge( subgraph_id, local_subgraph_out_degrees_[subgraph_id]); underlying_graph_.FixEndInEdge( @@ -215,10 +230,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); } - uint32_t emaster = empty_masters.reduce(); - uint32_t emirror = empty_mirrors.reduce(); - galois::gInfo("empty masters percent is ", emaster / (float)num_subgraph_nodes_, " ", emaster); - galois::gInfo("empty mirrors percent is ", emirror / (float)num_subgraph_nodes_, " ", emirror); // save edges + save 
reference to layer sample status galois::do_all( From ea88e2757de179afc481bdb7a263aebe8a421777 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 25 Jun 2021 20:23:59 -0500 Subject: [PATCH 569/660] Remove dead mirrors from GNN sampling Some mirrors have no incoming/outgoing edges and are just activated by activation on another host; these should not exist in the sampled subgraph because they occupy rows/memory which results in wasted work. --- libgnn/include/galois/GraphNeuralNetwork.h | 19 +++ .../galois/graphs/DegreeSyncStructures.h | 4 +- libgnn/include/galois/graphs/GNNGraph.h | 14 +- libgnn/src/GraphNeuralNetwork.cpp | 11 +- libgnn/src/graphs/GNNGraph.cpp | 98 +++++++------- libgnn/src/graphs/GNNSubgraph.cpp | 127 +++++++++--------- 6 files changed, 153 insertions(+), 120 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 3b5b268daa..91bdf67d14 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -199,6 +199,25 @@ class GraphNeuralNetwork { //! most literature void GradientPropagation(); + //! # nodes may change in distributed setting due to dead mirrors; + //! given the # of nodes at each layer, fix the input/output rows + void CorrectRowCounts(const std::vector& nodes_at_each_layer) { + size_t layer_offset = 0; + // work backwards + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + (*back_iter) + ->ResizeInputOutputRows(nodes_at_each_layer[layer_offset + 1], + nodes_at_each_layer[layer_offset]); + layer_offset++; + } + } + GALOIS_LOG_ASSERT(layer_offset + 1 == nodes_at_each_layer.size()); + } + //! Call whenever resize occurs to correct reuse of pointers for layers void CorrectBackwardLinks(); diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 91a94d64ac..659541570d 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -61,9 +61,7 @@ struct InitialDegreeSync { struct SubgraphDegreeSync { using ValTy = galois::gstl::Vector; - static size_t FeatVecSize() { - return gnn_sampled_out_degrees_->size(); - } + static size_t FeatVecSize() { return gnn_sampled_out_degrees_->size(); } static ValTy extract(uint32_t lid, char&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 9c2e6061bf..09fe0bffe4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -309,11 +309,12 @@ class GNNGraph { size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, bool inductive_subgraph, size_t timestamp); - size_t ConstructSampledSubgraph(size_t num_sampled_layers) { + std::vector ConstructSampledSubgraph(size_t num_sampled_layers) { return ConstructSampledSubgraph(num_sampled_layers, false); }; //! 
Construct the subgraph from sampled edges and corresponding nodes - size_t ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view); + std::vector ConstructSampledSubgraph(size_t num_sampled_layers, + bool use_view); unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; @@ -590,6 +591,10 @@ class GNNGraph { } } + bool IsActiveInSubgraph(size_t node_id) const { + return definitely_sampled_nodes_.test(node_id); + } + //! Calculate norm factor considering the entire graph void CalculateFullNormFactor(); @@ -738,6 +743,11 @@ class GNNGraph { std::vector sample_master_offsets_; //! Count of how many mirrors are in each layer in a sampled subgraph. std::vector sample_mirror_offsets_; + //! Definitely sampled nodes + galois::DynamicBitSet definitely_sampled_nodes_; + + std::vector> master_offset_accum_; + std::vector> mirror_offset_accum_; //! In a subgraph, all layer 0 masters are made the prefix of SIDs; other //! masters that are not layer 0 will be scattered elsewhere. This bitset //! tracks which of those SIDs are the masters. diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b29ec3af88..110dff0ad4 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -215,7 +215,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); graph_->EnableSubgraphChooseAll(); CorrectBackwardLinks(); @@ -278,7 +278,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); } @@ -342,7 +342,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); mb_timer.stop(); } @@ -379,6 +379,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -411,7 +412,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { } // resize layer matrices - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -598,7 +599,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - graph_->ConstructSampledSubgraph(num_sampled_layers); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index fdd2d6e1dc..b23d5f81f4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -899,7 +899,9 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } else { subgraph_choose_all_ = true; } - + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + master_offset_accum_.resize(num_layers + 1); + mirror_offset_accum_.resize(num_layers + 1); 
sample_master_offsets_.resize(num_layers + 1, 0); sample_mirror_offsets_.resize(num_layers + 1, 0); } @@ -910,12 +912,14 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); + definitely_sampled_nodes_.reset(); galois::do_all(galois::iterate(begin_owned(), end_owned()), [&](const NodeIterator& x) { if (IsValidForPhase(*x, seed_phase)) { SetSampledNode(*x); bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); } else { UnsetSampledNode(*x); } @@ -933,6 +937,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + master_offset_accum_[i].reset(); + mirror_offset_accum_[i].reset(); + } + // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), @@ -996,11 +1005,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, use_subgraph_ = false; use_subgraph_view_ = false; - // galois::GAccumulator sampled; - // galois::GAccumulator total; - // sampled.reset(); - // total.reset(); - galois::do_all( galois::iterate(begin(), end()), [&](const NodeIterator& src_iter) { @@ -1019,11 +1023,12 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, } MakeEdgeSampled(edge_iter, agg_layer_num); - if (!IsInSampledGraph(partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); + if (!IsInSampledGraph(dest)) { + bitset_sample_flag_.set(dest); } - // sampled += 1; + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(dest); } } }, @@ -1056,29 +1061,15 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, galois::GAccumulator local_sample_count; local_sample_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } sample_node_timestamps_[*x] = timestamp; } } }); - assert(sample_master_offsets_.size() > timestamp); - assert(sample_mirror_offsets_.size() > timestamp); - sample_master_offsets_[timestamp] = master_offset.reduce(); - sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); EnableSubgraphChooseAll(); return local_sample_count.reduce(); @@ -1121,15 +1112,16 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, } } + uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); // if here, it means edge accepted; set sampled on, mark // as part of next set MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph( - partitioned_graph_->getEdgeDst(edge_iter))) { - bitset_sample_flag_.set( - partitioned_graph_->getEdgeDst(edge_iter)); + if (!IsInSampledGraph(edge_dst)) { + bitset_sample_flag_.set(edge_dst); } bitset_sampled_degrees_.set(*src_iter); + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(edge_dst); // degree increment 
sampled_out_degrees_[sample_layer_num][*src_iter]++; } @@ -1165,37 +1157,22 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // count sampled node size galois::GAccumulator local_sample_count; local_sample_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); // count # of seed nodes galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsInSampledGraph(x)) { - local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } sample_node_timestamps_[*x] = timestamp; } } }); - assert(sample_master_offsets_.size() > timestamp); - assert(sample_mirror_offsets_.size() > timestamp); - sample_master_offsets_[timestamp] = master_offset.reduce(); - sample_mirror_offsets_[timestamp] = mirror_offset.reduce(); DisableSubgraphChooseAll(); return local_sample_count.reduce(); } //! Construct the subgraph from sampled edges and corresponding nodes -size_t +std::vector galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the @@ -1215,13 +1192,36 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, ->sync( "Ignore"); } - size_t num_subgraph_nodes; + + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } + } + }); + + std::vector new_rows(master_offset_accum_.size()); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + sample_master_offsets_[i] = master_offset_accum_[i].reduce(); + sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); + new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + } + if (!use_view) { - num_subgraph_nodes = subgraph_->BuildSubgraph(*this, num_sampled_layers); + subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { // a view only has lid<->sid mappings - num_subgraph_nodes = - subgraph_->BuildSubgraphView(*this, num_sampled_layers); + subgraph_->BuildSubgraphView(*this, num_sampled_layers); } // after this, this graph is a subgraph @@ -1231,7 +1231,7 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, use_subgraph_view_ = true; } - return num_subgraph_nodes; + return new_rows; } size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index c85da2a957..fb1d7c78c6 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -39,11 +39,15 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + galois::GAccumulator subgraph_count; subgraph_count.reset(); galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), [&](uint32_t node_id) { - if 
(gnn_graph.IsInSampledGraph(node_id)) { + // if (gnn_graph.IsInSampledGraph(node_id)) { + if (gnn_graph.IsActiveInSubgraph(node_id)) { subgraph_count += 1; } }); @@ -54,9 +58,6 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( galois::DynamicBitSet& non_layer_zero_masters = gnn_graph.GetNonLayerZeroMasters(); - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - // init the bitset as necessary if (non_layer_zero_masters.size() < num_subgraph_nodes_) { non_layer_zero_masters.resize(num_subgraph_nodes_); @@ -78,21 +79,22 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( // split into 2 parts: masters, then everything else size_t last_owned_node = *(gnn_graph.end_owned()); - galois::gInfo(last_owned_node); for (size_t local_node_id = 0; local_node_id < last_owned_node; local_node_id++) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - // master that won't be in prefix needs to be marked - non_layer_zero_masters.set(sid_to_use); - } else { - sid_to_use = layer_zero_offset++; + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + // master that won't be in prefix needs to be marked + non_layer_zero_masters.set(sid_to_use); + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } @@ -105,16 +107,18 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( // everything else; none of these are master nodes for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); local_node_id++) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - } else { - sid_to_use = layer_zero_offset++; + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1]++; + } else { + sid_to_use = layer_zero_offset++; + } + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use++; } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } @@ -166,43 +170,44 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( const GNNGraph& gnn_graph) { galois::StatTimer timer("EdgeConstruction", kRegionName); TimerStart(&timer); - galois::DGAccumulator empty_masters; - galois::DGAccumulator empty_mirrors; - empty_masters.reset(); - empty_mirrors.reset(); - - galois::DGAccumulator total_sn; - total_sn.reset(); - total_sn += num_subgraph_nodes_; - size_t global_sub_size = total_sn.reduce(); + // galois::DGAccumulator empty_masters; + // galois::DGAccumulator empty_mirrors; + // 
empty_masters.reset(); + // empty_mirrors.reset(); + + // galois::DGAccumulator total_sn; + // total_sn.reset(); + // total_sn += num_subgraph_nodes_; + // size_t global_sub_size = total_sn.reduce(); + // prefix sum over subgraph degrees from previous phase to get starting points - for (size_t i = 0; i < num_subgraph_nodes_; i++) { - if (local_subgraph_out_degrees_[i] == 0 && - local_subgraph_in_degrees_[i] == 0) { - if (i < subgraph_master_boundary_) { - empty_masters += 1; - } else { - if (gnn_graph.GetNonLayerZeroMasters().test(i)) { - empty_masters += 1; - } else { - empty_mirrors += 1; - } - } - } - if (i != 0) { - local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; - local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; - } + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + // if (local_subgraph_out_degrees_[i] == 0 && + // local_subgraph_in_degrees_[i] == 0) { + // if (i < subgraph_master_boundary_) { + // empty_masters += 1; + // } else { + // if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + // empty_masters += 1; + // } else { + // empty_mirrors += 1; + // } + // } + //} + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; } - uint32_t emaster = empty_masters.reduce(); - uint32_t emirror = empty_mirrors.reduce(); - if (gnn_graph.host_id() == 0) { - galois::gInfo("empty masters percent is ", emaster / (float)global_sub_size, - " ", emaster, " ", global_sub_size); - galois::gInfo("empty mirrors percent is ", emirror / (float)global_sub_size, - " ", emirror, " ", global_sub_size); - } + // uint32_t emaster = empty_masters.reduce(); + // uint32_t emirror = empty_mirrors.reduce(); + // if (gnn_graph.host_id() == 0) { + // galois::gInfo("Empty masters percent is ", emaster / + // (float)global_sub_size, + // " ", emaster, " ", global_sub_size); + // galois::gInfo("Empty mirrors percent is ", emirror / + // (float)global_sub_size, + // " ", emirror, " ", global_sub_size); + //} // allocate then set node endpoints num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; @@ -214,8 +219,8 @@ void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation( underlying_graph_.CSCAllocate(); TimerStop(&alloc_time); - galois::gInfo("subgraph nodes and edges are ", num_subgraph_nodes_, " ", - num_subgraph_edges_); + galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ", + num_subgraph_nodes_, " ", num_subgraph_edges_); galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), [&](uint32_t subgraph_id) { From 8f6bfdeb9ad2cef882e2ce08c887b1452e028097 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 26 Jun 2021 15:20:38 -0500 Subject: [PATCH 570/660] GNN: FATAL in dead test code path --- libgnn/src/GraphNeuralNetwork.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 110dff0ad4..957e7a8eea 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -440,6 +440,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { DisableTimers(); float test_acc; if (!config_.test_minibatch_size()) { + // TODO something about this path breaks accuracy + GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " + "run for some reason"); bool f = graph_->SubgraphChooseAllStatus(); graph_->DisableSubgraph(); for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); From 2c2008c3d335ff884d3a3a73f318a878d97e3f09 Mon 
Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 26 Jun 2021 19:18:00 -0500 Subject: [PATCH 571/660] Fixed row correction code to remove dead mirrors Needed to do a prefix sum over actives in each layer when correcting rows after removal of dead mirror nodes. --- libgnn/include/galois/GraphNeuralNetwork.h | 2 ++ libgnn/src/graphs/GNNGraph.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 91bdf67d14..4e33bcb8fa 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -209,6 +209,8 @@ class GraphNeuralNetwork { GNNLayerType layer_type = (*back_iter)->layer_type(); if (layer_type == GNNLayerType::kGraphConvolutional || layer_type == GNNLayerType::kSAGE) { + GALOIS_LOG_ASSERT(nodes_at_each_layer[layer_offset + 1] >= + nodes_at_each_layer[layer_offset]); (*back_iter) ->ResizeInputOutputRows(nodes_at_each_layer[layer_offset + 1], nodes_at_each_layer[layer_offset]); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b23d5f81f4..ef92ef7615 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1215,6 +1215,9 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, sample_master_offsets_[i] = master_offset_accum_[i].reduce(); sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + if (i > 0) { + new_rows[i] += new_rows[i - 1]; + } } if (!use_view) { From 8c15669112e73aed7d51a81839e1829d5d0e6106 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Jun 2021 18:13:18 -0500 Subject: [PATCH 572/660] Sample sync; non subgraph node needs to return vec Before this commit, sample sync for a node not in the subgraph would return an empty vector. This is bad because Gluon will expect vectors of a certain length, and trying to copy beyond allocated memory may cause memory corruption. This fixes that by allocating dead space to serialize anyways. --- libgnn/include/galois/graphs/GraphAggregationSyncStructures.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 7501a7c23d..29fdd66e0c 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -178,6 +178,9 @@ struct GNNSampleSumAggregate { extracted_vec.reserve(gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { + // need to have correct size because serializer will expect + // it to be of a certain length + extracted_vec.resize(gnn_matrix_to_sync_column_length_, 0); return extracted_vec; } From 3d820f6cc46243cf63c16d10d429a826956c4506 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 29 Jun 2021 21:21:31 -0500 Subject: [PATCH 573/660] Resizes layers after mirror nodes are deleted Only resizes layers (i.e. allocates memory) after it is known how many mirror nodes remain after deletion of dead mirrors rather than resizing the worst case scenario and changing rows later. This is done to save memory. 
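As a rough sketch of the intended sizing order (toy code under stated assumptions: LayerDims and SizeSampledLayers are made-up names, and the filtering to GCN/SAGE layers is omitted; the real path in this patch goes through CorrectRowCounts and ResizeInputOutputRows):

    #include <cstddef>
    #include <vector>

    // rows[] holds the live (master + mirror) row counts per sampled layer
    // reported by subgraph construction once dead mirrors are dropped; it
    // needs layers.size() + 1 entries because layer i reads rows[i + 1]
    // input rows and writes rows[i] output rows.
    struct LayerDims {
      std::size_t input_rows;
      std::size_t output_rows;
    };

    // Size every layer exactly once, back to front, instead of allocating a
    // worst-case matrix up front and shrinking it afterwards.
    void SizeSampledLayers(std::vector<LayerDims>& layers,
                           const std::vector<std::size_t>& rows) {
      std::size_t offset = 0;
      for (auto it = layers.rbegin(); it != layers.rend(); ++it, ++offset) {
        it->input_rows  = rows[offset + 1];
        it->output_rows = rows[offset];
      }
    }
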
--- libgnn/include/galois/GraphNeuralNetwork.h | 5 +++ libgnn/src/GraphNeuralNetwork.cpp | 51 ++++++++++++---------- 2 files changed, 32 insertions(+), 24 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 4e33bcb8fa..7d71efa61c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -202,6 +202,11 @@ class GraphNeuralNetwork { //! # nodes may change in distributed setting due to dead mirrors; //! given the # of nodes at each layer, fix the input/output rows void CorrectRowCounts(const std::vector& nodes_at_each_layer) { + // assumes last layer is output row and resizes it based on first + // offset + gnn_layers_.back()->ResizeInputOutputRows(nodes_at_each_layer[0], + nodes_at_each_layer[0]); + size_t layer_offset = 0; // work backwards for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 957e7a8eea..1ed89e99cc 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -188,9 +188,11 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { uint32_t total = 0; while (true) { work_left_.reset(); - size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + graph_->PrepareNextTestMinibatch(); // last layer input size/output rows becomes seed node size - gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, seed_node_count); + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); @@ -200,14 +202,14 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { layer_type == GNNLayerType::kSAGE) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors - size_t current_sample_size; - current_sample_size = - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), - false, num_sampled_layers + 1); + // size_t current_sample_size; + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, + num_sampled_layers + 1); // resize this layer, change seed node count - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, seed_node_count); - seed_node_count = current_sample_size; + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); + // seed_node_count = current_sample_size; + num_sampled_layers++; // XXX resizes above only work for SAGE layers; will break if other // layers are tested @@ -257,7 +259,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -271,8 +273,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); // resizing - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; 
subgraph_layer_sizes.emplace_back(local_seed_node_count); num_sampled_layers++; @@ -313,7 +316,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { mb_timer.start(); size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; @@ -334,9 +337,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, - local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; num_sampled_layers++; } @@ -376,8 +379,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { seed_node_count); // last layer input size/output rows becomes seed node size - gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - seed_node_count); + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; @@ -404,8 +407,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); // resize this layer, change seed node count - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); seed_node_count = current_sample_size; num_sampled_layers++; } @@ -585,7 +588,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; - gnn_layers_.back()->ResizeRows(local_seed_node_count); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { GNNLayerType layer_type = (*back_iter)->layer_type(); @@ -595,9 +598,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), config_.inductive_subgraph_, num_sampled_layers + 1); // resizing - (*back_iter) - ->ResizeInputOutputRows(current_sample_size, - local_seed_node_count); + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); local_seed_node_count = current_sample_size; num_sampled_layers++; } From 46196de98c87f3e6143554902b7e4811eecb4a73 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 17:57:27 -0500 Subject: [PATCH 574/660] Softmax backward timer fix Timer for backward phase of softmax was coupled into the forward timer; this commit changes it to backward --- libgnn/src/layers/SoftmaxLayer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index eb6e900413..8b99db4073 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -88,7 +88,7 @@ galois::SoftmaxLayer::ForwardPhase( galois::PointerWithSize galois::SoftmaxLayer::BackwardPhaseCPU() { - galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); + galois::StatTimer timer("SoftmaxBackward", "SoftmaxLayer"); TimerStart(&timer); const size_t feature_length = layer_dimensions_.input_columns; From 
c99236cab58001997db76f806fa215c84df4404a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 18:00:58 -0500 Subject: [PATCH 575/660] Adjustable learning rate for Adam optimizer Command line argument added to adjust Adam learning rate for GNNs. --- lonestar/libgnnbench/include/GNNBench/Start.h | 1 + lonestar/libgnnbench/src/Input.cpp | 11 ++++++++++- lonestar/libgnnbench/src/Start.cpp | 1 + 3 files changed, 12 insertions(+), 1 deletion(-) diff --git a/lonestar/libgnnbench/include/GNNBench/Start.h b/lonestar/libgnnbench/include/GNNBench/Start.h index 48507df80e..125307e0c3 100644 --- a/lonestar/libgnnbench/include/GNNBench/Start.h +++ b/lonestar/libgnnbench/include/GNNBench/Start.h @@ -18,6 +18,7 @@ extern llvm::cl::opt cl_layer_type; extern llvm::cl::opt train_minibatch_size; extern llvm::cl::opt test_minibatch_size; extern llvm::cl::opt do_graph_sampling; +extern llvm::cl::opt learning_rate; #ifdef GALOIS_ENABLE_GPU std::string personality_str(DevicePersonality p); diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 3ebee8adea..7719340224 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -160,6 +160,11 @@ llvm::cl::opt cll::desc("# of epochs to test test set (default 0)"), cll::init(0)); +llvm::cl::opt + learning_rate("learningRate", + cll::desc("Adam optimizer learning rate (default 0.01)"), + cll::init(0.01)); + const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s) { switch (s) { case galois::graphs::GNNPartitionScheme::kOEC: @@ -304,8 +309,12 @@ CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { } GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + galois::AdamOptimizer::AdamConfiguration adam_config; + adam_config.alpha = learning_rate; + // TODO only adam works right now, add the others later - return std::make_unique(opt_sizes, num_layers); + return std::make_unique(adam_config, opt_sizes, + num_layers); } std::vector CreateFanOutVector() { diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index 9a7e747744..daff6ad114 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -116,6 +116,7 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, test_minibatch_size); galois::runtime::reportParam("GNNBench", "IsGraphSampled", do_graph_sampling); + galois::runtime::reportParam("GNNBench", "LearningRate", learning_rate); } char name[256]; From 46c07d91855fe2c19582e37426e54e104ee2bd7a Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 30 Jun 2021 18:33:50 -0500 Subject: [PATCH 576/660] Timers for full forward/backward phase in GNN Forward/backward timers before this commit did not catch non GCN/SAGE layers. This commit puts a timer around the forward/backward phase. Note that the accuracy check will not be caught by this timer but will still be in the epoch timer. 
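In sketch form, the change is a conditional StatTimer around the whole phase (fragment only; members such as timers_on_, gnn_layers_, and layer_input come from the diff below and are not redefined here):

    // Spans every layer's forward call, so non-GCN/SAGE layers such as the
    // softmax output layer are counted too; the timer only runs when
    // timers_on_ is set, so phases with timers disabled (see EnableTimers /
    // DisableTimers) stay out of the measurement.
    galois::StatTimer timer("DoInference", "GraphNeuralNetwork");
    if (timers_on_) {
      timer.start();
    }
    for (auto& layer_ptr : gnn_layers_) {
      layer_input = layer_ptr->ForwardPhase(layer_input);
    }
    if (timers_on_) {
      timer.stop();
    }
    // The accuracy check happens after timer.stop(), so it is covered only
    // by the per-epoch timer, as noted above.

The backward phase is wrapped the same way with a "GradientPropagation" timer.
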
--- libgnn/include/galois/GraphNeuralNetwork.h | 4 ++++ libgnn/src/GraphNeuralNetwork.cpp | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 7d71efa61c..ff13e24c41 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -231,7 +231,10 @@ class GraphNeuralNetwork { private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; + bool timers_on_{false}; + void EnableTimers() { + timers_on_ = true; galois::gDebug("Enabling timers"); graph_->EnableTimers(); for (auto& layer : gnn_layers_) @@ -239,6 +242,7 @@ class GraphNeuralNetwork { } void DisableTimers() { + timers_on_ = false; galois::gDebug("Disabling timers"); graph_->DisableTimers(); for (auto& layer : gnn_layers_) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 1ed89e99cc..7629a8ef57 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -655,6 +655,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { const galois::PointerWithSize galois::GraphNeuralNetwork::DoInference() { + galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + // start with graph features and pass it through all layers of the network galois::PointerWithSize layer_input = graph_->GetLocalFeatures(); @@ -663,6 +668,10 @@ galois::GraphNeuralNetwork::DoInference() { layer_input = ptr->ForwardPhase(layer_input); } + if (timers_on_) { + timer.stop(); + } + return layer_input; } @@ -688,6 +697,11 @@ float galois::GraphNeuralNetwork::GetGlobalAccuracy( } void galois::GraphNeuralNetwork::GradientPropagation() { + galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + // from output layer get initial gradients std::vector dummy; std::unique_ptr& output_layer = gnn_layers_.back(); @@ -715,6 +729,10 @@ void galois::GraphNeuralNetwork::GradientPropagation() { // update the weights of the layer gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); } + + if (timers_on_) { + timer.stop(); + } } void galois::GraphNeuralNetwork::CorrectBackwardLinks() { From 71248703017037c2fea52a3bfbd248c490cf0922 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 2 Jul 2021 20:39:54 -0500 Subject: [PATCH 577/660] sage layer assertion fix --- libgnn/src/layers/SAGELayer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 169dbe7ea3..8962ec319a 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -295,7 +295,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_columns * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); } else { assert(p_out_temp_.size() >= @@ -305,7 +305,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( UpdateEmbeddings(input_data, p_out_temp_.data(), false); // A(FW) assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_columns * layer_dimensions_.output_columns); + layer_dimensions_.output_rows * layer_dimensions_.output_columns); 
AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), p_forward_output_matrix_.data(), &output_column_intermediates_); From daf355f3a6e01368bbbbf0ef839509411af642af Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 3 Jul 2021 23:11:47 -0500 Subject: [PATCH 578/660] Mirror/master nodes in gluon now pointers Made the master/mirror node vars in Gluon pointers so that they can be swapped in/out; mostly used for subgraph mirror changes to avoid extra communications to dead mirrors on other hosts. --- .../include/galois/graphs/DistributedGraph.h | 3 + libcusp/include/galois/graphs/NewGeneric.h | 1 + .../include/galois/graphs/GluonSubstrate.h | 266 ++++++++++++------ 3 files changed, 177 insertions(+), 93 deletions(-) diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index bf88a17acf..0e3e5fa43c 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -543,6 +543,9 @@ class DistGraph { public: virtual ~DistGraph() {} + + unsigned GetLIDHost(uint64_t lid) const { return getHostIDImpl(getGID(lid)); } + //! Determines which host has the master for a particular node //! @returns Host id of node in question inline unsigned getHostID(uint64_t gid) const { return getHostIDImpl(gid); } diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 6f13f42737..4ff7832f3e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -2589,6 +2589,7 @@ class NewDistGraphGeneric : public DistGraph { for (uint32_t i = base_DistGraph::numOwned; i < base_DistGraph::numNodes; i++) { uint32_t globalID = base_DistGraph::localToGlobalVector[i]; + assert(graphPartitioner->retrieveMaster(globalID) != base_DistGraph::id); base_DistGraph::mirrorNodes[graphPartitioner->retrieveMaster(globalID)] .push_back(globalID); } diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index ae50e0e10f..860480b262 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -119,12 +119,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // memoization optimization //! Master nodes on different hosts. For broadcast; - std::vector> masterNodes; + std::vector> master_nodes_concrete_; + std::vector> subgraph_master_nodes_; + std::vector>* masterNodes; //! Mirror nodes on different hosts. For reduce; comes from the user graph //! during initialization (we expect user to give to us) - std::vector>& mirrorNodes; + std::vector>* mirrorNodes; //! Maximum size of master or mirror nodes on different hosts size_t maxSharedSize; + //! 
Maximum size of master or mirror nodes on different hosts + size_t original_max_shared_size_; #ifdef GALOIS_USE_BARE_MPI std::vector mpi_identity_groups; @@ -190,7 +194,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { continue; galois::runtime::SendBuffer b; - gSerialize(b, mirrorNodes[x]); + gSerialize(b, (*mirrorNodes)[x]); net.sendTagged(x, galois::runtime::evilPhase, std::move(b)); } @@ -204,7 +208,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); - galois::runtime::gDeserialize(p->second, masterNodes[p->first]); + galois::runtime::gDeserialize(p->second, (*masterNodes)[p->first]); } incrementEvilPhase(); } @@ -274,11 +278,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // convert the global ids stored in the master/mirror nodes arrays to local // ids // TODO: use 32-bit distinct vectors for masters and mirrors from here on - for (uint32_t h = 0; h < masterNodes.size(); ++h) { + for (uint32_t h = 0; h < masterNodes->size(); ++h) { galois::do_all( - galois::iterate(size_t{0}, masterNodes[h].size()), + galois::iterate(size_t{0}, (*masterNodes)[h].size()), [&](size_t n) { - masterNodes[h][n] = userGraph.getLID(masterNodes[h][n]); + (*masterNodes)[h][n] = userGraph.getLID((*masterNodes)[h][n]); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier("MasterNodes").c_str()), @@ -286,11 +290,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { galois::no_stats()); } - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { galois::do_all( - galois::iterate(size_t{0}, mirrorNodes[h].size()), + galois::iterate(size_t{0}, (*mirrorNodes)[h].size()), [&](size_t n) { - mirrorNodes[h][n] = userGraph.getLID(mirrorNodes[h][n]); + (*mirrorNodes)[h][n] = userGraph.getLID((*mirrorNodes)[h][n]); }, #if GALOIS_COMM_STATS galois::loopname(get_run_identifier("MirrorNodes").c_str()), @@ -302,30 +306,32 @@ class GluonSubstrate : public galois::runtime::GlobalObject { maxSharedSize = 0; // report masters/mirrors to/from other hosts as statistics - for (auto x = 0U; x < masterNodes.size(); ++x) { + for (auto x = 0U; x < masterNodes->size(); ++x) { if (x == id) continue; std::string master_nodes_str = "MasterNodesFrom_" + std::to_string(id) + "_To_" + std::to_string(x); galois::runtime::reportStatCond_Tsum( - RNAME, master_nodes_str, masterNodes[x].size()); - if (masterNodes[x].size() > maxSharedSize) { - maxSharedSize = masterNodes[x].size(); + RNAME, master_nodes_str, (*masterNodes)[x].size()); + if ((*masterNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*masterNodes)[x].size(); } } - for (auto x = 0U; x < mirrorNodes.size(); ++x) { + for (auto x = 0U; x < mirrorNodes->size(); ++x) { if (x == id) continue; std::string mirror_nodes_str = "MirrorNodesFrom_" + std::to_string(x) + "_To_" + std::to_string(id); galois::runtime::reportStatCond_Tsum( - RNAME, mirror_nodes_str, mirrorNodes[x].size()); - if (mirrorNodes[x].size() > maxSharedSize) { - maxSharedSize = mirrorNodes[x].size(); + RNAME, mirror_nodes_str, (*mirrorNodes)[x].size()); + if ((*mirrorNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*mirrorNodes)[x].size(); } } + original_max_shared_size_ = maxSharedSize; + sendInfoToHost(); // do not track memory usage of partitioning @@ -435,7 +441,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { cartesianGrid(_cartesianGrid), partitionAgnostic(_partitionAgnostic), 
substrateDataMode(_enforcedDataMode), numHosts(numHosts), num_run(0), num_round(0), currentBVFlag(nullptr), - mirrorNodes(userGraph.getMirrorNodes()) { + masterNodes(&master_nodes_concrete_), + mirrorNodes(&(userGraph.getMirrorNodes())) { is_a_graph_ = _userGraph.is_a_graph(); if (cartesianGrid.first != 0 && cartesianGrid.second != 0) { GALOIS_ASSERT(cartesianGrid.first * cartesianGrid.second == numHosts, @@ -455,7 +462,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { initBareMPI(); // master setup from mirrors done by setupCommunication call - masterNodes.resize(numHosts); + masterNodes->resize(numHosts); // setup proxy communication galois::CondStatTimer Tgraph_construct_comm( "GraphCommSetupTime", RNAME); @@ -464,11 +471,73 @@ class GluonSubstrate : public galois::runtime::GlobalObject { Tgraph_construct_comm.stop(); } + void RevertHandshakeToRealGraph() { + // XXX make sure I dont need anything else + masterNodes = &master_nodes_concrete_; + mirrorNodes = &(userGraph.getMirrorNodes()); + maxSharedSize = original_max_shared_size_; + } + + void + SetupSubgraphMirrors(std::vector>& subgraph_mirrors) { + galois::StatTimer t("SubgraphMirrorSetup"); + t.start(); + + // resetup master mirrors + masterNodes = &subgraph_master_nodes_; + mirrorNodes = &subgraph_mirrors; + masterNodes->clear(); + if (masterNodes->size() < numHosts) + masterNodes->resize(numHosts); + + // Exchange information for memoization optimization. + exchangeProxyInfo(); + + assert(masterNodes->size() == numHosts); + assert(mirrorNodes->size() == numHosts); + + // convert the global ids stored in the master/mirror nodes arrays to local + // ids + // TODO: use 32-bit distinct vectors for masters and mirrors from here on + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + galois::do_all( + galois::iterate(size_t{0}, (*masterNodes)[h].size()), + [&](size_t n) { + (*masterNodes)[h][n] = userGraph.getLID((*masterNodes)[h][n]); + }, + galois::no_stats()); + } + + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + galois::do_all( + galois::iterate(size_t{0}, (*mirrorNodes)[h].size()), + [&](size_t n) { + (*mirrorNodes)[h][n] = userGraph.getLID((*mirrorNodes)[h][n]); + }, + galois::no_stats()); + } + + maxSharedSize = 0; + for (auto x = 0U; x < masterNodes->size(); ++x) { + assert(x < mirrorNodes->size()); + if (x == id) + continue; + if ((*masterNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*masterNodes)[x].size(); + } + if ((*mirrorNodes)[x].size() > maxSharedSize) { + maxSharedSize = (*mirrorNodes)[x].size(); + } + } + + t.stop(); + } + +private: //////////////////////////////////////////////////////////////////////////////// // Data extraction from bitsets //////////////////////////////////////////////////////////////////////////////// -private: /** * Given a bitset, determine the indices of the bitset that are currently * set. @@ -820,7 +889,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*mirrorNodes) : (*masterNodes); SyncExtract2D( loopName, x, sharedNodes[x], b, elem_size); @@ -850,7 +920,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void getSendBuffer(std::string loopName, unsigned x, galois::runtime::SendBuffer& b, size_t elem_size) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*mirrorNodes) : (*masterNodes); if (BitsetFnTy::is_valid()) { syncExtract( @@ -886,7 +957,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ template void serializeMessage(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, VecType& val_vec, galois::runtime::SendBuffer& b) { @@ -931,7 +1003,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template void serializeMessageVecHack(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, VecType& val_vec, galois::runtime::SendBuffer& b) { @@ -971,7 +1044,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Calls data on the TwoDVector template void SerializeMessage2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, + size_t bit_set_count, + const std::vector& indices, galois::PODResizeableArray& offsets, galois::DynamicBitSet& bit_set_comm, TwoDVecType& two_d_vec, @@ -1014,12 +1088,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // Only serializes the prefix template - void - SerializeMessagePrefix2D(std::string loopName, DataCommMode data_mode, - size_t bit_set_count, std::vector& indices, - galois::PODResizeableArray& offsets, - galois::DynamicBitSet& bit_set_comm, - galois::runtime::SendBuffer& b) { + void SerializeMessagePrefix2D( + std::string loopName, DataCommMode data_mode, size_t bit_set_count, + const std::vector& indices, + galois::PODResizeableArray& offsets, + galois::DynamicBitSet& bit_set_comm, galois::runtime::SendBuffer& b) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string serialize_timer_str(syncTypeStr + "SerializeMessagePrefix_" + get_run_identifier(loopName)); @@ -1258,7 +1331,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ bool nothingToSend(unsigned host, SyncType syncType, WriteLocation writeLocation, ReadLocation readLocation) { - auto& sharedNodes = (syncType == syncReduce) ? mirrorNodes : masterNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*mirrorNodes) : (*masterNodes); // TODO refactor (below) if (!isCartCut) { return (sharedNodes[host].size() == 0); @@ -1287,7 +1361,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { */ bool nothingToRecv(unsigned host, SyncType syncType, WriteLocation writeLocation, ReadLocation readLocation) { - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*masterNodes) : (*mirrorNodes); // TODO refactor (above) if (!isCartCut) { return (sharedNodes[host].size() == 0); @@ -2041,8 +2116,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; static VecTy val_vec; // sometimes wasteful galois::PODResizeableArray& offsets = syncOffsets; @@ -2122,8 +2197,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::value>::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { std::string syncTypeStr = (syncType == syncReduce) ? "Reduce" : "Broadcast"; std::string extract_timer_str(syncTypeStr + "Extract_" + get_run_identifier(loopName)); @@ -2204,8 +2279,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { bool async, typename std::enable_if::type* = nullptr> void syncExtract(std::string loopName, unsigned from_id, - std::vector& indices, galois::runtime::SendBuffer& b, - size_t elem_size) { + const std::vector& indices, + galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; static VecTy val_vec; // sometimes wasteful @@ -2337,7 +2412,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename std::enable_if::type* = nullptr, typename std::enable_if::value>::type* = nullptr> void syncExtractFloatVecHack(std::string loopName, unsigned from_id, - std::vector& indices, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { // TODO(loc) assumption that type in the VecTy is a vector of floats @@ -2479,7 +2554,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { template void SyncExtract2D(std::string loopName, unsigned from_id, - std::vector& indices, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; @@ -2641,7 +2716,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { SyncType syncType, typename SyncFnTy, typename BitsetFnTy, typename VecTy, bool async, typename std::enable_if::type* = nullptr> - void syncExtract(std::string loopName, unsigned, std::vector& indices, + void syncExtract(std::string loopName, unsigned, + const std::vector& indices, galois::runtime::SendBuffer& b, size_t elem_size) { uint32_t num = indices.size() * elem_size; galois::DynamicBitSet& bit_set_comm = syncBitset; @@ -2684,13 +2760,11 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // vector extract, i.e. 
get element i of the vector (i passed in as // argument as well) if (data_mode == onlyData) { - // galois::gInfo(id, " node ", i, " has data to send"); bit_set_count = indices.size(); extractSubset( loopName, indices, bit_set_count, offsets, val_vec, i); } else if (data_mode != noData) { // bitsetData or offsetsData or gidsData - // galois::gInfo(id, " node ", i, " has data to send"); extractSubset( loopName, indices, bit_set_count, offsets, val_vec, i); } @@ -2926,9 +3000,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static VecTy val_vec; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3014,12 +3089,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { set_batch_timer_str.c_str(), RNAME); //////////////////////////////////////////////////////////////////////////// - galois::DynamicBitSet& bit_set_comm = syncBitset; + galois::DynamicBitSet& bit_set_comm = syncBitset; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3115,9 +3191,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static galois::gstl::Vector single_array; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3251,9 +3328,10 @@ class GluonSubstrate : public galois::runtime::GlobalObject { static VecTy val_vec; galois::PODResizeableArray& offsets = syncOffsets; - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; - uint32_t num = sharedNodes[from_id].size(); - size_t retval = 0; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); + uint32_t num = sharedNodes[from_id].size(); + size_t retval = 0; Tset.start(); @@ -3536,7 +3614,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (rb.size() == 0) { // create the receive buffers TRecvTime.start(); - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? (*masterNodes) : (*mirrorNodes); rb.resize(numHosts); request.resize(numHosts, MPI_REQUEST_NULL); @@ -3591,7 +3670,8 @@ class GluonSubstrate : public galois::runtime::GlobalObject { if (window.size() == 0) { // create the windows TRecvTime.start(); - auto& sharedNodes = (syncType == syncReduce) ? masterNodes : mirrorNodes; + auto& sharedNodes = + (syncType == syncReduce) ? 
(*masterNodes) : (*mirrorNodes); window.resize(numHosts); rb.resize(numHosts); @@ -4407,20 +4487,20 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // copy memoization meta-data m.num_master_nodes = - (unsigned int*)calloc(masterNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(masterNodes->size(), sizeof(unsigned int)); ; m.master_nodes = - (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(masterNodes->size(), sizeof(unsigned int*)); ; - for (uint32_t h = 0; h < masterNodes.size(); ++h) { - m.num_master_nodes[h] = masterNodes[h].size(); + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + m.num_master_nodes[h] = (*masterNodes)[h].size(); - if (masterNodes[h].size() > 0) { - m.master_nodes[h] = - (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + if ((*masterNodes)[h].size() > 0) { + m.master_nodes[h] = (unsigned int*)calloc((*masterNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(masterNodes[h].begin(), masterNodes[h].end(), + std::copy((*masterNodes)[h].begin(), (*masterNodes)[h].end(), m.master_nodes[h]); } else { m.master_nodes[h] = NULL; @@ -4428,19 +4508,19 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } m.num_mirror_nodes = - (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(mirrorNodes->size(), sizeof(unsigned int)); ; m.mirror_nodes = - (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(mirrorNodes->size(), sizeof(unsigned int*)); ; - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { - m.num_mirror_nodes[h] = mirrorNodes[h].size(); + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + m.num_mirror_nodes[h] = (*mirrorNodes)[h].size(); - if (mirrorNodes[h].size() > 0) { - m.mirror_nodes[h] = - (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); + if ((*mirrorNodes)[h].size() > 0) { + m.mirror_nodes[h] = (unsigned int*)calloc((*mirrorNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + std::copy((*mirrorNodes)[h].begin(), (*mirrorNodes)[h].end(), m.mirror_nodes[h]); } else { m.mirror_nodes[h] = NULL; @@ -4469,18 +4549,18 @@ class GluonSubstrate : public galois::runtime::GlobalObject { // copy memoization meta-data g_info.num_master_nodes = - (unsigned int*)calloc(masterNodes.size(), sizeof(unsigned int)); + (unsigned int*)calloc(masterNodes->size(), sizeof(unsigned int)); g_info.master_nodes = - (unsigned int**)calloc(masterNodes.size(), sizeof(unsigned int*)); + (unsigned int**)calloc(masterNodes->size(), sizeof(unsigned int*)); - for (uint32_t h = 0; h < masterNodes.size(); ++h) { - g_info.num_master_nodes[h] = masterNodes[h].size(); + for (uint32_t h = 0; h < masterNodes->size(); ++h) { + g_info.num_master_nodes[h] = (*masterNodes)[h].size(); - if (masterNodes[h].size() > 0) { - g_info.master_nodes[h] = - (unsigned int*)calloc(masterNodes[h].size(), sizeof(unsigned int)); + if ((*masterNodes)[h].size() > 0) { + g_info.master_nodes[h] = (unsigned int*)calloc((*masterNodes)[h].size(), + sizeof(unsigned int)); ; - std::copy(masterNodes[h].begin(), masterNodes[h].end(), + std::copy((*masterNodes)[h].begin(), (*masterNodes)[h].end(), g_info.master_nodes[h]); } else { g_info.master_nodes[h] = NULL; @@ -4488,16 +4568,16 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } g_info.num_mirror_nodes = - (unsigned int*)calloc(mirrorNodes.size(), sizeof(unsigned int)); + (unsigned 
int*)calloc(mirrorNodes->size(), sizeof(unsigned int)); g_info.mirror_nodes = - (unsigned int**)calloc(mirrorNodes.size(), sizeof(unsigned int*)); - for (uint32_t h = 0; h < mirrorNodes.size(); ++h) { - g_info.num_mirror_nodes[h] = mirrorNodes[h].size(); - - if (mirrorNodes[h].size() > 0) { - g_info.mirror_nodes[h] = - (unsigned int*)calloc(mirrorNodes[h].size(), sizeof(unsigned int)); - std::copy(mirrorNodes[h].begin(), mirrorNodes[h].end(), + (unsigned int**)calloc(mirrorNodes->size(), sizeof(unsigned int*)); + for (uint32_t h = 0; h < mirrorNodes->size(); ++h) { + g_info.num_mirror_nodes[h] = (*mirrorNodes)[h].size(); + + if ((*mirrorNodes)[h].size() > 0) { + g_info.mirror_nodes[h] = (unsigned int*)calloc((*mirrorNodes)[h].size(), + sizeof(unsigned int)); + std::copy((*mirrorNodes)[h].begin(), (*mirrorNodes)[h].end(), g_info.mirror_nodes[h]); } else { g_info.mirror_nodes[h] = NULL; From 225f3fca34769a117c81d367668e2f3e635e3336 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Sat, 3 Jul 2021 23:20:47 -0500 Subject: [PATCH 579/660] Subgraph mirror creation for GNN sampling Redoes Gluon handshake for live mirrors after subgraph creation; idea is to avoid broadcasting very large feature vectors to dead mirror nodes. There are a lot of dead commented out prints in this code that need to be removed; low priority for now, just trying to get this pushed to test on Stampede. --- libgnn/include/galois/graphs/GNNGraph.h | 5 +++++ libgnn/include/galois/graphs/GNNSubgraph.h | 8 ++++++++ .../graphs/GraphAggregationSyncStructures.h | 15 +++++++++++++++ libgnn/src/GraphNeuralNetwork.cpp | 8 +++++--- libgnn/src/graphs/GNNGraph.cpp | 17 +++++++++++------ libgnn/src/graphs/GNNSubgraph.cpp | 11 +++++++++++ 6 files changed, 55 insertions(+), 9 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 09fe0bffe4..1d639a9cbd 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -69,6 +69,7 @@ class GNNGraph { //! Return # of nodes in the partitioned graph size_t size() const { return partitioned_graph_->size(); } + size_t global_size() const { return partitioned_graph_->globalSize(); } //! Returns # of nodes in the *graph that is currently active*. size_t active_size() const { if (!use_subgraph_ && !use_subgraph_view_) { @@ -81,6 +82,9 @@ class GNNGraph { bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } + size_t GetHostID(size_t gid) const { + return partitioned_graph_->getHostID(gid); + } //! Node begin for all local nodes NodeIterator begin() const { @@ -325,6 +329,7 @@ class GNNGraph { void DisableSubgraph() { use_subgraph_ = false; use_subgraph_view_ = false; + sync_substrate_->RevertHandshakeToRealGraph(); } bool IsSubgraphOn() const { return use_subgraph_; } bool IsSubgraphViewOn() const { return use_subgraph_view_; } diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index ddd4c8d277..2836e3f181 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -12,6 +12,7 @@ class GNNSubgraph { std::numeric_limits::max()); // the subgraph to original graph maps are allocated on demand in gstl // vectors since those change every epoch + subgraph_mirrors_.resize(galois::runtime::getSystemNetworkInterface().Num); } //! 
Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits @@ -100,6 +101,10 @@ class GNNSubgraph { void EnableTimers() { use_timer_ = true; } void DisableTimers() { use_timer_ = false; } + std::vector>& GetSubgraphMirrors() { + return subgraph_mirrors_; + } + private: bool use_timer_{true}; void TimerStart(galois::StatTimer* t) { @@ -152,4 +157,7 @@ class GNNSubgraph { //! Maps from subgraph in-edge id to original graph edge id (used to check if //! edge exists in particular layer) galois::PODResizeableArray in_subedge_to_original_edge_; + + //! Mirror mappings for Gluon for subgraph + std::vector> subgraph_mirrors_; }; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 29fdd66e0c..8cfb3c9a5d 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -13,6 +13,7 @@ extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; extern galois::DynamicBitSet bitset_sample_flag_; +extern size_t subgraph_size_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -216,9 +217,12 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return false; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint("write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] += y[i]; @@ -231,9 +235,14 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return false; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] + // nodeid ", node_id, " sid ", (*gnn_lid_to_sid_pointer_)[node_id], + // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] += y[i]; @@ -251,6 +260,7 @@ struct GNNSampleSumAggregate { std::numeric_limits::max()) { return; } + assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -267,6 +277,11 @@ struct GNNSampleSumAggregate { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] + // broadxast nodeid ", node_id, " sid ", + // (*gnn_lid_to_sid_pointer_)[node_id], + // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * + // gnn_matrix_to_sync_column_length_ + i, "\n"); gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * gnn_matrix_to_sync_column_length_ + i] = y[i]; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 7629a8ef57..b6c38963ed 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -179,6 +179,7 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( float galois::GraphNeuralNetwork::MinibatchedTesting() { galois::gDebug("Minibatched Testing"); + 
graph_->DisableSubgraph(); graph_->ResetTestMinibatcher(); SetLayerPhases(galois::GNNPhase::kBatch); @@ -630,7 +631,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { if (!config_.test_minibatch_size()) { for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); layer++) { - // TODO nuclear resize + // TODO nuclear resize; this is **ridiculously** inefficient + // because full graph will be used even if not included in test + // k-hop neighborhood for eval (*layer)->ResizeRows(graph_->size()); } CorrectBackwardLinks(); @@ -649,8 +652,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { global_accuracy); } - // return global_accuracy; - return 0; + return global_accuracy; } const galois::PointerWithSize diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index ef92ef7615..19d7bb0ad5 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -41,6 +41,7 @@ namespace galois { namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; +size_t subgraph_size_ = 0; //! For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; @@ -204,6 +205,7 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, bool is_backward) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; + subgraph_size_ = active_size(); if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { @@ -907,8 +909,7 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, } size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); bitset_sample_flag_.resize(size()); bitset_sample_flag_.reset(); @@ -987,6 +988,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { mirror_offset += 1; } + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); local_seed_count += 1; // 0 = seed node sample_node_timestamps_[*x] = 0; @@ -1002,8 +1004,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, size_t timestamp) { - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); galois::do_all( galois::iterate(begin(), end()), @@ -1066,6 +1067,8 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + // galois::gInfo(host_prefix_, "Layer ", timestamp, " new node is ", + // GetGID(*x)); sample_node_timestamps_[*x] = timestamp; } } @@ -1177,8 +1180,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, bool use_view) { // false first so that the build process can use functions to access the // real graph - use_subgraph_ = false; - use_subgraph_view_ = false; + DisableSubgraph(); + gnn_sampled_out_degrees_ = &sampled_out_degrees_; // first, sync the degres of the sampled edges across all hosts @@ -1227,6 +1230,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, subgraph_->BuildSubgraphView(*this, num_sampled_layers); } + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors()); + // after this, this graph is a subgraph if (!use_view) { 
use_subgraph_ = true; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index fb1d7c78c6..141390e20e 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -5,6 +5,10 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( GNNGraph& gnn_graph, size_t num_sampled_layers) { galois::StatTimer timer("BuildSubgraph", kRegionName); TimerStart(&timer); + for (auto& vec : subgraph_mirrors_) { + vec.clear(); + // vec.reserve(num_subgraph_nodes_ - subgraph_master_boundary_); + } CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { return 0; @@ -118,6 +122,13 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } subgraph_id_to_lid_[sid_to_use] = local_node_id; lid_to_subgraph_id_[local_node_id] = sid_to_use++; + + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + // mirror node; gids because they need to be sent as gids + // and converted over later + assert(node_gid < gnn_graph.global_size()); + assert(subgraph_mirrors_.size() > gnn_graph.GetHostID(node_gid)); + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].push_back(node_gid); } } } From 59d6deeac5068fd781694aadf94dd7d44c197717 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 5 Jul 2021 21:43:25 -0500 Subject: [PATCH 580/660] Argument for gnn gluon handshake to disable timers Disable timer argument to be used for test phases to avoid counting time for that. --- libgluon/include/galois/graphs/GluonSubstrate.h | 11 ++++++++--- libgnn/src/graphs/GNNGraph.cpp | 3 ++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index 860480b262..dc834357b5 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -479,9 +479,12 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } void - SetupSubgraphMirrors(std::vector>& subgraph_mirrors) { + SetupSubgraphMirrors(std::vector>& subgraph_mirrors, + bool use_timer) { galois::StatTimer t("SubgraphMirrorSetup"); - t.start(); + if (use_timer) { + t.start(); + } // resetup master mirrors masterNodes = &subgraph_master_nodes_; @@ -530,7 +533,9 @@ class GluonSubstrate : public galois::runtime::GlobalObject { } } - t.stop(); + if (use_timer) { + t.stop(); + } } private: diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 19d7bb0ad5..36ef2ab58b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1230,7 +1230,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, subgraph_->BuildSubgraphView(*this, num_sampled_layers); } - sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors()); + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), + use_timer_); // after this, this graph is a subgraph if (!use_view) { From 1933cbf77f9f2a9b609435caf36255e3e3ab258d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 6 Jul 2021 21:44:23 -0500 Subject: [PATCH 581/660] Added one timer for all GNN backward syncs Max across hosts stat only works if the max being used is all from one host; the current setup sums 3 different sync times that can be from 3 different hosts which is incorrect. Therefore, wrap all backward sync things in a single timer. 
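As a rough standalone sketch of the pattern (the three placeholder functions below stand in for the existing backward aggregate sync and the two weight-gradient syncs; the actual change only wraps those call sites in GNNGraph.cpp, GNNLayer.cpp, and SAGELayer.cpp):

    #include "galois/Timer.h"

    // Placeholders for the three existing backward-phase syncs; declared
    // only so the sketch is self-contained.
    void BackwardAggregateSync();
    void WeightGradientSyncSum();
    void WeightGradientSyncSum2();

    // One outer timer around every backward sync: the max-across-hosts
    // statistic then reflects a single host's total instead of summing
    // maxima that may come from three different hosts.
    void TimedBackwardSyncs() {
      galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon");
      clubbed_timer.start();
      BackwardAggregateSync();
      WeightGradientSyncSum();
      WeightGradientSyncSum2();
      clubbed_timer.stop();
    }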
--- libgnn/src/graphs/GNNGraph.cpp | 6 ++++++ libgnn/src/layers/GNNLayer.cpp | 3 +++ libgnn/src/layers/SAGELayer.cpp | 3 +++ 3 files changed, 12 insertions(+) diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 36ef2ab58b..6f87f3e88e 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -217,9 +217,12 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, Bitset_graph_aggregate>("Ignore"); } } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); sync_substrate_->sync( "BackwardGraphAggregateSync"); + clubbed_timer.stop(); } } else { // setup the SID to LID map for the sync substrate to use (SID != LID) @@ -234,9 +237,12 @@ void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, Bitset_graph_aggregate>("Ignore"); } } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); sync_substrate_->sync( "BackwardGraphAggregateSync"); + clubbed_timer.stop(); } } } diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index e4f14d7408..885dc1f537 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -356,6 +356,8 @@ void galois::GNNLayer::ActivationDerivative( } void galois::GNNLayer::WeightGradientSyncSum() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_.size()); @@ -385,6 +387,7 @@ void galois::GNNLayer::WeightGradientSyncSum() { } #endif TimerStop(&t); + TimerStop(&clubbed_timer); } void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 8962ec319a..e28bc2d0c3 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -222,6 +222,8 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, } void galois::SAGELayer::WeightGradientSyncSum2() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); @@ -250,6 +252,7 @@ void galois::SAGELayer::WeightGradientSyncSum2() { } #endif TimerStop(&t); + TimerStop(&clubbed_timer); } const galois::PointerWithSize galois::SAGELayer::ForwardPhase( From 8799bff2476b069db449a7c7abcdc29d708ff15b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Jul 2021 20:26:56 -0500 Subject: [PATCH 582/660] Optimize layer 0 backward for SAGE layer Layer 0 can avoid aggregation communication/compute completely if aggregation is done before update in that layer. 
Therefore, ignore the agg/update flip completely for that layer and always do aggregation followed by update (will result in more inefficient forward phase, but in sampling setting unless you have incredibly large HL size it won't matter because layer 0 backward aggregation is insanely expensive) --- libgnn/src/layers/SAGELayer.cpp | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index e28bc2d0c3..eae594d824 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -65,6 +65,12 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_in_temp_elements = layer_dimensions_.output_rows * layer_dimensions_.input_columns; + if (layer_number_ == 0) { + // set this to true for layer 0; it avoids aggregation completely + // in the last layer for the backward phase + config_.disable_aggregate_after_update = true; + } + // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { @@ -366,6 +372,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // aggregate occurs regardless of layer being equal to 0 because it is // required in this case for the weight gradient calculation // this is (FW)' + // TODO: this is absolutely terrible performance wise as well; keep + // in mind AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); } @@ -458,6 +466,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // 0 means input gradient shouldn't get masked if (layer_number_ != 0) { + // NOTE: this is super nice because it avoids aggregation completely + // in the layer 0 setting // ---unmasked--- // transposed sgemm for derivative; in_temp is output assert(input_gradient->size() >= From 3ab8b2686c4458a619b0b092088779c8aa71b81c Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 7 Jul 2021 20:57:58 -0500 Subject: [PATCH 583/660] Undo last commit Tradeoff space needs better exploration --- libgnn/src/layers/SAGELayer.cpp | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index eae594d824..29f4719996 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -65,11 +65,15 @@ galois::SAGELayer::SAGELayer(size_t layer_num, size_t num_in_temp_elements = layer_dimensions_.output_rows * layer_dimensions_.input_columns; - if (layer_number_ == 0) { - // set this to true for layer 0; it avoids aggregation completely - // in the last layer for the backward phase - config_.disable_aggregate_after_update = true; - } + // if (layer_number_ == 0) { + // // set this to true for layer 0; it avoids aggregation completely + // // in the last layer for the backward phase + // config_.disable_aggregate_after_update = true; + // // TODO this *will* hurt test evaluation because test eval has no + // // backward phase, so the end-to-end benefits do not exist there + // // Solution to this is to allocate all intermediate structures for both + // // cases + make sure resize handles both cases + // } // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || From 20c9ce891359fbb840475e8ad360754712050c19 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 15:33:17 -0500 Subject: [PATCH 
584/660] Sample flag sync can be readSource Sample flag setting is readAny before this commit: rationale I had back then was that even if a node didn't have outgoing edges it should still be included in the graph. This actually isn't necessary though because I remove dead mirrors later in subgraph construction, and it turns out that this sync call is actually a major part of subgraph construction. Therefore, to optimize runtime, go to readAny. This can cause problems if some vertex is activated later without the sync (i.e. no consistent "time" in which a vertex is entered), so in the sync call you have to make sure not to set anything that exceeds the current layer's number of rows (or you get a segfault). --- libgnn/include/galois/graphs/GNNGraph.h | 5 ++-- .../graphs/GraphAggregationSyncStructures.h | 29 +++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 20 +++++++------ libgnn/src/layers/SAGELayer.cpp | 8 ++++- 4 files changed, 35 insertions(+), 27 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 1d639a9cbd..36115929d7 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -554,7 +554,8 @@ class GNNGraph { // issues later void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size) const { - AggregateSync(matrix_to_sync, matrix_column_size, false); + AggregateSync(matrix_to_sync, matrix_column_size, false, + std::numeric_limits::max()); }; //! Given a matrix and the column size, do an aggregate sync where each row @@ -563,7 +564,7 @@ class GNNGraph { //! Note that it's const because the only thing being used is the graph //! topology of this object; the thing modified is the passed in matrix void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - bool is_backward) const; + bool is_backward, uint32_t active_row_boundary) const; ////////////////////////////////////////////////////////////////////////////// // Sampling related diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 8cfb3c9a5d..50a07bdd4e 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -14,6 +14,7 @@ extern galois::DynamicBitSet bitset_graph_aggregate; extern galois::LargeArray* gnn_lid_to_sid_pointer_; extern galois::DynamicBitSet bitset_sample_flag_; extern size_t subgraph_size_; +extern size_t num_active_layer_rows_; #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; @@ -253,37 +254,35 @@ struct GNNSampleSumAggregate { //! No-op: readAny = overwritten anyways static void reset(uint32_t, char&) {} - //! 
element wise set + // version where you have a vector object static void setVal(uint32_t node_id, char&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); - if ((*gnn_lid_to_sid_pointer_)[node_id] == - std::numeric_limits::max()) { + uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; + if (converted_sid >= num_active_layer_rows_ || + converted_sid == std::numeric_limits::max()) { return; } - assert((*gnn_lid_to_sid_pointer_)[node_id] < subgraph_size_); + assert(converted_sid < subgraph_size_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { - gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * - gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_[converted_sid * gnn_matrix_to_sync_column_length_ + i] = y[i]; } } + + // version where you have a pointer only (more efficient because this + // version is for reading directly from the recv buffer) static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { - if ((*gnn_lid_to_sid_pointer_)[node_id] == - std::numeric_limits::max()) { + uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; + if (converted_sid >= num_active_layer_rows_ || + converted_sid == std::numeric_limits::max()) { return; } // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { - // galois::gPrint(galois::runtime::getSystemNetworkInterface().ID, "] - // broadxast nodeid ", node_id, " sid ", - // (*gnn_lid_to_sid_pointer_)[node_id], - // " write ", (*gnn_lid_to_sid_pointer_)[node_id] * - // gnn_matrix_to_sync_column_length_ + i, "\n"); - gnn_matrix_to_sync_[(*gnn_lid_to_sid_pointer_)[node_id] * - gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_[converted_sid * gnn_matrix_to_sync_column_length_ + i] = y[i]; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 6f87f3e88e..c12701d926 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -45,6 +45,7 @@ size_t subgraph_size_ = 0; //! 
For synchronization of graph aggregations galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; +size_t num_active_layer_rows_ = 0; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; @@ -200,12 +201,13 @@ bool galois::graphs::GNNGraph::IsValidForPhaseMasked( return (*mask_to_use)[lid]; } -void galois::graphs::GNNGraph::AggregateSync(GNNFloat* matrix_to_sync, - const size_t matrix_column_size, - bool is_backward) const { +void galois::graphs::GNNGraph::AggregateSync( + GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward, + uint32_t active_row_boundary) const { gnn_matrix_to_sync_ = matrix_to_sync; gnn_matrix_to_sync_column_length_ = matrix_column_size; subgraph_size_ = active_size(); + num_active_layer_rows_ = active_row_boundary; if (!use_subgraph_ && !use_subgraph_view_) { // set globals for the sync substrate if (!is_backward) { @@ -971,11 +973,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // Seed nodes sync if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SeedNodeSample"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } galois::GAccumulator local_seed_count; @@ -1058,11 +1060,11 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1155,11 +1157,11 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // correctly if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 29f4719996..636d7690b9 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -573,11 +573,15 @@ void galois::SAGELayer::AggregateAll( pts, bool is_backward) { std::string agg_timer_name = "AggregateCompute"; + size_t num_rows_to_handle; if (!is_backward) { agg_timer_name += "Forward"; + num_rows_to_handle = layer_dimensions_.output_rows; } else { agg_timer_name += "Backward"; + num_rows_to_handle = layer_dimensions_.input_rows; } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); TimerStart(&timer); @@ -597,8 +601,10 @@ void galois::SAGELayer::AggregateAll( AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, is_backward); TimerStop(&timer); + // aggregate sync - graph_.AggregateSync(aggregate_output, column_length, is_backward); + graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_rows_to_handle); #ifdef GALOIS_ENABLE_GPU } #endif From 44c003dd6f19db03d6956bd89516c5c4b99c9e0b Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 21:27:39 -0500 Subject: [PATCH 585/660] GCN barrier before start --- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index e3dd1cac77..f33fd89c38 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -13,6 +13,8 @@ int main(int argc, char* argv[]) { gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); + galois::runtime::getHostBarrier().wait(); + galois::StatTimer compute_timer("Timer_0"); compute_timer.start(); gnn->Train(num_epochs); From 14c6168ce12389c35f4e4381afadf624cdf6b196 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 12 Jul 2021 21:29:07 -0500 
Subject: [PATCH 586/660] Parallel SID mapping for GNN subgraph SID mapping isn't slow if graph is sufficiently small, but it becomes a big problem once graphs scale up. This commit makes the process parallel using prefixsums. TODO need to double check it code wise for sanity. --- libgnn/include/galois/graphs/GNNSubgraph.h | 46 +++++ libgnn/src/graphs/GNNSubgraph.cpp | 205 ++++++++++++++++----- 2 files changed, 205 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index 2836e3f181..d9abd10c30 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -121,6 +121,45 @@ class GNNSubgraph { //! original graph. Should be done every epoch when sampled graph changes. void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); + //! reset sid thread offsets used for parallel SID mapping creation + void ResetSIDThreadOffsets(size_t num_layers) { + if (!sid_thread_offsets_.size()) { + sid_thread_offsets_.resize(galois::getActiveThreads()); + galois::on_each([&](size_t thread_id, size_t) { + sid_thread_offsets_[thread_id].resize(num_layers); + }); + } + + if (!subgraph_mirror_offsets_.size()) { + subgraph_mirror_offsets_.resize(galois::getActiveThreads()); + galois::on_each([&](size_t thread_id, size_t) { + subgraph_mirror_offsets_[thread_id].resize( + galois::runtime::getSystemNetworkInterface().Num); + }); + } + + galois::do_all( + galois::iterate(size_t{0}, sid_thread_offsets_.size()), [&](size_t i) { + galois::PODResizeableArray& arr = sid_thread_offsets_[i]; + std::fill(arr.begin(), arr.end(), 0); + galois::PODResizeableArray& arr2 = + subgraph_mirror_offsets_[i]; + std::fill(arr2.begin(), arr2.end(), 0); + }); + + if (thread_zero_work_.size() < num_layers) { + thread_zero_work_.resize(num_layers); + } + if (thread_zero_mirror_offsets_.size() < + galois::runtime::getSystemNetworkInterface().Num) { + thread_zero_mirror_offsets_.resize( + galois::runtime::getSystemNetworkInterface().Num); + } + std::fill(thread_zero_work_.begin(), thread_zero_work_.end(), 0); + std::fill(thread_zero_mirror_offsets_.begin(), + thread_zero_mirror_offsets_.end(), 0); + } + //! Counts in and out degrees of all sampled nodes in the graph void DegreeCounting(const GNNGraph& gnn_graph); //! Creates edges @@ -159,5 +198,12 @@ class GNNSubgraph { galois::PODResizeableArray in_subedge_to_original_edge_; //! Mirror mappings for Gluon for subgraph + // std::vector> subgraph_mirrors_; std::vector> subgraph_mirrors_; + + //! 
Offsets to use for + std::vector> sid_thread_offsets_; + std::vector> subgraph_mirror_offsets_; + galois::PODResizeableArray thread_zero_work_; + galois::PODResizeableArray thread_zero_mirror_offsets_; }; diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 141390e20e..a19d1d1320 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -7,7 +7,6 @@ size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( TimerStart(&timer); for (auto& vec : subgraph_mirrors_) { vec.clear(); - // vec.reserve(num_subgraph_nodes_ - subgraph_master_boundary_); } CreateSubgraphMapping(gnn_graph, num_sampled_layers); if (num_subgraph_nodes_ == 0) { @@ -43,23 +42,30 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), std::numeric_limits::max()); - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - galois::GAccumulator subgraph_count; subgraph_count.reset(); galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), [&](uint32_t node_id) { - // if (gnn_graph.IsInSampledGraph(node_id)) { if (gnn_graph.IsActiveInSubgraph(node_id)) { subgraph_count += 1; } }); num_subgraph_nodes_ = subgraph_count.reduce(); + // if no subgraph, get out + if (num_subgraph_nodes_ == 0) { + subgraph_master_boundary_ = 0; + TimerStop(&timer); + return; + } + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + // allocate a bit more than necessary to avoid a big realloc + // if node value changes slightly later subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); } + // bitset to mark if a master is outside the "master only" boundary + // and not contiguous; needed to mask out non-masters galois::DynamicBitSet& non_layer_zero_masters = gnn_graph.GetNonLayerZeroMasters(); // init the bitset as necessary @@ -69,8 +75,12 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( non_layer_zero_masters.reset(); } + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + + ResetSIDThreadOffsets(master_offsets.size()); + // compute offsets for each layer - uint32_t layer_zero_offset = 0; galois::PODResizeableArray layer_offsets; layer_offsets.resize(master_offsets.size() - 1); for (unsigned i = 0; i < layer_offsets.size(); i++) { @@ -81,59 +91,162 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( } } - // split into 2 parts: masters, then everything else + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down + // the line; this is handled by the bitset above + subgraph_master_boundary_ = master_offsets[0]; + size_t last_owned_node = *(gnn_graph.end_owned()); - for (size_t local_node_id = 0; local_node_id < last_owned_node; - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; - // master that won't be in prefix needs to be marked - non_layer_zero_masters.set(sid_to_use); + // compute amount of work each thread needs to do + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned 
end_node; + // this thread always has a set number of nodes to run; this is it + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + // these arrays track how much work will need to be done by this + // thread + galois::PODResizeableArray& my_offsets = + sid_thread_offsets_[thread_id]; + galois::PODResizeableArray& my_mirror_offsets = + subgraph_mirror_offsets_[thread_id]; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + // only bother if node was active + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + // TODO(loc) this check shouldn't even be necessary; active in subgraph + // implies added at somepoint + if (node_timestamp != std::numeric_limits::max()) { + // tracks how many nodes for each timestamp this node will + // work with by incrementing this + my_offsets[node_timestamp]++; + + if (local_node_id >= last_owned_node) { + // this is a mirror node; get the host that the master is located + // on and increment this thread's mirror node count for that host + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + } } else { - sid_to_use = layer_zero_offset++; + GALOIS_LOG_WARN("shouldn't ever get here right?"); } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; } } + }); + + // prefix sum the threads + galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), + [&](size_t layer_num) { + for (size_t thread_id = 1; + thread_id < galois::getActiveThreads(); thread_id++) { + sid_thread_offsets_[thread_id][layer_num] += + sid_thread_offsets_[thread_id - 1][layer_num]; + } + }); + + for (unsigned i = 0; i < master_offsets.size() - 1; i++) { + if (i > 0) { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] + + layer_offsets[i - 1] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } else { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } } - // all nodes before this SID are master nodes in layer 0; - // NOTE: there are master nodes past this boundary that will - // not be covered by a begin_owned loop, which may cause problems down - // the line - subgraph_master_boundary_ = master_offsets[0]; + // last element of prefix sum needs to equal the correct layer offset + galois::do_all( + galois::iterate(uint32_t{0}, + galois::runtime::getSystemNetworkInterface().Num), + [&](size_t host_num) { + // for each host, get prefix sum of each thread's mirrors + for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); + thread_id++) { + subgraph_mirror_offsets_[thread_id][host_num] += + subgraph_mirror_offsets_[thread_id - 1][host_num]; + } + }); + + // allocate the mirror space; last element of prefix sum is total size + for (unsigned host_num = 0; + host_num < galois::runtime::getSystemNetworkInterface().Num; + host_num++) { + if (galois::runtime::getSystemNetworkInterface().ID == host_num) { + continue; + } + subgraph_mirrors_[host_num].resize( + subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); + } + + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; 
+ unsigned end_node; + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + + galois::PODResizeableArray& current_thread_offset = + thread_id != 0 ? sid_thread_offsets_[thread_id - 1] : thread_zero_work_; + galois::PODResizeableArray& my_mirror_offsets = + thread_id != 0 ? subgraph_mirror_offsets_[thread_id - 1] + : thread_zero_mirror_offsets_; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1] + + current_thread_offset[node_timestamp]++; + if (local_node_id < last_owned_node) { + // master node that is not in layer 0 (i.e. node_timestamp != 0) + non_layer_zero_masters.set(sid_to_use); + } + } else { + // node timestamp == 0; no layer offset needed because offset + // is 0 + sid_to_use = current_thread_offset[node_timestamp]++; + } - // everything else; none of these are master nodes - for (size_t local_node_id = last_owned_node; local_node_id < gnn_graph.size(); - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1]++; + // this is a mirror + if (local_node_id >= last_owned_node) { + // XXX(loc) mirror offsets + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + size_t my_offset = + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + + if (my_offset > + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) + GALOIS_LOG_FATAL( + "{} {}", my_offset, + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); + + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = + node_gid; + } + + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use; } else { - sid_to_use = layer_zero_offset++; + GALOIS_LOG_WARN("shouldn't ever get here right?"); } - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use++; - - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - // mirror node; gids because they need to be sent as gids - // and converted over later - assert(node_gid < gnn_graph.global_size()); - assert(subgraph_mirrors_.size() > gnn_graph.GetHostID(node_gid)); - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].push_back(node_gid); } } - } + }); - GALOIS_LOG_ASSERT(layer_offsets.back() == num_subgraph_nodes_); TimerStop(&timer); } From 7546f211f0d14834f86ef1927edcfd19b91e7d15 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 16 Jul 2021 20:17:36 -0500 Subject: [PATCH 587/660] Minibatch shuffling between epochs First half of ogbn paper fix: minibatches need to be shuffled between epochs to prevent overfitting to the same node minibatches in every epoch. The second half is input related: my remapping of the graph seems to have broken something as using the original graph results in much better accuracy. For the time being I will use that one. 
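The shuffling scheme itself is small; a simplified standalone sketch of what the MinibatchGenerator changes below do (the struct and method names here are illustrative, not the real class):

    #include <algorithm>
    #include <cstdint>
    #include <random>
    #include <vector>

    // Collect the ids set in the training mask once, reshuffle them at every
    // epoch reset, and hand out the next batch_size ids per minibatch.
    struct ShuffledMinibatcher {
      std::vector<uint32_t> ids; // positions set in the train mask
      size_t pos = 0;
      std::mt19937 rng{100};     // fixed seed so every host draws the same order

      void ResetEpoch() {
        pos = 0;
        std::shuffle(ids.begin(), ids.end(), rng);
      }

      // Marks the next minibatch in mask; returns how many nodes were taken.
      size_t NextMinibatch(std::vector<char>& mask, size_t batch_size) {
        std::fill(mask.begin(), mask.end(), 0);
        size_t taken = 0;
        while (pos < ids.size() && taken < batch_size) {
          mask[ids[pos++]] = 1;
          ++taken;
        }
        return taken;
      }
    };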
--- libgnn/include/galois/MinibatchGenerator.h | 54 ++++++++++++++++++++-- libgnn/include/galois/graphs/GNNGraph.h | 1 + libgnn/src/MinibatchGenerator.cpp | 14 +++++- 3 files changed, 64 insertions(+), 5 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 8a5063ed1d..459014f65a 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,8 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include +#include namespace galois { @@ -12,20 +14,64 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - current_position_{0}, master_bound_{master_bound} { + current_position_{0}, master_bound_{master_bound}, rand_generator_{ + 100} { GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } - void GetNextMinibatch(std::vector* batch_mask); + + void GetNextMinibatch(std::vector* batch_mask) { + if (!shuffle_mode_) { + OriginalGetNextMinibatch(batch_mask); + } else { + ShuffleGetNextMinibatch(batch_mask); + } + } + //! True if no more minibatches from this generator - bool NoMoreMinibatches() { return current_position_ == master_bound_; } + bool NoMoreMinibatches() { + if (!shuffle_mode_) { + return current_position_ == master_bound_; + } else { + return current_position_ >= all_indices_.size(); + } + } + //! Reset the only state (a position bit) - void ResetMinibatchState() { current_position_ = 0; } + void ResetMinibatchState() { + current_position_ = 0; + if (shuffle_mode_) { + std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + } + } + + void ShuffleMode() { + if (!shuffle_mode_) { + shuffle_mode_ = true; + all_indices_.reserve(master_bound_); + // setup all set indices for the minibatch + for (size_t pos = 0; pos < master_bound_; pos++) { + if (mask_to_minibatch_[pos]) { + all_indices_.emplace_back(pos); + } + } + // shuffle it + std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + printf("Number of things in minibatch generator is %lu\n", + all_indices_.size()); + } + } private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; size_t current_position_; size_t master_bound_; + std::vector all_indices_; + bool shuffle_mode_ = false; + std::mt19937 rand_generator_; + + void OriginalGetNextMinibatch(std::vector* batch_mask); + void ShuffleGetNextMinibatch(std::vector* batch_mask); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 36115929d7..0dc906c772 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -397,6 +397,7 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); + train_batcher_->ShuffleMode(); local_minibatch_mask_.resize(partitioned_graph_->size()); } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 48570c094e..a0c66f703b 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -1,7 +1,7 @@ #include "galois/MinibatchGenerator.h" #include -void galois::MinibatchGenerator::GetNextMinibatch( +void galois::MinibatchGenerator::OriginalGetNextMinibatch( std::vector* batch_mask) { assert(current_position_ <= mask_to_minibatch_.size()); assert(current_position_ <= master_bound_); @@ -33,3 +33,15 @@ void 
galois::MinibatchGenerator::GetNextMinibatch( current_position_++; } } + +void galois::MinibatchGenerator::ShuffleGetNextMinibatch( + std::vector* batch_mask) { + size_t current_count = 0; + std::fill(batch_mask->begin(), batch_mask->end(), 0); + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == minibatch_size_) + break; + } +} From b68fcae6f4a1e9e7b32119b49554000bafe0968f Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Jul 2021 21:29:23 -0500 Subject: [PATCH 588/660] WIP: timing subgraph clears and beginning to fix --- libgnn/src/GraphNeuralNetwork.cpp | 8 +++++++- libgnn/src/graphs/GNNGraph.cpp | 24 +++++++++++++++++++----- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index b6c38963ed..898523aedf 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -236,7 +236,7 @@ float galois::GraphNeuralNetwork::MinibatchedTesting() { } } - galois::gDebug("Minibatching Correct / Total ", correct, " ", total); + galois::gInfo("Minibatching Correct / Total ", correct, " ", total); if (choose_all_status) { graph_->EnableSubgraphChooseAll(); @@ -366,6 +366,8 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // create mini batch graphs and loop until minibatches on all hosts done while (true) { + galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); + galois::StatTimer sample_time("MinibatchSampling", kRegionName); galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); mb_timer.start(); @@ -374,15 +376,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { work_left_.reset(); galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches + prep_timer.start(); size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is for batch is ", seed_node_count); + prep_timer.stop(); // last layer input size/output rows becomes seed node size // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, // seed_node_count); + sample_time.start(); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); @@ -414,6 +419,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + sample_time.stop(); // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index c12701d926..edb6738bcc 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -932,7 +932,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } else { UnsetSampledNode(*x); } - }); + }, galois::loopname("InitialSeedSetting")); // unsets nodes set in previous iterations; for some reason they get // synchronized along with everything else even though bitset sample flag // should prevent it (that, or it's because they don't get sync'd that they @@ -941,10 +941,13 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { [&](const NodeIterator& x) { UnsetSampledNode(*x); }); // clear node timestamps + galois::StatTimer fill_time("ClearFillTime"); + fill_time.start(); std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), std::numeric_limits::max()); 
std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + fill_time.stop(); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { master_offset_accum_[i].reset(); @@ -954,7 +957,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear all sampled edges galois::do_all( galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), - [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }); + [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + galois::loopname("ClearSampleEdges")); sampled_edges_.reset(); // reset all degrees @@ -962,8 +966,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { galois::do_all( galois::iterate(sampled_out_degrees_), [&](galois::LargeArray& array) { - std::fill(array.begin(), array.end(), 0); + memset(array.data(), 0, array.size() * sizeof(uint32_t)) + //std::fill(array.begin(), array.end(), 0); + //std::fill(array.begin(), array.end(), 0); }, + galois::loopname("ClearAllDegrees"), galois::chunk_size<1>()); } @@ -1001,7 +1008,8 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // 0 = seed node sample_node_timestamps_[*x] = 0; } - }); + }, + galois::loopname("SeedNodeOffsetCounting")); sample_master_offsets_[0] = master_offset.reduce(); sample_mirror_offsets_[0] = mirror_offset.reduce(); @@ -1204,6 +1212,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, "Ignore"); } + galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); + offsets_n_rows_time.start(); galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { if (IsActiveInSubgraph(*x)) { if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { @@ -1219,7 +1229,9 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, "should have been timestamped at some point if active"); } } - }); + }, + galois::loopname("MasterMirrorOffset") + ); std::vector new_rows(master_offset_accum_.size()); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { @@ -1231,6 +1243,8 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, } } + offsets_n_rows_time.stop(); + if (!use_view) { subgraph_->BuildSubgraph(*this, num_sampled_layers); } else { From 44e7995c1d4ae04455a129c576d13b08755d2e01 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Mon, 19 Jul 2021 22:38:37 -0500 Subject: [PATCH 589/660] Parallel bitset reset and parallel std::fill fill and reset operations for GNNs are a very big bottleneck for large graphs. This commit adds parallel reset for bitsets and std::fill and uses them in the GNN libraries to avoid resetting them in parallel. 
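Usage sketch for the two helpers this commit adds (DynamicBitSet::ParallelReset and galois::ParallelSTL::fill), shown on a standalone function rather than the actual GNNGraph members:

    #include "galois/DynamicBitset.h"
    #include "galois/ParallelSTL.h"
    #include <cstdint>
    #include <limits>
    #include <vector>

    // Clear per-epoch sampling state with the parallel helpers instead of
    // serial reset()/std::fill, which dominate subgraph setup on large graphs.
    void ClearEpochState(galois::DynamicBitSet& sample_flag,
                         std::vector<uint32_t>& timestamps) {
      sample_flag.ParallelReset(); // do_all-backed equivalent of reset()
      galois::ParallelSTL::fill(timestamps.begin(), timestamps.end(),
                                std::numeric_limits<uint32_t>::max());
    }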
--- libgalois/include/galois/DynamicBitset.h | 6 + libgalois/include/galois/PODResizeableArray.h | 2 +- libgalois/include/galois/ParallelSTL.h | 6 + libgalois/include/galois/TwoDVector.h | 10 +- libgnn/include/galois/graphs/GNNSubgraph.h | 7 +- libgnn/src/GraphNeuralNetwork.cpp | 4 +- libgnn/src/MinibatchGenerator.cpp | 5 +- libgnn/src/graphs/GNNGraph.cpp | 149 ++++++++++-------- libgnn/src/graphs/GNNSubgraph.cpp | 5 +- 9 files changed, 114 insertions(+), 80 deletions(-) diff --git a/libgalois/include/galois/DynamicBitset.h b/libgalois/include/galois/DynamicBitset.h index 6bb9c34864..e2035d018a 100644 --- a/libgalois/include/galois/DynamicBitset.h +++ b/libgalois/include/galois/DynamicBitset.h @@ -109,6 +109,12 @@ class DynamicBitSet { */ void reset() { std::fill(bitvec.begin(), bitvec.end(), 0); } + void ParallelReset() { + galois::do_all( + galois::iterate(bitvec), + [&](galois::CopyableAtomic& to_reset) { to_reset = 0; }); + } + /** * Unset a range of bits given an inclusive range * diff --git a/libgalois/include/galois/PODResizeableArray.h b/libgalois/include/galois/PODResizeableArray.h index dc1cabdb48..acff59c8e9 100644 --- a/libgalois/include/galois/PODResizeableArray.h +++ b/libgalois/include/galois/PODResizeableArray.h @@ -187,7 +187,7 @@ class PODResizeableArray { void insert(iterator GALOIS_USED_ONLY_IN_DEBUG(position), InputIterator first, InputIterator last) { assert(position == end()); - size_t to_add = last - first; + size_t to_add = last - first; if (to_add > 0) { size_t old_size = size_; resize(old_size + to_add); diff --git a/libgalois/include/galois/ParallelSTL.h b/libgalois/include/galois/ParallelSTL.h index c22858c84f..4158a6dc5c 100644 --- a/libgalois/include/galois/ParallelSTL.h +++ b/libgalois/include/galois/ParallelSTL.h @@ -377,6 +377,12 @@ OutputIt partial_sum(InputIt first, InputIt last, OutputIt d_first) { } } +template +void fill(ForwardIt first, ForwardIt last, const T& value) { + galois::do_all(galois::iterate(first, last), + [&](auto& iter) { iter = value; }); +} + } // end namespace ParallelSTL } // end namespace galois #endif diff --git a/libgalois/include/galois/TwoDVector.h b/libgalois/include/galois/TwoDVector.h index 1af9fba505..396bb208af 100644 --- a/libgalois/include/galois/TwoDVector.h +++ b/libgalois/include/galois/TwoDVector.h @@ -1,7 +1,7 @@ #pragma once -#include "gstl.h" -#include "PODResizeableArray.h" +#include "gstl.h" +#include "PODResizeableArray.h" namespace galois { @@ -27,18 +27,18 @@ class TwoDVector { assert(to_copy == fixed_vector_size_); size_t array_index = index * fixed_vector_size_; std::memcpy((void*)(&(underlying_memory_[array_index])), - (void*)to_copy.data(), - sizeof(T) * fixed_vector_size_); + (void*)to_copy.data(), sizeof(T) * fixed_vector_size_); } PODResizeableArray& edit_data() { return underlying_memory_; } const PODResizeableArray& data() { return underlying_memory_; } void resize(size_t s) { underlying_memory_.resize(s); } size_t size() const { return underlying_memory_.size(); } + private: size_t num_elements_{0}; size_t fixed_vector_size_{0}; PODResizeableArray underlying_memory_; }; -} +} // namespace galois diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index d9abd10c30..c7692533ba 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -155,9 +155,10 @@ class GNNSubgraph { thread_zero_mirror_offsets_.resize( galois::runtime::getSystemNetworkInterface().Num); } - std::fill(thread_zero_work_.begin(), 
thread_zero_work_.end(), 0); - std::fill(thread_zero_mirror_offsets_.begin(), - thread_zero_mirror_offsets_.end(), 0); + galois::ParallelSTL::fill(thread_zero_work_.begin(), + thread_zero_work_.end(), 0); + galois::ParallelSTL::fill(thread_zero_mirror_offsets_.begin(), + thread_zero_mirror_offsets_.end(), 0); } //! Counts in and out degrees of all sampled nodes in the graph diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 898523aedf..39c8c03eb0 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -387,7 +387,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, // seed_node_count); - sample_time.start(); + sample_time.start(); // +1 later in call because 0 is already taken size_t num_sampled_layers = 0; for (auto back_iter = gnn_layers_.rbegin(); @@ -419,7 +419,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } - sample_time.stop(); + sample_time.stop(); // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index a0c66f703b..120a1e7533 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -1,4 +1,5 @@ #include "galois/MinibatchGenerator.h" +#include "galois/Galois.h" #include void galois::MinibatchGenerator::OriginalGetNextMinibatch( @@ -7,7 +8,7 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( assert(current_position_ <= master_bound_); assert(batch_mask->size() == mask_to_minibatch_.size()); - std::fill(batch_mask->begin(), batch_mask->end(), 0); + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); if (current_position_ >= master_bound_) { return; } @@ -37,7 +38,7 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { size_t current_count = 0; - std::fill(batch_mask->begin(), batch_mask->end(), 0); + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); while (current_position_ < all_indices_.size()) { (*batch_mask)[all_indices_[current_position_++]] = 1; current_count++; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index edb6738bcc..78d975ceee 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -919,20 +919,24 @@ void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { DisableSubgraph(); - bitset_sample_flag_.resize(size()); - bitset_sample_flag_.reset(); - definitely_sampled_nodes_.reset(); - - galois::do_all(galois::iterate(begin_owned(), end_owned()), - [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - bitset_sample_flag_.set(*x); - definitely_sampled_nodes_.set(*x); - } else { - UnsetSampledNode(*x); - } - }, galois::loopname("InitialSeedSetting")); + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + bitset_sample_flag_.ParallelReset(); + definitely_sampled_nodes_.ParallelReset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); + } else { + UnsetSampledNode(*x); + } + }, + 
galois::loopname("InitialSeedSetting")); // unsets nodes set in previous iterations; for some reason they get // synchronized along with everything else even though bitset sample flag // should prevent it (that, or it's because they don't get sync'd that they @@ -943,10 +947,13 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // clear node timestamps galois::StatTimer fill_time("ClearFillTime"); fill_time.start(); - std::fill(sample_node_timestamps_.begin(), sample_node_timestamps_.end(), - std::numeric_limits::max()); - std::fill(sample_master_offsets_.begin(), sample_master_offsets_.end(), 0); - std::fill(sample_mirror_offsets_.begin(), sample_mirror_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_node_timestamps_.begin(), + sample_node_timestamps_.end(), + std::numeric_limits::max()); + galois::ParallelSTL::fill(sample_master_offsets_.begin(), + sample_master_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), + sample_mirror_offsets_.end(), 0); fill_time.stop(); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { @@ -955,26 +962,33 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { } // clear all sampled edges - galois::do_all( - galois::iterate(edge_sample_status_.begin(), edge_sample_status_.end()), - [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, - galois::loopname("ClearSampleEdges")); + galois::StatTimer ctime("ClearSampleEdges"); + ctime.start(); + for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { + edge_layer.ParallelReset(); + } + ctime.stop(); + // galois::do_all( + // galois::iterate(edge_sample_status_.begin(), + // edge_sample_status_.end()), + // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + // galois::loopname("ClearSampleEdges")); + + sampled_edges_.ParallelReset(); - sampled_edges_.reset(); // reset all degrees if (!subgraph_choose_all_) { - galois::do_all( - galois::iterate(sampled_out_degrees_), - [&](galois::LargeArray& array) { - memset(array.data(), 0, array.size() * sizeof(uint32_t)) - //std::fill(array.begin(), array.end(), 0); - //std::fill(array.begin(), array.end(), 0); - }, - galois::loopname("ClearAllDegrees"), - galois::chunk_size<1>()); + galois::StatTimer cad_timer("ClearAllDegrees"); + cad_timer.start(); + for (galois::LargeArray& array : sampled_out_degrees_) { + galois::ParallelSTL::fill(array.begin(), array.end(), 0); + } + cad_timer.stop(); } - bitset_sampled_degrees_.resize(partitioned_graph_->size()); + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } bitset_sampled_degrees_.reset(); // Seed nodes sync @@ -987,6 +1001,7 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { ->sync( "Ignore"); } + galois::GAccumulator local_seed_count; local_seed_count.reset(); galois::GAccumulator master_offset; @@ -994,22 +1009,24 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { galois::GAccumulator mirror_offset; mirror_offset.reset(); // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } - // 
galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); - local_seed_count += 1; - // 0 = seed node - sample_node_timestamps_[*x] = 0; - } - }, - galois::loopname("SeedNodeOffsetCounting")); + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }, + galois::loopname("SeedNodeOffsetCounting")); sample_master_offsets_[0] = master_offset.reduce(); sample_mirror_offsets_[0] = mirror_offset.reduce(); @@ -1214,24 +1231,26 @@ galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); offsets_n_rows_time.start(); - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsActiveInSubgraph(*x)) { - if (sample_node_timestamps_[*x] != std::numeric_limits::max()) { - if (*x < *end_owned()) { - // master - master_offset_accum_[sample_node_timestamps_[*x]] += 1; - } else { - // mirror - mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != + std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } } - } else { - GALOIS_LOG_FATAL( - "should have been timestamped at some point if active"); - } - } - }, - galois::loopname("MasterMirrorOffset") - ); + }, + galois::loopname("MasterMirrorOffset")); std::vector new_rows(master_offset_accum_.size()); for (unsigned i = 0; i < master_offset_accum_.size(); i++) { diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index a19d1d1320..720ff95413 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -39,8 +39,9 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( assert(gnn_graph.size() == lid_to_subgraph_id_.size()); // clear all mappings - std::fill(lid_to_subgraph_id_.begin(), lid_to_subgraph_id_.end(), - std::numeric_limits::max()); + galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), + lid_to_subgraph_id_.end(), + std::numeric_limits::max()); galois::GAccumulator subgraph_count; subgraph_count.reset(); From 5648fd935ff2f1f71137543be095f248ebc46f44 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Tue, 20 Jul 2021 11:29:13 -0500 Subject: [PATCH 590/660] ogbnpapers remapping program Called "remap verify" but it doesn't actually verify; it remaps the original ogbn-papers using the existing binary remapping file on cdgc servers. 
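For orientation, the program added below is essentially a CSR rebuild under the new node numbering: count each remapped node's out-degree, prefix-sum those counts into the node index array, then copy every node's destinations (translated through the old-to-new map) into its slot before writing the header, index, and destination arrays to disk. A minimal serial sketch of that idea with hypothetical names (the real code below runs these loops with galois::do_all and asserts the mapping is a bijection):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // orig[u] holds the out-neighbors of old node u; new_to_old / old_to_new are
    // the remapping and its inverse. Produces the CSR arrays under the new IDs.
    void RemapToCSR(const std::vector<std::vector<uint32_t>>& orig,
                    const std::vector<uint32_t>& new_to_old,
                    const std::vector<uint32_t>& old_to_new,
                    std::vector<uint64_t>& index, std::vector<uint32_t>& dests) {
      std::size_t n = new_to_old.size();
      index.assign(n, 0);
      for (std::size_t v = 0; v < n; ++v)
        index[v] = orig[new_to_old[v]].size();   // out-degree of remapped node v
      for (std::size_t v = 1; v < n; ++v)
        index[v] += index[v - 1];                // prefix sum -> exclusive end offsets
      dests.assign(index.empty() ? 0 : index.back(), 0);
      for (std::size_t v = 0; v < n; ++v) {
        uint64_t pos = (v == 0) ? 0 : index[v - 1];
        for (uint32_t old_dst : orig[new_to_old[v]])
          dests[pos++] = old_to_new[old_dst];    // translate destinations as well
      }
    }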
--- libgnn/test/CMakeLists.txt | 4 ++ libgnn/test/remapverify.cpp | 104 ++++++++++++++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 libgnn/test/remapverify.cpp diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 98b1d01e3e..e646259f87 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -23,6 +23,10 @@ add_executable(mkl_micro_dgalois mkl_micro.cpp) target_link_libraries(mkl_micro_dgalois galois_gnn) target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) +add_executable(remapverify remapverify.cpp) +target_link_libraries(remapverify galois_gnn) +target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) + add_executable(mkl_micro_delete_galois mkl_micro.cpp) target_link_libraries(mkl_micro_delete_galois galois_gnn) target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) diff --git a/libgnn/test/remapverify.cpp b/libgnn/test/remapverify.cpp new file mode 100644 index 0000000000..169a0f129c --- /dev/null +++ b/libgnn/test/remapverify.cpp @@ -0,0 +1,104 @@ +#include "galois/Logging.h" +#include "galois/graphs/GNNGraph.h" + +// actually does remapping +int main() { + galois::DistMemSys G; + galois::graphs::LC_CSR_Graph orig; + orig.readGraphFromGRFile( + "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M.tgr"); + // orig.readGraphFromGRFile("/net/ohm/export/iss/inputs/Learning/ogbn-papers100M.gr"); + + std::vector node_indices; + node_indices.resize(orig.size(), 0); + std::vector destinations; + destinations.resize(orig.sizeEdges(), 0); + + // get mapping + std::string remap_name = + galois::default_gnn_dataset_path + "ogbn-papers100M-remap-mapping.bin"; + std::ifstream file_stream; + file_stream.open(remap_name, std::ios::binary | std::ios::in); + std::vector new_to_old(111059956); + file_stream.read((char*)new_to_old.data(), + sizeof(uint32_t) * new_to_old.size()); + file_stream.close(); + + std::vector old_to_new(111059956); + + galois::DynamicBitSet mark_all; + mark_all.resize(orig.size()); + mark_all.reset(); + + // get # edges on each node in remapped + galois::do_all( + galois::iterate(orig.begin(), orig.end()), [&](uint32_t remapped_id) { + uint32_t source_id = new_to_old[remapped_id]; + old_to_new[source_id] = remapped_id; + mark_all.set(source_id); + GALOIS_LOG_ASSERT(source_id < orig.size()); + // TODO check duplicates too + node_indices[remapped_id] = + std::distance(orig.edge_begin(source_id), orig.edge_end(source_id)); + }); + + galois::do_all(galois::iterate(0, 111059956), + [&](unsigned i) { GALOIS_LOG_ASSERT(mark_all.test(i)); }); + + // prefix sum it + for (size_t i = 1; i < node_indices.size(); i++) { + node_indices[i] += node_indices[i - 1]; + } + // write all edges + galois::do_all( + galois::iterate(orig.begin(), orig.end()), + [&](uint32_t remapped_id) { + uint32_t source_id = new_to_old[remapped_id]; + GALOIS_LOG_ASSERT(source_id < orig.size()); + uint64_t current_idx; + if (remapped_id != 0) { + current_idx = node_indices[remapped_id - 1]; + } else { + current_idx = 0; + } + uint64_t my_end = node_indices[remapped_id]; + + for (auto ei = orig.edge_begin(source_id); + ei != orig.edge_end(source_id); ei++) { + uint32_t dest = old_to_new[orig.getEdgeDst(ei)]; + destinations[current_idx++] = dest; + } + GALOIS_LOG_ASSERT(current_idx == my_end); + // TODO check duplicates too + // node_indices[remapped_id] = std::distance(orig.edge_begin(node_id), + // orig.edge_end(node_id)); + }, + galois::steal()); + + // write everything + struct Header { + 
uint64_t version; + uint64_t size; + uint64_t numNodes; + uint64_t numEdges; + }; + Header h; + h.version = 1; + h.size = 0; + h.numNodes = orig.size(); + h.numEdges = orig.sizeEdges(); + + std::string filename = + "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M-remap.tgr"; + // std::string filename = + // "/net/ohm/export/iss/inputs/Learning/ogbn-papers100M-remap.gr"; + std::ofstream write_stream; + write_stream.open(filename, std::ios::binary | std::ios::out); + write_stream.write((char*)&h, sizeof(Header)); + write_stream.write((char*)node_indices.data(), + sizeof(uint64_t) * node_indices.size()); + write_stream.write((char*)destinations.data(), + sizeof(uint32_t) * destinations.size()); + + write_stream.close(); +} From 0b8538b1d48cc0dc828487227ee4655ce23fd1e4 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 21 Jul 2021 23:21:20 -0500 Subject: [PATCH 591/660] GNN softmax, sync, masking correctness changes 1) You cannot use readSource for sampled nodes: they need to be active across all hosts the moment that they become active on one host because all other hosts need to start calculating the gradients for them in the backward phase because they may be used later for gradient computation. (unless they're *completely* disconnected in a subgraph, but that is unknown until sampling is done; therefore, readAny must be used) 2) Softmax was using a "is valid for phase" check for nodes: this is wrong because hosts will need to compute the softmax value for nodes that aren't part of its own batch but part of another hosts' batch. 3) A matrix was being masked incorrectly by SAGE due to a faulty if condition: this broke the distributed execution accuracy curve. The if has been fixed. Performance wise, (1) and (2) will affect performance due to increased sync volume and more compute in the softmax layer. 
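A condensed view of fix (3), with stand-in lambdas for the real MaskInputNonMasters/MaskGradientNonMasters calls in the SAGE backward pass below: the old else branch masked the gradient whenever this spot skipped the input mask, which also hit non-zero layers running with concat enabled and broke the distributed accuracy curve; the corrected flow masks the input for every non-zero layer and masks the gradient only for layer 0, where there is no input to mask.

    #include <functional>

    // Hypothetical control-flow sketch only; the real methods act on layer matrices.
    void MaskForBackward(int layer_number, bool concat_disabled,
                         const std::function<void()>& mask_input,
                         const std::function<void()>& mask_gradient) {
      if (layer_number != 0 && concat_disabled)
        mask_input();      // non-zero layers: mask non-master rows of the input
      if (layer_number == 0)
        mask_gradient();   // layer 0 cannot mask its input, so mask the gradient instead
    }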
--- libgnn/include/galois/graphs/GNNGraph.h | 1 + libgnn/src/GraphNeuralNetwork.cpp | 37 +++++++++++++++---------- libgnn/src/graphs/GNNGraph.cpp | 18 ++++-------- libgnn/src/layers/SAGELayer.cpp | 35 +++++++++++++++-------- libgnn/src/layers/SoftmaxLayer.cpp | 8 ------ 5 files changed, 53 insertions(+), 46 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 0dc906c772..e46e388bf1 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -79,6 +79,7 @@ class GNNGraph { } } + bool is_owned(size_t gid) const { return partitioned_graph_->isOwned(gid); } bool is_local(size_t gid) const { return partitioned_graph_->isLocal(gid); } size_t GetLID(size_t gid) const { return partitioned_graph_->getLID(gid); } size_t GetGID(size_t lid) const { return partitioned_graph_->getGID(lid); } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 39c8c03eb0..eca0e4022c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -295,23 +295,28 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { graph_->EnableSubgraph(); - size_t l_count = 0; - gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - (*back_iter) - ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], - subgraph_layer_sizes[l_count]); - l_count++; - } - } + // TODO(loc) this doesn't actually function as expected anymore + // with the numerous changes to the system; this commenting + // out is more of a hack for the train subgraph option (which + // probably shouldn't be used anyways) + + //size_t l_count = 0; + //gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + //for (auto back_iter = gnn_layers_.rbegin(); + // back_iter != gnn_layers_.rend(); back_iter++) { + // GNNLayerType layer_type = (*back_iter)->layer_type(); + // if (layer_type == GNNLayerType::kGraphConvolutional || + // layer_type == GNNLayerType::kSAGE) { + // (*back_iter) + // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + // subgraph_layer_sizes[l_count]); + // l_count++; + // } + //} CorrectBackwardLinks(); } - // beginning of epoch sampling + // beginning of epoch sampling (no minibatches) if (config_.do_sampling() && !config_.train_minibatch_size()) { galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); mb_timer.start(); @@ -398,6 +403,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // you can minibatch with sampling or minibatch and grab all // relevant neighbors size_t current_sample_size; + if (config_.do_sampling()) { current_sample_size = graph_->SampleEdges( (*back_iter)->graph_user_layer_number(), @@ -408,10 +414,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { (*back_iter)->graph_user_layer_number(), config_.inductive_subgraph_, num_sampled_layers + 1); } + galois::gDebug(graph_->host_prefix(), "Number of local nodes for layer ", (*back_iter)->graph_user_layer_number(), " is ", current_sample_size); + // resize this layer, change seed node count //(*back_iter) // ->ResizeInputOutputRows(current_sample_size, seed_node_count); @@ -424,6 +432,7 @@ float 
galois::GraphNeuralNetwork::Train(size_t num_epochs) { // resize layer matrices CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); CorrectBackwardLinks(); + // XXX resizes above only work for SAGE layers; will break if other // layers are tested diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 78d975ceee..e616465d1b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -994,11 +994,11 @@ size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { // Seed nodes sync if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SeedNodeSample"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1068,10 +1068,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, }, galois::steal(), galois::loopname("ChooseAllEdges")); - // galois::gPrint("Num sampled edges in inductive graph is ", - // sampled.reduce(), - // " out of ", total.reduce(), "\n"); - // update nodes, then communicate update to all hosts so that they can // continue the exploration galois::do_all( @@ -1085,11 +1081,11 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } @@ -1100,8 +1096,6 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - // galois::gInfo(host_prefix_, "Layer ", timestamp, " new node is ", - // GetGID(*x)); sample_node_timestamps_[*x] = timestamp; } } @@ -1182,11 +1176,11 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, // correctly if (use_timer_) { sync_substrate_ - ->sync( + ->sync( "SampleFlag"); } else { sync_substrate_ - ->sync( + ->sync( "Ignore"); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 636d7690b9..0354035958 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,16 +155,15 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - galois::gDebug("Layer num ", layer_number_, " ", in_temp_1_.size(), " and ", - num_in_temp_elements, " ", layer_dimensions_.input_columns, - " ", layer_dimensions_.output_columns); + galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + in_temp_1_.size(), " and ", num_in_temp_elements, " ", + layer_dimensions_.input_columns, " ", + layer_dimensions_.output_columns); // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gDebug("in first if"); if (in_temp_1_.size() < num_in_temp_elements) { - galois::gDebug("in the resize"); galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, ", SAGE input temp var 1 ", num_in_temp_elements, " (", FloatElementsToGB(num_in_temp_elements), " GB)"); @@ -237,6 +236,7 @@ void galois::SAGELayer::WeightGradientSyncSum2() { galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); TimerStart(&t); int weight_size = static_cast(p_layer_weight_gradients_2_.size()); + #ifdef GALOIS_ENABLE_GPU bool gpu_direct_enabled = false; if (device_personality == DevicePersonality::GPU_CUDA && @@ -270,7 +270,8 @@ const galois::PointerWithSize 
galois::SAGELayer::ForwardPhase( galois::gDebug( "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -304,6 +305,7 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( assert(p_in_temp_1_.size() >= layer_dimensions_.output_rows * layer_dimensions_.input_columns); } + // aggregation and update AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, &input_column_intermediates_); @@ -313,9 +315,11 @@ const galois::PointerWithSize galois::SAGELayer::ForwardPhase( } else { assert(p_out_temp_.size() >= layer_dimensions_.input_rows * layer_dimensions_.output_columns); + // update to aggregate // FW UpdateEmbeddings(input_data, p_out_temp_.data(), false); + // A(FW) assert(p_forward_output_matrix_.size() >= layer_dimensions_.output_rows * layer_dimensions_.output_columns); @@ -383,7 +387,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } if (!sage_config_.disable_concat) { - // XXX masking may not be required in sampling case where rows change if (layer_number_ != 0) { if (graph_.IsSubgraphOn()) { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, @@ -422,6 +425,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( input_data.data(), input_gradient->data(), p_layer_weight_gradients_2_.data()); TimerStop(&concat_grad_timer); + #ifdef GALOIS_ENABLE_GPU } #endif @@ -443,8 +447,6 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); } } - // if concat is disabled, then input grad isn't masked; therefore, mask - // this to get the same effect #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -481,6 +483,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( // weight matrix) UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), true); + // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -489,8 +492,9 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } } else { // --unmasked-- - // disable concat part is here because otherwise it would get done elsewhere - // XXX masking may not be required in sampling case where rows change + + // disable concat is part of condition because otherwise this mask + // should have gotten done elsewhere if (layer_number_ != 0 && sage_config_.disable_concat) { if (graph_.IsSubgraphOn()) { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, @@ -498,7 +502,11 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } else { MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); } - } else { + } + + // layer number 0 means output needs to be masked because input cannot + // be masked + if (layer_number_ == 0) { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients if (graph_.IsSubgraphOn()) { @@ -532,6 +540,9 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif + // to get a correct result out temp mask cannot be masked; + // outtemp will only be masked if layer number is 0, so this + // is safe in all other cases if (layer_number_ != 
0) { // derivative for update // backout = F' diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 8b99db4073..70a6afa6c3 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -32,7 +32,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } } - if (graph_.IsValidForPhase(i, layer_phase_)) { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &p_backward_output_matrix_[feature_length * i]); @@ -53,10 +52,6 @@ galois::SoftmaxLayer::ForwardPhaseCPU( loss_accum += input_loss_[i]; handled += 1; #endif - } else { - VectorZero(feature_length, - &p_backward_output_matrix_[i * feature_length]); - } }, // TODO chunk size? // steal on as some threads may have nothing to work on @@ -94,10 +89,8 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { const size_t feature_length = layer_dimensions_.input_columns; galois::do_all( - // galois::iterate(graph_.begin(), graph_.end()), galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { - if (graph_.IsValidForPhase(node, layer_phase_)) { if (IsSampledLayer()) { if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraphSubgraph(node)) @@ -121,7 +114,6 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { p_backward_output_matrix_[node * feature_length + idx]; } } - } }, galois::steal(), galois::loopname("SoftmaxBackward")); From 1f0dbf6aaee3cc9197aacc700b82f33800129b2e Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 22 Jul 2021 15:09:09 -0500 Subject: [PATCH 592/660] GNN: time-based randomness; parreset in subgraph 1) Parallel reset for a bitset in subgraph construction. 2) RNG for minibatcher now determined by time when called. --- libgnn/include/galois/MinibatchGenerator.h | 13 ++++++++----- libgnn/src/graphs/GNNSubgraph.cpp | 2 +- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 459014f65a..73a65180d0 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,7 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include #include #include @@ -14,8 +15,10 @@ class MinibatchGenerator { MinibatchGenerator(const GNNMask& mask_to_minibatch, size_t minibatch_size, size_t master_bound) : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, - current_position_{0}, master_bound_{master_bound}, rand_generator_{ - 100} { + current_position_{0}, master_bound_{master_bound} { + // set seed based on time then initialize random generate with rand() + srand(time(NULL)); + rand_generator_ = std::make_unique(rand()); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -40,7 +43,7 @@ class MinibatchGenerator { void ResetMinibatchState() { current_position_ = 0; if (shuffle_mode_) { - std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); } } @@ -55,7 +58,7 @@ class MinibatchGenerator { } } // shuffle it - std::shuffle(all_indices_.begin(), all_indices_.end(), rand_generator_); + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); printf("Number of things in minibatch generator is %lu\n", all_indices_.size()); } @@ -68,7 +71,7 @@ class MinibatchGenerator { size_t master_bound_; std::vector all_indices_; bool shuffle_mode_ = false; - std::mt19937 rand_generator_; + std::unique_ptr rand_generator_; void OriginalGetNextMinibatch(std::vector* 
batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 720ff95413..5e95b079fd 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -73,7 +73,7 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( if (non_layer_zero_masters.size() < num_subgraph_nodes_) { non_layer_zero_masters.resize(num_subgraph_nodes_); } else { - non_layer_zero_masters.reset(); + non_layer_zero_masters.ParallelReset(); } std::vector& master_offsets = gnn_graph.GetMasterOffsets(); From a7477e8fc26120ace304c10d380c51f17c8d8658 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 23 Jul 2021 19:14:32 -0500 Subject: [PATCH 593/660] GNN distributed minibatch shuffler As # of hosts grow in distributed setting, it becomes more difficult to sample certain kinds of minibatch distributions because in the old scheme before this commit you always had to pick a fixed number on each host: this commit changes that up and does some sync to allow each host to pick a different number from each other to shuffle up the distribution more like single host. --- libgnn/CMakeLists.txt | 1 + .../galois/DistributedMinibatchTracker.h | 63 +++++++++++++++++++ libgnn/include/galois/GraphNeuralNetwork.h | 4 ++ libgnn/include/galois/MinibatchGenerator.h | 21 +++++++ libgnn/include/galois/graphs/GNNGraph.h | 7 ++- libgnn/src/DistributedMinibatchTracker.cpp | 57 +++++++++++++++++ libgnn/src/GraphNeuralNetwork.cpp | 36 +++++++++-- libgnn/src/MinibatchGenerator.cpp | 12 ++++ libgnn/src/layers/SAGELayer.cpp | 26 ++++---- 9 files changed, 208 insertions(+), 19 deletions(-) create mode 100644 libgnn/include/galois/DistributedMinibatchTracker.h create mode 100644 libgnn/src/DistributedMinibatchTracker.cpp diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 5bf32581d7..22a18c7fdf 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -1,4 +1,5 @@ set(sources + src/DistributedMinibatchTracker.cpp src/GNNMath.cpp src/GNNOptimizers.cpp src/GraphNeuralNetwork.cpp diff --git a/libgnn/include/galois/DistributedMinibatchTracker.h b/libgnn/include/galois/DistributedMinibatchTracker.h new file mode 100644 index 0000000000..1469db0e62 --- /dev/null +++ b/libgnn/include/galois/DistributedMinibatchTracker.h @@ -0,0 +1,63 @@ +#pragma once + +#include "galois/graphs/GNNGraph.h" +#include + +namespace galois { + +//! Tracks how many nodes remain to be chosen from every host's +//! minibatch and also determines how many to pull from this +//! particular host every iteration. +class DistributedMinibatchTracker { +public: + DistributedMinibatchTracker(size_t my_host_id, size_t num_hosts, + size_t my_minibatch_nodes, + size_t total_minibatch_size) + : my_host_id_{my_host_id}, num_hosts_{num_hosts}, + total_minibatch_size_{total_minibatch_size}, complete_hosts_{0}, + rng_object_{(long unsigned)rand() * (my_host_id_ + 1)}, + int_distribution_{0, (unsigned)num_hosts_ - 1} { + max_num_on_hosts_.resize(num_hosts_, 0); + current_num_on_hosts_.resize(num_hosts_, 0); + sampled_num_on_hosts_.resize(num_hosts_, 0); + max_num_on_hosts_[my_host_id_] = my_minibatch_nodes; + + // all reduce so all get the right values + // TODO technically all reduce would be sending unnecessary 0s + // but whatever this is relatively small + MPI_Allreduce(MPI_IN_PLACE, static_cast(max_num_on_hosts_.data()), + num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); + } + + //! 
Reset epoch = set all current sampled back to initial state + void ResetEpoch() { + galois::do_all( + galois::iterate(size_t{0}, num_hosts_), [&](size_t host_id_) { + current_num_on_hosts_[host_id_] = max_num_on_hosts_[host_id_]; + }); + complete_hosts_ = 0; + } + + size_t GetNumberForNextMinibatch(); + + bool OutOfWork() { return complete_hosts_ == num_hosts_; } + +private: + size_t my_host_id_; + size_t num_hosts_; + size_t total_minibatch_size_; + unsigned complete_hosts_; + + std::mt19937 rng_object_; + std::uniform_int_distribution int_distribution_; + //! Maximum amount of nodes on each host; used to reset state + std::vector max_num_on_hosts_; + //! Current number of nodes left on each host; used to know how + //! to sample on each host + std::vector current_num_on_hosts_; + //! Vector to be sync'd indicating how many to grab from each + //! batch + std::vector sampled_num_on_hosts_; +}; + +} // namespace galois diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index ff13e24c41..a813378116 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -8,6 +8,7 @@ #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" +#include "galois/DistributedMinibatchTracker.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GraphNeuralNetwork.cuh" @@ -265,6 +266,9 @@ class GraphNeuralNetwork { //! Termination detection for minibatching galois::DGAccumulator work_left_; + size_t num_hosts_{0}; + std::unique_ptr dist_minibatch_tracker_; + #ifdef GALOIS_ENABLE_GPU //! Holds all GPU functions GraphNeuralNetworkGPU gpu_object_; diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 73a65180d0..7e939c9cf4 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -30,6 +30,15 @@ class MinibatchGenerator { } } + void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { + if (!shuffle_mode_) { + // TODO + GALOIS_LOG_FATAL("not yet implemented"); + } else { + ShuffleGetNextMinibatch(batch_mask, num_to_get); + } + } + //! True if no more minibatches from this generator bool NoMoreMinibatches() { if (!shuffle_mode_) { @@ -64,6 +73,16 @@ class MinibatchGenerator { } } + //! Total number of nodes that can be minibatched by this minibatch + //! 
generator on this host + size_t ShuffleMinibatchTotal() { + if (shuffle_mode_) { + return all_indices_.size(); + } else { + return 0; + } + } + private: const GNNMask& mask_to_minibatch_; size_t minibatch_size_; @@ -75,6 +94,8 @@ class MinibatchGenerator { void OriginalGetNextMinibatch(std::vector* batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); + void ShuffleGetNextMinibatch(std::vector* batch_mask, + size_t num_to_get); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index e46e388bf1..044f82e7a2 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -391,7 +391,7 @@ class GNNGraph { //} ////////////////////////////////////////////////////////////////////////////// - void SetupTrainBatcher(size_t train_batch_size) { + size_t SetupTrainBatcher(size_t train_batch_size) { if (train_batcher_) { // clear before remake train_batcher_.reset(); @@ -400,6 +400,7 @@ class GNNGraph { local_training_mask_, train_batch_size, *end_owned()); train_batcher_->ShuffleMode(); local_minibatch_mask_.resize(partitioned_graph_->size()); + return train_batcher_->ShuffleMinibatchTotal(); } void ResetTrainMinibatcher() { train_batcher_->ResetMinibatchState(); } @@ -407,6 +408,10 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes size_t PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch(size_t num_to_get) { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); + return SetupNeighborhoodSample(GNNPhase::kBatch); + } //! Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp new file mode 100644 index 0000000000..609030ae23 --- /dev/null +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -0,0 +1,57 @@ +#include +#include "galois/DistributedMinibatchTracker.h" + +size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { + galois::StatTimer timer("DistributedGetNumberForNextMinibatch"); + timer.start(); + + // TODO + for (size_t i = 0; i < total_minibatch_size_; i++) { + // pick a host, increment + unsigned chosen_host = int_distribution_(rng_object_); + assert(chosen_host < num_hosts_); + sampled_num_on_hosts_[chosen_host]++; + } + // sync and post process *the same way on all hosts* + MPI_Allreduce(MPI_IN_PLACE, static_cast(sampled_num_on_hosts_.data()), + num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); + + size_t to_return = 0; + uint32_t leftover_to_allocate = 0; + + // TODO parallel? 
+ for (size_t i = 0; i < num_hosts_; i++) { + uint32_t proposed_to_sample = sampled_num_on_hosts_[i]; + size_t left_to_sample = current_num_on_hosts_[i]; + size_t actual_to_sample = 0; + if (left_to_sample > 0) { + actual_to_sample = std::min(proposed_to_sample, current_num_on_hosts_[i]); + + if (actual_to_sample < left_to_sample && leftover_to_allocate) { + // more left to sample and we have extra; dump more from extra if + // possible + uint32_t what_is_left = left_to_sample - actual_to_sample; + size_t more_to_sample = std::min(what_is_left, leftover_to_allocate); + leftover_to_allocate -= more_to_sample; + actual_to_sample += more_to_sample; + assert(actual_to_sample <= left_to_sample); + } + } + leftover_to_allocate = proposed_to_sample - actual_to_sample; + current_num_on_hosts_[i] -= actual_to_sample; + + sampled_num_on_hosts_[i] = 0; + if (my_host_id_ == i) { + to_return = actual_to_sample; + } + } + timer.stop(); + + if (leftover_to_allocate) { + // if there are leftovers, it means that there is no more work + // in this system period + complete_hosts_ = num_hosts_; + } + + return to_return; +} diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index eca0e4022c..feddc1fb2c 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -122,9 +122,17 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( config_.use_train_subgraph_); } + num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; if (config_.train_minibatch_size()) { - graph_->SetupTrainBatcher(config_.train_minibatch_size()); + size_t local_num = + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + if (num_hosts_ > 1) { + dist_minibatch_tracker_ = std::make_unique( + galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + local_num, config_.train_minibatch_size()); + } } + if (config_.test_minibatch_size()) { graph_->SetupTestBatcher(config_.test_minibatch_size()); } @@ -300,9 +308,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // out is more of a hack for the train subgraph option (which // probably shouldn't be used anyways) - //size_t l_count = 0; - //gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - //for (auto back_iter = gnn_layers_.rbegin(); + // size_t l_count = 0; + // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + // for (auto back_iter = gnn_layers_.rbegin(); // back_iter != gnn_layers_.rend(); back_iter++) { // GNNLayerType layer_type = (*back_iter)->layer_type(); // if (layer_type == GNNLayerType::kGraphConvolutional || @@ -365,6 +373,10 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); } else { graph_->ResetTrainMinibatcher(); + if (num_hosts_ > 1) { + dist_minibatch_tracker_->ResetEpoch(); + } + SetLayerPhases(galois::GNNPhase::kBatch); size_t batch_num = 0; @@ -382,7 +394,18 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::gInfo("Epoch ", epoch, " batch ", batch_num++); // break when all hosts are done with minibatches prep_timer.start(); - size_t seed_node_count = graph_->PrepareNextTrainMinibatch(); + size_t seed_node_count; + if (num_hosts_ > 1) { + size_t num_for_next_batch = + dist_minibatch_tracker_->GetNumberForNextMinibatch(); + galois::gInfo(graph_->host_prefix(), "Sampling ", num_for_next_batch, + " for this minibatch"); + seed_node_count = + graph_->PrepareNextTrainMinibatch(num_for_next_batch); + } else { + seed_node_count = graph_->PrepareNextTrainMinibatch(); + } + galois::gDebug(graph_->host_prefix(), "Number of 
local seed nodes is for batch is ", seed_node_count); @@ -503,6 +526,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.start(); if (!global_work_left) { + if (num_hosts_ > 1) { + GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + } break; } } diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 120a1e7533..c1bb8c221d 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -46,3 +46,15 @@ void galois::MinibatchGenerator::ShuffleGetNextMinibatch( break; } } + +void galois::MinibatchGenerator::ShuffleGetNextMinibatch( + std::vector* batch_mask, size_t num_to_get) { + size_t current_count = 0; + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == num_to_get) + break; + } +} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 0354035958..f078d97bd9 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,10 +155,10 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", - in_temp_1_.size(), " and ", num_in_temp_elements, " ", - layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + //galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + // in_temp_1_.size(), " and ", num_in_temp_elements, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); // if in temp is smaller than out temp, or if dropout exists if (!config_.disable_dropout || config_.disable_aggregate_after_update || @@ -267,11 +267,11 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - galois::gDebug( - "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", - layer_dimensions_.input_rows * layer_dimensions_.input_columns); + //galois::gDebug( + // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", + // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + // layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -742,10 +742,10 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - galois::gDebug("Layer ", graph_user_layer_number_, " ", - layer_dimensions_.output_rows, " ", - layer_dimensions_.input_columns, " ", - layer_dimensions_.output_columns); + //galois::gDebug("Layer ", graph_user_layer_number_, " ", + // layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); // CPU version is just a call into CBlas if (after) { galois::CBlasSGEMM( From 1a0a94d57ccf5daa80627ac621bd4cc535b65822 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 28 Jul 2021 15:09:38 -0500 Subject: [PATCH 594/660] Updated the distributed minibatch 
sampler 1) Uses normalized sampling to get more varied distributions across hosts 2) Fixes bug with original sampler where extra work from later hosts is not applied to hosts that come before it in the loop --- .../galois/DistributedMinibatchTracker.h | 16 ++- libgnn/src/DistributedMinibatchTracker.cpp | 109 +++++++++++------- libgnn/src/GraphNeuralNetwork.cpp | 6 +- 3 files changed, 83 insertions(+), 48 deletions(-) diff --git a/libgnn/include/galois/DistributedMinibatchTracker.h b/libgnn/include/galois/DistributedMinibatchTracker.h index 1469db0e62..be5496ba92 100644 --- a/libgnn/include/galois/DistributedMinibatchTracker.h +++ b/libgnn/include/galois/DistributedMinibatchTracker.h @@ -12,11 +12,13 @@ class DistributedMinibatchTracker { public: DistributedMinibatchTracker(size_t my_host_id, size_t num_hosts, size_t my_minibatch_nodes, - size_t total_minibatch_size) + size_t local_minibatch_size) : my_host_id_{my_host_id}, num_hosts_{num_hosts}, - total_minibatch_size_{total_minibatch_size}, complete_hosts_{0}, - rng_object_{(long unsigned)rand() * (my_host_id_ + 1)}, - int_distribution_{0, (unsigned)num_hosts_ - 1} { + local_minibatch_size_{local_minibatch_size}, + total_minibatch_size_{local_minibatch_size_ * num_hosts_}, + complete_hosts_{0}, rng_object_{(long unsigned)rand() * + (my_host_id_ + 1)}, + int_distribution_{1, 10} { max_num_on_hosts_.resize(num_hosts_, 0); current_num_on_hosts_.resize(num_hosts_, 0); sampled_num_on_hosts_.resize(num_hosts_, 0); @@ -40,11 +42,15 @@ class DistributedMinibatchTracker { size_t GetNumberForNextMinibatch(); - bool OutOfWork() { return complete_hosts_ == num_hosts_; } + bool OutOfWork() { + GALOIS_LOG_FATAL("NEED TO IMPLEMENT"); + return complete_hosts_ == num_hosts_; + } private: size_t my_host_id_; size_t num_hosts_; + size_t local_minibatch_size_; size_t total_minibatch_size_; unsigned complete_hosts_; diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp index 609030ae23..dddbc33519 100644 --- a/libgnn/src/DistributedMinibatchTracker.cpp +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -5,53 +5,82 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { galois::StatTimer timer("DistributedGetNumberForNextMinibatch"); timer.start(); - // TODO - for (size_t i = 0; i < total_minibatch_size_; i++) { - // pick a host, increment - unsigned chosen_host = int_distribution_(rng_object_); - assert(chosen_host < num_hosts_); - sampled_num_on_hosts_[chosen_host]++; + uint32_t my_share = int_distribution_(rng_object_); + if (current_num_on_hosts_[my_host_id_] == 0) { + my_share = 0; } + sampled_num_on_hosts_[my_host_id_] = my_share; // sync and post process *the same way on all hosts* - MPI_Allreduce(MPI_IN_PLACE, static_cast(sampled_num_on_hosts_.data()), - num_hosts_, MPI_UINT32_T, MPI_SUM, MPI_COMM_WORLD); - - size_t to_return = 0; - uint32_t leftover_to_allocate = 0; - - // TODO parallel? 
- for (size_t i = 0; i < num_hosts_; i++) { - uint32_t proposed_to_sample = sampled_num_on_hosts_[i]; - size_t left_to_sample = current_num_on_hosts_[i]; - size_t actual_to_sample = 0; - if (left_to_sample > 0) { - actual_to_sample = std::min(proposed_to_sample, current_num_on_hosts_[i]); - - if (actual_to_sample < left_to_sample && leftover_to_allocate) { - // more left to sample and we have extra; dump more from extra if - // possible - uint32_t what_is_left = left_to_sample - actual_to_sample; - size_t more_to_sample = std::min(what_is_left, leftover_to_allocate); - leftover_to_allocate -= more_to_sample; - actual_to_sample += more_to_sample; - assert(actual_to_sample <= left_to_sample); - } + MPI_Allgather(MPI_IN_PLACE, 0, MPI_UINT32_T, + static_cast(sampled_num_on_hosts_.data()), 1, + MPI_UINT32_T, MPI_COMM_WORLD); + + for (size_t i = 1; i < sampled_num_on_hosts_.size(); i++) { + sampled_num_on_hosts_[i] += sampled_num_on_hosts_[i - 1]; + } + uint32_t share_sum = sampled_num_on_hosts_.back(); + uint32_t num_per_unit = + std::max((total_minibatch_size_ + share_sum - 1) / share_sum, size_t{1}); + + size_t my_value_to_take = 0; + size_t extra_to_distribute = 0; + size_t sanity_sum = 0; + for (size_t host = 0; host < num_hosts_; host++) { + // determine how much to pull from each host based on sampled number + uint32_t start; + uint32_t end; + if (host == 0) { + start = 0; + end = std::min(num_per_unit * sampled_num_on_hosts_[host], + (uint32_t)total_minibatch_size_); + } else if (host == (num_hosts_ - 1)) { + start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], + (uint32_t)total_minibatch_size_); + end = total_minibatch_size_; + } else { + start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], + (uint32_t)total_minibatch_size_); + end = std::min(num_per_unit * sampled_num_on_hosts_[host], + (uint32_t)total_minibatch_size_); } - leftover_to_allocate = proposed_to_sample - actual_to_sample; - current_num_on_hosts_[i] -= actual_to_sample; - sampled_num_on_hosts_[i] = 0; - if (my_host_id_ == i) { - to_return = actual_to_sample; + uint32_t proposed_to_take = end - start; + sanity_sum += proposed_to_take; + + // is there actually that much? 
check + uint32_t actual_to_take = + std::min(proposed_to_take, current_num_on_hosts_[host]); + + if (actual_to_take < proposed_to_take) { + extra_to_distribute += proposed_to_take - actual_to_take; + } + // update counts, then return + current_num_on_hosts_[host] -= actual_to_take; + if (host == my_host_id_) { + my_value_to_take = actual_to_take; } } - timer.stop(); + GALOIS_LOG_ASSERT(sanity_sum == total_minibatch_size_); + + // redistribute extra to hosts with remaining + for (size_t host = 0; host < num_hosts_; host++) { + if (!extra_to_distribute) { + // leave when there is nothing selse to distribute + break; + } - if (leftover_to_allocate) { - // if there are leftovers, it means that there is no more work - // in this system period - complete_hosts_ = num_hosts_; + size_t left_on_host = current_num_on_hosts_[host]; + if (left_on_host) { + uint32_t to_take = std::min(extra_to_distribute, left_on_host); + extra_to_distribute -= to_take; + current_num_on_hosts_[host] -= to_take; + // update my count as neccessary + if (my_host_id_ == host) { + my_value_to_take += to_take; + } + } } + timer.stop(); - return to_return; + return my_value_to_take; } diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index feddc1fb2c..ea0c5dc05f 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -526,9 +526,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { epoch_timer.start(); if (!global_work_left) { - if (num_hosts_ > 1) { - GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); - } + // if (num_hosts_ > 1) { + // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + //} break; } } From 97e8632db69d7acd27a4b2b0ab4ac1411c026369 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 30 Jul 2021 19:53:13 -0500 Subject: [PATCH 595/660] Test: all hosts share the same minibatch shuffle All hosts initialize a training-length array and shuffle it the same way. Removes the need for the distributed minibatch generator as well. 
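The mechanism in the diff below, sketched with hypothetical helpers: every host seeds its generator identically, builds the same global-length list of training nodes (storing a node as its local ID when present, or the local graph size as a "not here" sentinel so positions still line up), and applies the same shuffle. Each minibatch step then advances a shared cursor by the global minibatch size, and a host only sets mask bits for entries that fall within its own masters.

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <random>
    #include <vector>

    std::vector<uint32_t>
    BuildSharedOrder(const std::vector<uint8_t>& global_train_mask,
                     const std::function<bool(uint32_t)>& is_local,   // assumption: GID -> present on this host?
                     const std::function<uint32_t(uint32_t)>& to_lid, // assumption: GID -> local ID
                     uint32_t local_graph_size) {
      std::mt19937 gen(1); // fixed seed, so the permutation is identical on every host
      std::vector<uint32_t> order;
      for (uint32_t gid = 0; gid < global_train_mask.size(); ++gid)
        if (global_train_mask[gid])
          order.push_back(is_local(gid) ? to_lid(gid) : local_graph_size);
      std::shuffle(order.begin(), order.end(), gen);
      return order;
    }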
--- libgnn/include/galois/MinibatchGenerator.h | 22 ++++++++---- libgnn/include/galois/graphs/GNNGraph.h | 5 ++- libgnn/src/GraphNeuralNetwork.cpp | 41 +++++++++++----------- libgnn/src/MinibatchGenerator.cpp | 13 +++++-- libgnn/src/graphs/GNNGraph.cpp | 11 ++++++ 5 files changed, 62 insertions(+), 30 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 7e939c9cf4..8c6ae2275f 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -2,6 +2,7 @@ #include "galois/GNNTypes.h" #include "galois/Logging.h" +#include "galois/graphs/DistributedGraph.h" #include #include #include @@ -17,8 +18,9 @@ class MinibatchGenerator { : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, current_position_{0}, master_bound_{master_bound} { // set seed based on time then initialize random generate with rand() - srand(time(NULL)); + srand(1); rand_generator_ = std::make_unique(rand()); + srand(time(NULL)); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -56,16 +58,24 @@ class MinibatchGenerator { } } - void ShuffleMode() { + void ShuffleMode(const galois::graphs::DistGraph& graph, + GNNMask& global_training_mask, size_t total_train_nodes) { if (!shuffle_mode_) { shuffle_mode_ = true; - all_indices_.reserve(master_bound_); + all_indices_.reserve(total_train_nodes); // setup all set indices for the minibatch - for (size_t pos = 0; pos < master_bound_; pos++) { - if (mask_to_minibatch_[pos]) { - all_indices_.emplace_back(pos); + for (size_t pos = 0; pos < global_training_mask.size(); pos++) { + if (global_training_mask[pos]) { + if (graph.isLocal(pos)) { + all_indices_.emplace_back(graph.getLID(pos)); + } else { + // size is greater than LID; use this as a "not present" + all_indices_.emplace_back(graph.size()); + } } } + GALOIS_LOG_ASSERT(all_indices_.size() == total_train_nodes); + // shuffle it std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); printf("Number of things in minibatch generator is %lu\n", diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 044f82e7a2..835c2cba01 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -398,7 +398,8 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); - train_batcher_->ShuffleMode(); + train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, + global_training_mask_range_.size); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -777,6 +778,8 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // TODO maybe revisit this and use an actual bitset + //! Bitset indicating which nodes are training nodes (global) + GNNMask global_training_mask_; //! Bitset indicating which nodes are training nodes GNNMask local_training_mask_; //! 
Bitset indicating which nodes are validation nodes diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index ea0c5dc05f..90fa6fd009 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -124,13 +124,13 @@ galois::GraphNeuralNetwork::GraphNeuralNetwork( num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; if (config_.train_minibatch_size()) { - size_t local_num = - graph_->SetupTrainBatcher(config_.train_minibatch_size()); - if (num_hosts_ > 1) { - dist_minibatch_tracker_ = std::make_unique( - galois::runtime::getSystemNetworkInterface().ID, num_hosts_, - local_num, config_.train_minibatch_size()); - } + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + // size_t local_num = + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_ = std::make_unique( + // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + // local_num, config_.train_minibatch_size()); + //} } if (config_.test_minibatch_size()) { @@ -373,9 +373,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { GradientPropagation(); } else { graph_->ResetTrainMinibatcher(); - if (num_hosts_ > 1) { - dist_minibatch_tracker_->ResetEpoch(); - } + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_->ResetEpoch(); + //} SetLayerPhases(galois::GNNPhase::kBatch); @@ -395,16 +395,17 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // break when all hosts are done with minibatches prep_timer.start(); size_t seed_node_count; - if (num_hosts_ > 1) { - size_t num_for_next_batch = - dist_minibatch_tracker_->GetNumberForNextMinibatch(); - galois::gInfo(graph_->host_prefix(), "Sampling ", num_for_next_batch, - " for this minibatch"); - seed_node_count = - graph_->PrepareNextTrainMinibatch(num_for_next_batch); - } else { - seed_node_count = graph_->PrepareNextTrainMinibatch(); - } + // if (num_hosts_ > 1) { + // size_t num_for_next_batch = + // dist_minibatch_tracker_->GetNumberForNextMinibatch(); + // galois::gInfo(graph_->host_prefix(), "Sampling ", + // num_for_next_batch, + // " for this minibatch"); + // seed_node_count = + // graph_->PrepareNextTrainMinibatch(num_for_next_batch); + //} else { + //} + seed_node_count = graph_->PrepareNextTrainMinibatch(); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is for batch is ", diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index c1bb8c221d..4d851aacac 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -37,12 +37,19 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { - size_t current_count = 0; galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + + size_t current_count = 0; + size_t global_minibatch_size = + minibatch_size_ * galois::runtime::getSystemNetworkInterface().Num; while (current_position_ < all_indices_.size()) { - (*batch_mask)[all_indices_[current_position_++]] = 1; + size_t candidate_lid = all_indices_[current_position_++]; + if (candidate_lid < batch_mask->size() && candidate_lid < master_bound_) { + (*batch_mask)[candidate_lid] = 1; + } + current_count++; - if (current_count == minibatch_size_) + if (current_count == global_minibatch_size) break; } } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index e616465d1b..9f980b6134 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -490,6 +490,11 @@ size_t 
galois::graphs::GNNGraph::ReadLocalMasksFromFile( // read mask range std::string mask_filename = input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; + bool train_is_on = false; + if (mask_type == "train") { + train_is_on = true; + } + std::ifstream mask_stream; mask_stream.open(mask_filename, std::ios::in); mask_stream >> range_begin >> range_end >> std::ws; @@ -520,6 +525,9 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; local_sample_count++; } + if (train_is_on) { + global_training_mask_[cur_line_num] = 1; + } } } cur_line_num++; @@ -560,6 +568,7 @@ size_t galois::graphs::GNNGraph::FindOtherMask() { void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); local_training_mask_.resize(partitioned_graph_->size()); local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); @@ -579,6 +588,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { if (partitioned_graph_->isLocal(i)) { local_training_mask_[partitioned_graph_->getLID(i)] = 1; } + global_training_mask_[i] = 1; } // validation @@ -608,6 +618,7 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { if (partitioned_graph_->isLocal(i)) { local_training_mask_[partitioned_graph_->getLID(i)] = 1; } + global_training_mask_[i] = 1; } // validation for (size_t i = global_validation_mask_range_.begin; From c42688d2d8f2eeadb0f3e78b720ed66ac675a4bb Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Wed, 11 Aug 2021 17:08:30 -0500 Subject: [PATCH 596/660] Major mirror deletion fix: don't delete masters Master proxies were being deleted from subgraphs if they did not have incoming/outgoing edges. This is a problem because the master is responsible for reducing updates from all other proxies which may still exist on other hosts, and this was the cause of accuracy degradation at a higher number of hosts. This problem did not appear much for edge cuts because master nodes would not get deleted since edges end up on master. 
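The rule the hunks below enforce can be stated as a predicate, shown here as a hypothetical sketch: a proxy survives subgraph construction if a sampled edge touches it, or if it is the master copy of any node that became active, because the master is the reduction point for that node's mirrors on other hosts during sync.

    // Hypothetical predicate only; in the code below this is expressed by setting
    // definitely_sampled_nodes_ for owned nodes the moment they are timestamped.
    bool KeepProxyInSubgraph(bool has_sampled_edge, bool is_active, bool is_master) {
      return has_sampled_edge || (is_active && is_master);
    }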
--- libgnn/include/galois/MinibatchGenerator.h | 4 +++- libgnn/include/galois/graphs/GNNGraph.h | 3 ++- libgnn/src/graphs/GNNGraph.cpp | 24 ++++++++++++++++++++++ libgnn/src/graphs/GNNSubgraph.cpp | 21 +++++++++++++++++++ 4 files changed, 50 insertions(+), 2 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index 8c6ae2275f..fd7c92ff50 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -74,7 +74,9 @@ class MinibatchGenerator { } } } - GALOIS_LOG_ASSERT(all_indices_.size() == total_train_nodes); + GALOIS_LOG_VASSERT(all_indices_.size() == total_train_nodes, + "{} vs right {}", all_indices_.size(), + total_train_nodes); // shuffle it std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 835c2cba01..18604361a4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -399,7 +399,7 @@ class GNNGraph { train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, - global_training_mask_range_.size); + global_training_count_); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -778,6 +778,7 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// // TODO maybe revisit this and use an actual bitset + size_t global_training_count_; //! Bitset indicating which nodes are training nodes (global) GNNMask global_training_mask_; //! Bitset indicating which nodes are training nodes diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 9f980b6134..1c7d19040b 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -534,6 +534,10 @@ size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( } mask_stream.close(); + if (train_is_on) { + global_training_count_ = valid_count; + } + if (valid_count != mask_range->size) { // overlapping masks: need to actually check the masks rather than use // ranges @@ -574,6 +578,8 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { local_testing_mask_.resize(partitioned_graph_->size()); if (dataset_name == "reddit") { + global_training_count_ = 153431; + // TODO reddit is hardcode handled at the moment; better way to not do // this? 
global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; @@ -607,6 +613,8 @@ void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { } } } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_count_ = 1207178; + global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; global_validation_mask_range_ = { .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; @@ -1107,6 +1115,14 @@ size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } sample_node_timestamps_[*x] = timestamp; } } @@ -1203,6 +1219,14 @@ size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, if (IsInSampledGraph(x)) { local_sample_count += 1; if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } sample_node_timestamps_[*x] = timestamp; } } diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index 5e95b079fd..f2148b2706 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -59,6 +59,27 @@ void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( return; } + // checking sanity + // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + // [&](uint32_t node_id) { + // if (gnn_graph.IsInSampledGraph(node_id) && + // !gnn_graph.IsActiveInSubgraph(node_id)) { + // // check if any edges are active + // for (auto a = gnn_graph.edge_begin(node_id); a != + // gnn_graph.edge_end(node_id);a++) { + // if (gnn_graph.IsEdgeSampledAny(a)) { + // galois::gWarn("ERROR node ", node_id); + // } + // } + // for (auto a = gnn_graph.in_edge_begin(node_id); a != + // gnn_graph.in_edge_end(node_id);a++) { + // if (gnn_graph.IsInEdgeSampledAny(a)) { + // galois::gWarn("ERROR in node ", node_id); + // } + // } + // } + // }); + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { // allocate a bit more than necessary to avoid a big realloc // if node value changes slightly later From c27153ab2339789b3e2c0736063c4a4586fe6f01 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Aug 2021 14:40:11 -0500 Subject: [PATCH 597/660] Revert minibatch selection to "pick locally" Instead of global minibatch view where all hosts advance selector the same way (can result in uneven minibatch size on each host), revert to scheme where all hosts select same # (which requires even distribution of training nodes across hosts). Revert this commit to get back to the other functionality. 
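The two selection schemes contrasted in this message can be sketched roughly as below. This is illustrative pseudologic with simplified types, not the `MinibatchGenerator` code itself; both ID lists are assumed to be pre-shuffled (with an identical seed on every host in the global case), and `owned_by_this_host` stands in for the real ownership check.

```
#include <cstddef>
#include <vector>

// "Pick locally": each host walks only its own training nodes and takes
// minibatch_size of them per step, so every host contributes the same count
// (this assumes training nodes are spread evenly across hosts).
std::vector<std::size_t> PickLocally(const std::vector<std::size_t>& local_train_ids,
                                     std::size_t& position,
                                     std::size_t minibatch_size) {
  std::vector<std::size_t> batch;
  while (position < local_train_ids.size() && batch.size() < minibatch_size) {
    batch.push_back(local_train_ids[position++]);
  }
  return batch;
}

// "Global view": every host advances the same window over the same global
// sequence and keeps only the IDs it owns, so per-host batch sizes can be
// uneven even though the global batch is consistent.
std::vector<std::size_t> PickFromGlobalWindow(
    const std::vector<std::size_t>& global_train_ids, std::size_t& position,
    std::size_t global_minibatch_size,
    const std::vector<bool>& owned_by_this_host) {
  std::vector<std::size_t> batch;
  std::size_t taken = 0;
  while (position < global_train_ids.size() && taken < global_minibatch_size) {
    std::size_t id = global_train_ids[position++];
    ++taken;
    if (id < owned_by_this_host.size() && owned_by_this_host[id]) {
      batch.push_back(id);
    }
  }
  return batch;
}
```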
--- libgnn/include/galois/MinibatchGenerator.h | 57 ++++++++++++++++------ libgnn/include/galois/graphs/GNNGraph.h | 14 +++--- libgnn/src/MinibatchGenerator.cpp | 20 +++++++- 3 files changed, 70 insertions(+), 21 deletions(-) diff --git a/libgnn/include/galois/MinibatchGenerator.h b/libgnn/include/galois/MinibatchGenerator.h index fd7c92ff50..127367bdf1 100644 --- a/libgnn/include/galois/MinibatchGenerator.h +++ b/libgnn/include/galois/MinibatchGenerator.h @@ -18,9 +18,9 @@ class MinibatchGenerator { : mask_to_minibatch_{mask_to_minibatch}, minibatch_size_{minibatch_size}, current_position_{0}, master_bound_{master_bound} { // set seed based on time then initialize random generate with rand() - srand(1); - rand_generator_ = std::make_unique(rand()); + // srand(1); srand(time(NULL)); + rand_generator_ = std::make_unique(rand()); GALOIS_LOG_ASSERT(master_bound_ <= mask_to_minibatch_.size()); } @@ -32,14 +32,14 @@ class MinibatchGenerator { } } - void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { - if (!shuffle_mode_) { - // TODO - GALOIS_LOG_FATAL("not yet implemented"); - } else { - ShuffleGetNextMinibatch(batch_mask, num_to_get); - } - } + // void GetNextMinibatch(std::vector* batch_mask, size_t num_to_get) { + // if (!shuffle_mode_) { + // // TODO + // GALOIS_LOG_FATAL("not yet implemented"); + // } else { + // ShuffleGetNextMinibatch(batch_mask, num_to_get); + // } + //} //! True if no more minibatches from this generator bool NoMoreMinibatches() { @@ -58,8 +58,34 @@ class MinibatchGenerator { } } - void ShuffleMode(const galois::graphs::DistGraph& graph, - GNNMask& global_training_mask, size_t total_train_nodes) { + //! Original shuffle mode in which every host only considers locally owned + //! training nodes in the all indices array + void ShuffleMode() { + if (!shuffle_mode_) { + shuffle_mode_ = true; + all_indices_.reserve(master_bound_); + // setup all set indices for the minibatch + for (size_t pos = 0; pos < master_bound_; pos++) { + if (mask_to_minibatch_[pos]) { + all_indices_.emplace_back(pos); + } + } + // shuffle it + std::shuffle(all_indices_.begin(), all_indices_.end(), *rand_generator_); + printf("Number of things in minibatch generator is %lu\n", + all_indices_.size()); + } + } + + //! Distributed shuffle mode: all hosts create array with ALL global training + //! node IDs and initialize shuffler to same seed. All hosts then advance it + //! at the same time, resulting in a consistent minibatch across all hosts. + //! Will *NOT* balance # of training nodes done on a host each minibatch + //! unlike original shuffle. 
+ void + DistributedShuffleMode(const galois::graphs::DistGraph& graph, + GNNMask& global_training_mask, + size_t total_train_nodes) { if (!shuffle_mode_) { shuffle_mode_ = true; all_indices_.reserve(total_train_nodes); @@ -106,8 +132,11 @@ class MinibatchGenerator { void OriginalGetNextMinibatch(std::vector* batch_mask); void ShuffleGetNextMinibatch(std::vector* batch_mask); - void ShuffleGetNextMinibatch(std::vector* batch_mask, - size_t num_to_get); + + // Do not use these unless you know what they're doing + void DistributedShuffleGetNextMinibatch(std::vector* batch_mask); + void DistributedShuffleGetNextMinibatch(std::vector* batch_mask, + size_t num_to_get); }; } // namespace galois diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 18604361a4..2eaba6e90d 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -398,8 +398,9 @@ class GNNGraph { } train_batcher_ = std::make_unique( local_training_mask_, train_batch_size, *end_owned()); - train_batcher_->ShuffleMode(*partitioned_graph_, global_training_mask_, - global_training_count_); + train_batcher_->ShuffleMode(); + // train_batcher_->DistributedShuffleMode(*partitioned_graph_, + // global_training_mask_, global_training_count_); local_minibatch_mask_.resize(partitioned_graph_->size()); return train_batcher_->ShuffleMinibatchTotal(); } @@ -409,10 +410,11 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes size_t PrepareNextTrainMinibatch(); - size_t PrepareNextTrainMinibatch(size_t num_to_get) { - train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); - return SetupNeighborhoodSample(GNNPhase::kBatch); - } + // Used with distributed minibatch tracker + // size_t PrepareNextTrainMinibatch(size_t num_to_get) { + // train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); + // return SetupNeighborhoodSample(GNNPhase::kBatch); + //} //! 
Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; diff --git a/libgnn/src/MinibatchGenerator.cpp b/libgnn/src/MinibatchGenerator.cpp index 4d851aacac..9b603fc2e4 100644 --- a/libgnn/src/MinibatchGenerator.cpp +++ b/libgnn/src/MinibatchGenerator.cpp @@ -37,6 +37,22 @@ void galois::MinibatchGenerator::OriginalGetNextMinibatch( void galois::MinibatchGenerator::ShuffleGetNextMinibatch( std::vector* batch_mask) { + size_t current_count = 0; + galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); + // loops through a number of indices locally and sets + while (current_position_ < all_indices_.size()) { + (*batch_mask)[all_indices_[current_position_++]] = 1; + current_count++; + if (current_count == minibatch_size_) + break; + } +} + +// used if all hosts have a global view of the same minibatch sequence +// (occurs if all hosts use same shuffle seed) +// Do not use unless you know what you are doing +void galois::MinibatchGenerator::DistributedShuffleGetNextMinibatch( + std::vector* batch_mask) { galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); size_t current_count = 0; @@ -54,7 +70,9 @@ void galois::MinibatchGenerator::ShuffleGetNextMinibatch( } } -void galois::MinibatchGenerator::ShuffleGetNextMinibatch( +// used with distributed minibatch tracker which is deprecated; code not +// guaranteed to work +void galois::MinibatchGenerator::DistributedShuffleGetNextMinibatch( std::vector* batch_mask, size_t num_to_get) { size_t current_count = 0; galois::ParallelSTL::fill(batch_mask->begin(), batch_mask->end(), 0); From 916d25fb106a1e223d8a58bd35a4dd3b225c992d Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Thu, 12 Aug 2021 15:15:44 -0500 Subject: [PATCH 598/660] Warning/crashes added to layers Warnings or failures added to out of date layers that do not work with new GNN execution pipeline. 
--- libgnn/src/layers/DenseLayer.cpp | 4 + libgnn/src/layers/GNNLayer.cpp | 2 +- libgnn/src/layers/GraphConvolutionalLayer.cpp | 4 + libgnn/src/layers/L2NormLayer.cpp | 2 + libgnn/src/layers/SAGELayer.cpp | 10 +-- libgnn/src/layers/SigmoidLayer.cpp | 4 + libgnn/src/layers/SoftmaxLayer.cpp | 76 +++++++++---------- 7 files changed, 58 insertions(+), 44 deletions(-) diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 483ceb7850..eed3143a01 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -9,6 +9,10 @@ galois::DenseLayer::DenseLayer( : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + // TODO Need to make sure that layer knows about forward/backward matrix + // sharing (e.g., overwriting previously used input to save space) + GALOIS_LOG_FATAL("This layer has not been kept up to date; do not use until " + "sure it's been updated"); size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; in_temp_1_.resize(num_input_elements, 0); diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 885dc1f537..82a864a41d 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -189,7 +189,7 @@ void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) { void galois::GNNLayer::DoDropoutCPU( const PointerWithSize input_to_dropout, PointerWithSize* output_matrix) { - // XXX(loc) check this to make sure it works in subgraph setting + // TODO This (and dropout in general) may not work in the sampling setting size_t num_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index 82522fafd9..de84903447 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -9,6 +9,10 @@ galois::GraphConvolutionalLayer::GraphConvolutionalLayer( : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), input_column_intermediates_(dimensions.input_columns), output_column_intermediates_(dimensions.output_columns) { + galois::gWarn( + "GCN layer not up to date with new subgraph/sampling changes; " + "do not use until updated to reflect changes (see GraphSAGE layer)"); + size_t num_input_elements = layer_dimensions_.input_rows * layer_dimensions_.input_columns; if (!config_.disable_dropout || config_.disable_aggregate_after_update || diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index bcf66eb2f9..0d566f0b66 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -5,6 +5,8 @@ galois::L2NormLayer::ForwardPhase( #ifdef GALOIS_ENABLE_GPU // TODO #endif + GALOIS_LOG_FATAL( + "L2 layer has not been kept up to date for months; do not use"); return ForwardPhaseCPU(input_embeddings); } diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index f078d97bd9..25b9418fa1 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -155,7 +155,7 @@ void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { size_t num_in_temp_elements = new_output_rows * layer_dimensions_.input_columns; - //galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", + // 
galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", // in_temp_1_.size(), " and ", num_in_temp_elements, " ", // layer_dimensions_.input_columns, " ", // layer_dimensions_.output_columns); @@ -267,10 +267,10 @@ void galois::SAGELayer::WeightGradientSyncSum2() { const galois::PointerWithSize galois::SAGELayer::ForwardPhase( const galois::PointerWithSize input_embeddings) { - //galois::gDebug( + // galois::gDebug( // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", + // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " + // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", // layer_dimensions_.input_rows * layer_dimensions_.input_columns); galois::StatTimer timer("ForwardPhase", kRegionName); TimerStart(&timer); @@ -742,7 +742,7 @@ void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, base_gpu_object_.layer_weights(), output); } else { #endif - //galois::gDebug("Layer ", graph_user_layer_number_, " ", + // galois::gDebug("Layer ", graph_user_layer_number_, " ", // layer_dimensions_.output_rows, " ", // layer_dimensions_.input_columns, " ", // layer_dimensions_.output_columns); diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 1809decc8a..595fd5c023 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -7,6 +7,10 @@ const galois::PointerWithSize galois::SigmoidLayer::ForwardPhaseCPU( const galois::PointerWithSize input_embeddings) { + galois::gWarn( + "Sigmoid layer has not been kept up to date; do not use unless sure" + " it works with new changes"); + input_loss_.assign(input_loss_.size(), 0.0); forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); const size_t feature_length = layer_dimensions_.input_columns; diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp index 70a6afa6c3..aebbb3dd9b 100644 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ b/libgnn/src/layers/SoftmaxLayer.cpp @@ -32,25 +32,25 @@ galois::SoftmaxLayer::ForwardPhaseCPU( } } - // do softmax - GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &p_backward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + graph_.GetSingleClassLabel(i))] = 1.0; - // calculate loss for this LID (note not all i will be filled) - input_loss_[i] = - GNNCrossEntropy(feature_length, ground_truth_vec->data(), - &p_backward_output_matrix_[feature_length * i]); + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = + 
GNNCrossEntropy(feature_length, ground_truth_vec->data(), + &p_backward_output_matrix_[feature_length * i]); #ifndef NDEBUG - loss_accum += input_loss_[i]; - handled += 1; + loss_accum += input_loss_[i]; + handled += 1; #endif }, // TODO chunk size? @@ -91,29 +91,29 @@ galois::SoftmaxLayer::BackwardPhaseCPU() { galois::do_all( galois::iterate(size_t{0}, layer_dimensions_.input_rows), [&](const unsigned node) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraphSubgraph(node)) - return; - } + if (IsSampledLayer()) { + if (layer_phase_ == GNNPhase::kTrain && + !graph_.IsInSampledGraphSubgraph(node)) + return; + } - size_t correct = graph_.GetSingleClassLabel(node); - // See here for explanation for why this works - // https://gombru.github.io/2018/05/23/cross_entropy_loss/ - // Derivation of full combined derivative isn't there, but some - // emperical inspection tells me this is likely correct - // TODO(loc) work it out myself - for (size_t idx = 0; idx < feature_length; idx++) { - if (idx == correct) { - // positive class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx] - 1; - } else { - // negative class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx]; - } + size_t correct = graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + p_backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx] - 1; + } else { + // negative class + p_backward_output_matrix_[node * feature_length + idx] = + p_backward_output_matrix_[node * feature_length + idx]; } + } }, galois::steal(), galois::loopname("SoftmaxBackward")); From adbd3b578260f2405d0c08aaeb7a7c90c71aeea2 Mon Sep 17 00:00:00 2001 From: Loc Hoang Date: Fri, 13 Aug 2021 17:48:44 -0500 Subject: [PATCH 599/660] Add design doc README, some comments to SAGE Added a full markdown README for libgnn explaining some design decisions. Comments to the SAGE layer indicating which conditionals are XForm first --- libgnn/README.md | 562 ++++++++++++++++++++++++++++++++ libgnn/src/layers/SAGELayer.cpp | 3 + 2 files changed, 565 insertions(+) create mode 100644 libgnn/README.md diff --git a/libgnn/README.md b/libgnn/README.md new file mode 100644 index 0000000000..dbca774922 --- /dev/null +++ b/libgnn/README.md @@ -0,0 +1,562 @@ +Author: Loc Hoang, + +Best viewed with a Markdown viewer due to Latex + formatting. + +This file's sections are ordered such that you can read from +top to bottom and still get a decent understanding of the +pieces of `libgnn`. As such, independent portions are near the +top. + +This file is being written so that whoever works on this code in the +future has a general idea what contributions I've made to the code +and how the gnn branch differs from master. Some of these changes +need to get merged into master in the future. It also allows me +to take stock of the changes/implementation choices I've made +in the past year. 
+ +# CuSP Changes + +Variants of the regular partitions were added to allow training +nodes to be partitioned relatively evenly among machines rather +than having CVC/OEC use a regular block partition over all nodes (which +would ignore the train/val/test split). + +This causes some weird effects when this version's CuSP is used outside +of GNNs or if the training boundaries are not hardcoded (e.g., if +the training boundaries are unknown, a segfault can occur). Some care +will be needed to make this integration more clean. + +# Gluon Changes + +Many changes occurred to Gluon to optimize for the vector communication +case. A few of them are listed below. + +* Serialize/deserialize **directly** to/from the serialization and +deserialization buffers. This eliminates a large amount of redundant +copying from original source to vector to buffer (and in the reverse) +which is incredibly important for performance when communicating vectors. +Something important to also take away from this experience is that +if you have a vector of vectors, serializing each vector individually +into the buffer is a very bad idea: care should be taken to make +it so that you can serialize as much data as possible in one go. + +* QoL change: way to disable Gluon timers with a variable change/flag. + +* Method to swap out mirror handshake since this is used by subgraph +code to avoid sending messages to inactive mirrors. + +* Hochan ported large message handling from KatanaGraph into Galois. +This involved changing the serialization buffers among other things. + +# GNN Optimizers + +Only one that exists is the ADAM optimizer. Note that each +layer has its own moments and does not share them (this may or +may not be standard; I'm not sure). + +All hosts will see the same gradients due to synchronization, +so all hosts should end up making the same changes to the weights. + +# Layers + +Each layer inherits from a `GNNLayer` class which has common functionality +like weight allocation, output allocation, etc. The children classes +can add more things to it; for example, SAGE adds weights for the +concatenated feature and intermediates for intermediate calculation +(also reused in backward prop). + +One thing to note is that the backward output matrix (used to output +gradients during the backward phase) is **not** a completely independent +piece of memory: it is the **memory used by the forward output of +the layer that came before it**. The reason for this is that doing it +this way saves a very large amount of memory, especially in a full batch +setting where the number of nodes (multiplied by features/hidden feature +size) can grow very large. **Be very careful about this as it means that +you cannot reuse the output matrix from the forward pass after it +has been overwritten.** This results in some rather convoluted logic that +you may find in the code. It also means that **whenever an output matrix +is resized for any reason, the pointers that each layer holds MUST +be updated, or you will get undefined behavior**. + +## Softmax Layer + +Runs a softmax on each individual row, gets the highest value, +compares with ground truth, gets loss. + +Note that the **forward and backward output matrix are shared** in this +layer, so be careful with the assumptions made after the backward +step is run (because the forward output will no longer be accessible +after the backward step; this is why the accuracy check in the +code has to occur before backward is called). 
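For reference, the per-row computation being described is the standard softmax/cross-entropy pair; the formulas below are textbook definitions rather than anything taken from the code, with $x$ the row of input logits, $p$ the softmax output, $y$ the one-hot ground truth, and $c$ the correct class:

$$p_i = \frac{e^{x_i}}{\sum_j e^{x_j}}, \qquad L = -\sum_k y_k \log p_k = -\log p_c$$

Differentiating the composition gives $\partial L / \partial x_i = p_i - y_i$: the softmax value itself for every wrong class and the softmax value minus one for the correct class, which is the rule the backward step below relies on.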
+ +Regarding the backward step: it turns out that for single class +classification, the gradient if the answer is wrong is simply +the softmax value itself, and if the answer is right, then its +the softmax value - 1. This has the advantage of being very +numerically stable as well. + +Things are slightly more complicated for the multi-class case; some +investigation needs to be done to figure this out. + +## SAGE Layer (and GCN Layer by Extension) + +### ReLU Activation and Overwriting of the Forward Matrix + +ReLU activation is used by the compute layers: if the value +is greater than 0, it is kept, else it is discarded. + +Because the forward output matrix gets overwritten during +the backward step and because the derivative of the +ReLU operation requires knowledge of what elements were +affected by the ReLU, the system must *track* which +elements were not set to 0 using a bitmask. This +mask is used during the backward phase to keep gradients +only if their values corresponding to that gradient +were originally greater than 0, and it works even +if the original forward matrix has been overwritten. + +### Row Dimensions and Active Portions of Matrices + +An optimal version of a normal GNN should make it so that +the number of active rows decreases as execution progresses +through the layers of the GNN: the last layer's active +rows in the feature matrix should be *only* the seed +nodes (i.e., nodes that are being predicted): keeping +all nodes up to date is a waste of compute. + +The number of active nodes at the beginning of a GNNs +should be all nodes involved in the k-hop neighborhood +of the seed nodes. The next layer should remove +the kth hop from the active nodes; the layer after, +the (k-1)th hop, and so on. This can be accomplished +relatively easily without disrupting the contiguous +feature matrix by making sure that the nodes that will +be dropped are in the suffix of matrix in the order +that they will be dropped from the bottom. Then, +to drop them, the code just changes the number of input +rows for the layer so that any loops/matrix multiplies +will only look at the relevant row prefix. + +In a distributed setting, the active nodes of a particular +layer should be *shared* across all hosts; a host should not +drop a node if it is being used somewhere else *and* if +the node in question has a contribution to it (i.e., +has edges or is the master proxy). + +### SAGE's Concatenation of Input Features + +The GraphSAGE model concatenates the input feature to the aggregated +feature vector on each node after aggregation which doubles +the length of the vector. Actually doing this in the feature +matrix is not great as it would mean that the original weight +matrix needs to double in size, and additional space would have +to be allocated on top of the existing input features +with the aggregated copied over to it. + +Instead of doing this, you can allocate a separate weight matrix +of the same size as the original, multiply the original input +features with that new weight matrix, and sum it up to the final +output matrix. The result is exactly the same as if the input +feature was concatenated to the aggregated features then +multiplied with a weight matrix with double the number of rows. 
+(work it out mathematically; it's the same) + +### Intermediates and Flipping Aggregation/Linear XForm: Basics + +The GNN computation in SAGE is two-step: aggregation +followed by linear transform (more steps if dropout is enabled): +an intermediate matrix is required to store the result of the first +step for use in the next step. Additionally, keeping this +intermediate result around in memory significantly speeds up +the backward step which can use it to derive gradients. +Therefore, the SAGE layer must allocate space for the intermediate. + +The size of the intermediate changes depending on if you do +linear xform before aggregation; this is done if doing +the linear xform reduces the column dimension as it makes +the aggregation aggregate on smaller feature vector sizes (which +speeds up computation overall in general). It helps to understand +how the dimensions change after aggregation and after linear +xform. Say the input matrix is IR by IC (input row by input column). + +* Aggregation only needs to occur for the nodes that will +be active in the next layer, i.e. the *output rows* (OR). Therefore, +after aggregation, the rows of the matrix go from IR to OR. + +* Linear transform changes the number of columns to output columns (OC). +Therefore, after linear xform, IC turns to OC. + +After both operations, the output matrix to the next layer is the +expected OR by OC. Depending on which one occurs first, +the code generates an intermediate of OR by IC *or* IC by OC. +(more than one may be needed if dropout is used as that generates +a new dropout matrix). + +### Intermediates and Flipping Aggregation/Linear XForm: Backward Pass + +The computation of a SAGE layer is the following in matrix +terms where $T$ is the graph topology, $F$ is features, +and the $W$s are the two weight matrices (one for aggregated +value, other for concatenated vector). + +$TFW_1 + FW_2 = O$ + +The gradients we want are $W_{1,2}'$ and $F'$ to pass back to the next layer in +the backward phase. We have the gradient $O'$. The method in which this occurs +depends on the order of aggregation/xform in the forward phase. + +First, $FW_2$. One can derive one part of $F'$ (the other part +is from the first term) and $W_{2}'$. $F' = O'(W_2)^T$ and $W_{2}' = F^T O'$. + +Next, $TFW_1$. + +* If aggregation occurs first, we have $(TF)$ in an intermediate +matrix. The $W_{1}'$ gradient is $W_{1}' = (TF)^{T}O'$. To get one part of +$F'$, we do $O' W_{1}^{T} = (TF)'$ followed by $T^T (TF)' = F'$. +* If xform occurs first, $(FW_1)$ is in the intermediate matrix. +To get $F'$, $T^T O' = (FW_{1})'$, followed by $(FW_{1})' (W_{1})^T = F'$. +The weight gradient is $W_{1}' = F^T (FW_{1})'$. + +The $F'$ gradient from the two terms ($TFW_1$ and $FW_2$) can be summed +together. + +### Masking Out Non-Masters in Distributed Setting + +In a distributed setting, all hosts need to see the same gradient +computed in the backward phase so that the weights can all be updated +in the same manner to keep consistency across hosts. This can +be accomplished by synchronizing appropriately and making +sure that a gradient computation isn't accounted for more than +once globally. + +For $F'$, keeping it consistent simply means making sure that all +hosts compute all the required rows. This is doable if a host knows +what proxies it owns are active in the global subgraph being operated +on and makes sure that it has the most up-to-date value for that proxy's +gradient at all times. 
For example, since all hosts have a copy of the
+weights, in order to get the gradients for $F'$, all a host needs
+is to make sure $O'$ contains the gradients for local proxies
+active in a particular layer (even if they aren't part of that
+host's seed nodes). In this way, all hosts *recompute* the same gradient
+required for a proxy.
+
+For $W'$, each node contributes a gradient to it. A node is
+replicated across hosts via proxies; unlike the previous case,
+however, a *sync* of weight gradients occurs across all hosts because
+not all hosts have all proxies, and in this case, you need the
+contribution of all nodes and not just the ones you have proxies
+of, so you do **not** want a node's gradient to be computed more than
+once across all hosts. Therefore, when doing computation involving
+the weight gradient, a node's contribution should only be computed
+once **by the owner/master of that node**. As a result, non-masters
+on hosts **need to be masked when computing $W'$**.
+This presents a problem implementation-wise: masking non-masters
+is an in-place operation since you do not want to allocate
+new memory, so some care needs to be taken on which matrices to mask
+as well as when to mask them since $F'$ computation requires *non-masked*
+matrices. This is the reason for the very convoluted logic in the
+backward pass in the code that will need to be cleaned up or
+redesigned at some point.
+It might be possible to play a similar trick to active row prefixing
+where non-masters are placed lower in the rows so that "masking"
+can occur by changing the row count, but I believe I tried
+this and ran into issues with non-contiguity of masters/mirrors.
+
+Below is the masking logic used by the current code:
+
+```
+Calculate W2' using masked input or masked gradients (mask required else overcount,
+if not layer 0 then can mask input, else mask gradient)
+
+if (xform before agg)
+  Calculate (FW1)' by transpose aggregating gradients
+  Mask out the non-masters in feature matrix F if not layer 0, else mask FW1
+  Calculate W1' using F^T and (FW1)' (one of which is masked)
+  Calculate F' from W1 by using (FW1)', W1^T and W2^T (masked FW1 won't occur here,
+    because this is only required if layer isn't 0)
+else
+  Mask F if not layer 0, else mask gradient
+  Get F' from W2 by multiplying O' with W2 (no masks allowed here)
+  Mask TF^T if not layer 0 (because O' won't be masked in that case)
+  Get W1' by multiplying TF^T with O' (one will be masked)
+  Get F' from W1 by (1) multiplying O' with W1^T then (2) transpose aggregate to get F'
+    (none of the ops above should be masked)
+```
+
+The above isn't the neatest explanation of things, but essentially,
+anything involving a W' calculation requires one of the operands
+to have masked non-masters. Layer 0 is special because you
+can't mask the inputs there as those are the inputs used at
+the beginning of an epoch.
+
+### Regarding Dropout
+
+The way that dropout works is that random parts of the input
+are set to 0 for that particular batch.
+The ones set to 0 need to be memorized so that the backward
+pass can correctly compute the derivative.
+
+Dropout currently **does not work in a distributed setting**: the problem
+is that each host may drop out different weights due to the nature
+of RNG, leading to divergence on each host. One way to avoid
+this is to make it so each host drops out a particular portion only and
+synchronize this choice. This has not been implemented efficiently (yet?).
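A minimal single-host sketch of the mask-based dropout just described (illustrative only; the real CPU path lives in `GNNLayer::DoDropoutCPU` and differs in details):

```
#include <cstddef>
#include <random>
#include <vector>

// Forward: zero out random elements, remember which ones survived, and scale
// the survivors by 1/keep_probability so the expected value is unchanged.
void DropoutForward(const std::vector<float>& in, std::vector<float>* out,
                    std::vector<bool>* kept_mask, float keep_probability,
                    std::mt19937& rng) {
  std::bernoulli_distribution keep(keep_probability);
  float scale = 1.0f / keep_probability;
  out->resize(in.size());
  kept_mask->resize(in.size());
  for (std::size_t i = 0; i < in.size(); ++i) {
    (*kept_mask)[i] = keep(rng);
    (*out)[i] = (*kept_mask)[i] ? in[i] * scale : 0.0f;
  }
}

// Backward: the derivative is the same scaled mask applied to the incoming
// gradient, which is exactly why the forward mask has to be memorized.
void DropoutDerivative(std::vector<float>* gradient,
                       const std::vector<bool>& kept_mask,
                       float keep_probability) {
  float scale = 1.0f / keep_probability;
  for (std::size_t i = 0; i < gradient->size(); ++i) {
    (*gradient)[i] = kept_mask[i] ? (*gradient)[i] * scale : 0.0f;
  }
}
```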
+**I have not kept this code up-to-date as well** as all runs I've been +doing are without dropout. + +*Therefore, it's probably better not to use it for the time being.* + +# Graph Neural Network + +`GraphNeuralNetwork.{cpp/h}` is the main class which runs the +graph neural network. It creates the layers and chains their outputs +together to create the network flow. + +## Constructor + +1) Creates the intermediate layers. See the section on Layers to get an +idea of what is done. +Typically, activation is activated for compute layers except for the last +layer: activation is typically disabled for that layer for accuracy +reasons (running activation on the final output layer messes with +predictions). + +2) If minibatching is enabled, create minibatch generators. + +3) Create the output layer (Softmax is the only one that works right now, +but Sigmoid is required for multi-class classification). + +## Training Flow + +There are a few scenarios based on if training and testing minibatching +is enabled or not. These are not necessarily the most optimal things to +do (e.g., you never want the entire graph to participate in training; +only k-hop neighborhood is required). + +1) No training/testing minibatch -> the entire graph participates in training. + +2) Training minibatch but no test minibatch -> k-hop neighborhood only, but +space required for entire graph is allocated (inefficient, should only need +k-hop neighborhood of test nodes) + +3) Train/test minibatching -> k-hop neighborhood subgraphs only, and space +for them is allocated on demand rather than worst case entire graph. + +Note that because of the way the code works, if you want to do an *efficient* +full-batch no sampling run, you should specify very large numbers for the train +and test minibatches so that the efficient code path is taken. Due to the +way the design is at the moment it will **inefficiently regenerate +the k-hop full batch train/test subgraphs when they are used**: this +need to be fixed in a future redesign where multiple subgraphs can be +swapped among. + +If a k-hop subgraph needs to be generated, it's generated with the following +flow: + +1) Choose the seed nodes (i.e., nodes that will have their output compared +to ground truth to potentially get loss/gradients to backpropagate) + +2) From seed nodes, sample a few edges OR if not sampling, choose all +of them. Activate the destination nodes, communicate this, repeat +for k hops. + +3) Correct layer dimensions based on subgraph/number of nodes at +each layer (reduce memory AND compute footprint). + +4) Generate subgraph (see subgraph construction section). + +5) Do inference and back prop, update weights, repeat. +The way this works is relatively simple: the code loops +through each layer and calls the forward or backward pass function +on it. + +Depending on how the test interval is set, between each epoch +a test subgraph may be used to check test accuracy. +The flaw with the current design is that the graph object is +only aware of one 'graph' at any one point, meaning the code +has to be very careful to generate the right graph (train/test) +for use at the right time. + +Note that the `kBatch` mode used in the Train code refers to +a status that is set on nodes based on the minibatch and only +includes *local seed nodes*, so keep this in mind when using it (there +have been unintentional problems where I assumed `kBatch` meant +more than just local seed nodes). 
The main reason for this is
+that it helps to distinguish local and global seed nodes to avoid
+over-calculating gradients.
+
+# GNN Graph
+
+`GNNGraph.{cpp/h}` is responsible for reading in the graph topology,
+labels, and features. Topology is read/partitioned via the CuSP
+infrastructure. Each host reads labels for nodes it owns; same with
+features (right now it's pretty inefficient as all hosts read the entire
+file; a better way should probably be devised).
+
+It is responsible for the synchronization substrate: Gluon is initialized
+on the partitioned graph. Normally sync occurs on the node data of the graph,
+but the node data in the GNN case is a feature vector. To get around sync
+structure limitations, a global pointer is set to point to the feature
+matrix array (along with some other globals) so that the sync structure
+can know how to access it.
+
+There are sync structures mainly for global degrees and aggregation.
+If a subgraph is used, things change slightly (see the subgraph section).
+
+The class provides functions to get degrees and also holds the minibatch
+generator. It also holds one `GNNSubgraph` object if a subgraph is being used
+(this is a limitation; there can only be one active subgraph at any one point).
+If the subgraph is active and the flag for the subgraph is on, then all
+user-facing functions on the `GNNGraph` object will access the *subgraph*
+instead of the original graph. **Be very careful with this and make sure the
+graph is in the mode that you intend.**
+
+# Subgraph Construction
+
+Subgraphs are created by the sampling/minibatch infrastructure:
+a few nodes are marked "active" along with edges, and
+the program compiles these chosen nodes/edges into a separate
+CSR for use during execution. There are a few implementation details
+during this process that will be documented here.
+
+## Code Structure
+
+The current implementation in Galois has a Subgraph class
+contained by the GNNGraph class. The subgraph is enabled
+by a flag which alters GNNGraph calls to direct to the
+subgraph instead.
+
+Optimally, we want to be able to work with many subgraphs
+at once; this design makes it difficult to do so as
+only one subgraph is contained by a GNNGraph. It would
+probably be possible to extend this design and have GNNGraph
+expose a subgraph switcher or something of the sort so that
+it isn't tied directly to the class.
+
+## Sampling
+
+The "activeness" of a node is marked on the node itself as a flag.
+In addition to this, the layer number in which a node is added
+is noted as well (the reason for this will be apparent later).
+
+Each edge has two variables associated with it: a normal flag
+saying if it has been sampled in any layer, and a bitset saying
+which layers the edge has been sampled in. This is because
+an edge once sampled is not necessarily sampled in *all* layers:
+it may be sampled in only a single layer (or many layers),
+and this info needs to be known when iterating over the edges
+to keep things correct.
+
+In addition, the degree of a node for each sampled phase
+is tracked locally. At the end of all sampling, the degrees
+of the nodes at each layer are synchronized among all hosts.
+This is required because normalization in aggregation uses
+the subgraph degrees (this is actually quite annoying
+runtime-wise as it adds this extra degree sync step).
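A rough illustration of the per-edge/per-layer bookkeeping described in this section (hypothetical layout and names, not the actual Galois bitsets; it assumes fewer than 32 layers):

```
#include <cstddef>
#include <cstdint>
#include <vector>

// Per-edge sampling state: an edge can be sampled in some layers but not
// others, so a per-layer bit is kept alongside an "any layer" view.
struct EdgeSampleState {
  uint32_t layer_bits = 0; // bit k set => sampled in layer k
  bool any() const { return layer_bits != 0; }
  bool in_layer(unsigned k) const { return (layer_bits & (1u << k)) != 0; }
  void set_layer(unsigned k) { layer_bits |= (1u << k); }
};

// Per-node sampled degree for one layer; aggregation normalization reads
// these, which is why they must be synchronized across hosts after sampling.
std::vector<std::size_t> SampledDegreesForLayer(
    const std::vector<std::vector<EdgeSampleState>>& out_edges, unsigned layer) {
  std::vector<std::size_t> degrees(out_edges.size(), 0);
  for (std::size_t node = 0; node < out_edges.size(); ++node) {
    for (const EdgeSampleState& e : out_edges[node]) {
      if (e.in_layer(layer)) {
        ++degrees[node];
      }
    }
  }
  return degrees;
}
```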
+ +## Construction Steps + +The steps in subgraph construction are the following: + +1) Create the local ID to subgraph ID mapping (and vice versa) +2) Count degrees for the sampled vertices in order to construct +the CSR; this includes edges that may not always be active. +3) Create the CSR using the degrees. +4) Create the local subgraph features matrix by copying +them over from the original feature matrix. + +In order to make row elimination easier, +the SID of the vertices are ordered such that seed nodes are +first, the 1-hop samples next, then 2-hops, 3-hops, etc. +This makes it easy to eliminate vertices that aren't used after +a certain point by changing the row dimensions used by multiplies/ +aggregations. Master nodes that are also seed nodes always occupy +the first SIDs so that it's easy to loop through master nodes only. +Other master nodes may end up with non-contiguous SIDs as they +may become active in different layers; to track these masters +for masking later, a bitset is maintained. +Counts as to how many nodes are in each layer have to be +compiled so this process can be done in parallel. An on_each +loop is used to get SIDs in parallel. + +In addition, nodes that (1) are not master proxies and (2) do +not have any outgoing or incoming edges are eliminated from +the local subgraph. This is because some proxies do not have +edges on some hosts even if they do on other hosts, so even +if they become active, they do not change the outcome of computation +and actually add unnecessary overhead. **This dead mirror +removal is extremely important for performance.** Implementation +wise it is done by keeping a "definitely active" flag which +will only mark proxies that definitely have an edge connecting +them or proxies that are masters. + +Degree counting and graph construction proceed as normal: count +degrees, do a prefix sum, create the CSR. One thing to note is +that the CSC is also created in order to do the backward aggregation +step. The data which says which layers an edge is active in is +pointed to by the newly constructed graph. + +## Synchronization when Subgraphs Exist + +### Mirror Regeneration + +Some mirrors on a local host may be inactive in the subgraph because +they were not sampled. The subgraph code can create a new mirror +node mapping that Gluon can swap out for each subgraph. + +This has its own overhead, and from some experiments in the +past this doesn't significantly affect performance, but it's +done anyways. + +### GID to SID + +Gluon memoizes GID-LID handshakes on each host to avoid the need +to send IDs along with messages. This means that if a subgraph is being +synchronized, another conversion to SIDs must occur. There need +to be sampled graph versions of the sync structures that use +a mapping from LID to SID in order to save the updates to the correct +memory locations. + +Sometimes, due to the way Gluon works, a node that isn't part of the +active subgraph may have its data queried for extract/update. The sync +structure must account for this and check if such data is being accessed +so that it can avoid seg-faulting. + +# Minibatch Generator + +`MinibatchGenerator.{cpp/h}` takes the list of training/test nodes on +a single host and gives the user an interface for getting the nodes +in batches at a time. This is used to do minibatching of nodes across +hosts; each host picks the same number at a time before the beginning +of minibatch. 
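A compressed sketch of that interface (heavily simplified relative to `MinibatchGenerator`; the distributed variants and error handling are omitted):

```
#include <algorithm>
#include <cstddef>
#include <random>
#include <utility>
#include <vector>

// Simplified minibatch generator: shuffle the local training IDs once per
// epoch, then hand out fixed-size windows until the list is exhausted.
class SimpleMinibatchGenerator {
public:
  SimpleMinibatchGenerator(std::vector<std::size_t> train_ids,
                           std::size_t batch_size, unsigned seed)
      : ids_(std::move(train_ids)), batch_size_(batch_size), rng_(seed) {}

  void ResetEpoch() {
    position_ = 0;
    std::shuffle(ids_.begin(), ids_.end(), rng_);
  }

  bool Done() const { return position_ >= ids_.size(); }

  // Returns the next batch of up to batch_size_ training node IDs.
  std::vector<std::size_t> Next() {
    std::size_t end = std::min(position_ + batch_size_, ids_.size());
    std::vector<std::size_t> batch(ids_.begin() + position_, ids_.begin() + end);
    position_ = end;
    return batch;
  }

private:
  std::vector<std::size_t> ids_;
  std::size_t batch_size_;
  std::size_t position_ = 0;
  std::mt19937 rng_;
};
```

Each host would construct one of these over its local training IDs; under the "pick locally" scheme every host then draws the same count per step.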
+ +# Other (Dead) Files/Code + +`DistributedMinibatchTracker` was created to track variable number +of seed nodes on each host to make the sampling more like single-host +sampling. This was deprecated for a new functionality in the `MinibatchGenerator` +which does it in a much more sane manner by having all hosts see the same +global sequence of nodes to choose and moving the window locally on each +host (this can result in imbalanced seeds). + +A lot of the existing layers have not been kept up-to-date due to the rapid +development process on minibatching/sampling. Only the SAGE layer and Softmax +Layer are guaranteed to be functional as those are the ones most +of the runs have been on. + +There is an experimental implementation of something known as "sampled views" +in which an explicit subgraph isn't constructed; a mask is used instead. +Performance wise this did not do too well, so the code has been abandoned +and is not guaranteed to work. + +# Regarding GPU Code + +It has been a while since I worked on the GPU code, but the idea is essentially +to pre-allocate the same data that you would have allocated on the CPU +and use those pointers instead of CPU pointers. + +Some updates will need to be made in order to do dynamic resizing of the +data depending on the size of the minibatch. The best way to avoid this +in general, though, is to just allocate space for the test subgraph's +k-hops since that is likely to be more expensive than whatever +the minibatch size for the train nodes are (unless it's all nodes). \ No newline at end of file diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 25b9418fa1..bf301e5bdd 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -375,6 +375,7 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } // aggregate this here before gradient starts to get overwritten + // this is xform ffirst if (!config_.disable_aggregate_after_update && layer_dimensions_.input_columns > layer_dimensions_.output_columns) { // aggregate occurs regardless of layer being equal to 0 because it is @@ -491,6 +492,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( &input_column_intermediates_, true); } } else { + // xform first + // --unmasked-- // disable concat is part of condition because otherwise this mask From 051f88b011a4fb3eb7c8eacd07e1bf032a5a6ba5 Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Mon, 22 May 2023 14:42:38 -0700 Subject: [PATCH 600/660] Fix a minor bug with a file path --- libdeepgalois/include/deepgalois/layers/GluonGradients.h | 2 ++ libgnn/include/galois/graphs/GNNGraph.h | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/libdeepgalois/include/deepgalois/layers/GluonGradients.h b/libdeepgalois/include/deepgalois/layers/GluonGradients.h index e14fe27bc8..2918cdd8dd 100644 --- a/libdeepgalois/include/deepgalois/layers/GluonGradients.h +++ b/libdeepgalois/include/deepgalois/layers/GluonGradients.h @@ -40,6 +40,8 @@ class GluonGradients { std::vector> _mirrorRanges; public: + bool is_a_graph() { return true; } + /** * Save weight gradients + number of them (i.e. size). * Then setup mirror metadata for Gluon to use during setup. diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 2eaba6e90d..fff1d03ed4 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -16,7 +16,7 @@ namespace galois { // TODO remove the need to hardcode this path //! 
Path to location of all gnn files static const std::string default_gnn_dataset_path = - "/net/ohm/export/iss/inputs/Learning/"; + "/home/hochan/inputs/Learning/"; //! Helper struct to maintain start/end/size of any particular range. Mostly //! used for mask ranges. From 7bd324a6ebbb46b1eb0201950b08a9be28db5adb Mon Sep 17 00:00:00 2001 From: Hochan Lee Date: Wed, 14 Jun 2023 00:14:09 -0500 Subject: [PATCH 601/660] Add timers for time breakdown --- libgnn/include/galois/GraphNeuralNetwork.h | 2 +- libgnn/src/GraphNeuralNetwork.cpp | 28 ++++++++++++++++++- libgnn/src/graphs/GNNGraph.cpp | 15 ++++++++-- libgnn/src/layers/GraphConvolutionalLayer.cpp | 11 ++++++++ libgnn/src/layers/SAGELayer.cpp | 21 ++++++++++++-- 5 files changed, 71 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index a813378116..7aa859c84c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -232,7 +232,7 @@ class GraphNeuralNetwork { private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; - bool timers_on_{false}; + bool timers_on_{true}; void EnableTimers() { timers_on_ = true; diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp index 90fa6fd009..201da985d5 100644 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ b/libgnn/src/GraphNeuralNetwork.cpp @@ -262,12 +262,22 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { std::vector subgraph_layer_sizes; // this subgraph only needs to be created once if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + galois::StatTimer total_subgraph_construction_timer("TotalSubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); + total_subgraph_construction_timer.start(); + + setup_neighborhood_sample_timer.start(); // Setup the subgraph to only be the training graph size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + subgraph_layer_sizes.emplace_back(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + edge_sampling_timer.start(); // gnn_layers_.back()->ResizeRows(local_seed_node_count); for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); back_iter++) { @@ -290,8 +300,12 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + edge_sampling_timer.stop(); + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); + total_subgraph_construction_timer.stop(); } galois::StatTimer epoch_timer("TrainingTime", kRegionName); @@ -327,14 +341,20 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { // beginning of epoch sampling (no minibatches) if (config_.do_sampling() && !config_.train_minibatch_size()) { galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); 
mb_timer.start(); + setup_neighborhood_sample_timer.start(); size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); // gnn_layers_.back()->ResizeRows(local_seed_node_count); galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", local_seed_node_count); size_t num_sampled_layers = 0; + edge_sampling_timer.start(); // work backwards on GCN/SAGE layers // loop backward and find last GCN/SAGE (main) layer to disable activation for (auto back_iter = gnn_layers_.rbegin(); @@ -358,8 +378,11 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { num_sampled_layers++; } } + edge_sampling_timer.stop(); // resize layer matrices + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); mb_timer.stop(); } @@ -386,6 +409,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); galois::StatTimer sample_time("MinibatchSampling", kRegionName); galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); mb_timer.start(); galois::Timer batch_timer; @@ -454,7 +478,9 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { sample_time.stop(); // resize layer matrices + subgraph_construction_timer.start(); CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); CorrectBackwardLinks(); // XXX resizes above only work for SAGE layers; will break if other @@ -659,7 +685,7 @@ float galois::GraphNeuralNetwork::Train(size_t num_epochs) { uint64_t average_epoch_time = epoch_timer.get() / num_epochs; galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", average_epoch_time); - DisableTimers(); + //DisableTimers(); // disable subgraph graph_->DisableSubgraph(); graph_->EnableSubgraphChooseAll(); diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 1c7d19040b..4a83753670 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -716,11 +716,22 @@ float galois::graphs::GNNGraph::GetGlobalAccuracy( float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( PointerWithSize predictions, GNNPhase phase, bool sampling) { + galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); + galois::StatTimer global_accuracy_for_singleclass_timer("GetGlobalAccuracyForSingleClass"); + galois::StatTimer global_accuracy_for_multiclass_timer("GetGlobalAccuracyForMultiClass"); + global_accuracy_timer.start(); + float accuracy{0}; if (is_single_class_label()) { - return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.start(); + accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.stop(); } else { - return GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.start(); + accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.stop(); } + global_accuracy_timer.stop(); + return accuracy; } float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index de84903447..b9a9c2120c 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ 
b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -148,6 +148,8 @@ galois::GraphConvolutionalLayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); timer.start(); assert(layer_phase_ == GNNPhase::kTrain); @@ -190,12 +192,14 @@ galois::GraphConvolutionalLayer::BackwardPhase( input_gradient->data(), p_layer_weight_gradients_.data()); } else { #endif + weight_gradient_timer.start(); // temp 2 holds aggregated feature vectors from forward phase galois::CBlasSGEMM( CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -243,11 +247,13 @@ galois::GraphConvolutionalLayer::BackwardPhase( p_out_temp_.data(), p_layer_weight_gradients_.data()); } else { #endif + weight_gradient_timer.start(); galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, layer_dimensions_.input_rows, layer_dimensions_.output_columns, input_data.data(), p_out_temp_.data(), p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -262,7 +268,9 @@ galois::GraphConvolutionalLayer::BackwardPhase( // sync weight gradients; note aggregation sync occurs in the function call // already + weight_gradient_sync_timer.start(); WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); if (!config_.disable_dropout && layer_number_ != 0) { DoDropoutDerivative(); @@ -316,6 +324,7 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>*) { + galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); size_t num_nodes = graph_.size(); size_t last_master = *(graph_.end_owned()); assert(0 == *(graph_.begin_owned())); @@ -393,7 +402,9 @@ void galois::GraphConvolutionalLayer::AggregateAllCPU( galois::chunk_size<1>(), galois::steal(), galois::loopname("ConvolutionalAggregateAll")); // aggregate sync + aggregate_all_sync_timer.start(); graph_.AggregateSync(aggregate_output, column_length); + aggregate_all_sync_timer.stop(); } void galois::GraphConvolutionalLayer::UpdateEmbeddings( diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index bf301e5bdd..032478745d 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -351,6 +351,8 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( galois::PointerWithSize prev_layer_input, galois::PointerWithSize* input_gradient) { galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); + galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", kRegionName); TimerStart(&timer); assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); @@ -431,7 +433,10 @@ galois::PointerWithSize galois::SAGELayer::BackwardPhase( } #endif } + + weight_gradient_sync_timer2.start(); WeightGradientSyncSum2(); + weight_gradient_sync_timer2.stop(); // derivative of aggregation/update // TODO clean up logic here to reduce nesting @@ -553,7 +558,10 @@ 
galois::PointerWithSize galois::SAGELayer::BackwardPhase( p_backward_output_matrix_.data(), false); } } + + weight_gradient_sync_timer.start(); WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); // full gradient needed here; should occur after all updates if (layer_number_ != 0) { @@ -587,16 +595,19 @@ void galois::SAGELayer::AggregateAll( pts, bool is_backward) { std::string agg_timer_name = "AggregateCompute"; + std::string agg_sync_timer_name = "AggregateSync"; size_t num_rows_to_handle; if (!is_backward) { agg_timer_name += "Forward"; + agg_sync_timer_name += "Forward"; num_rows_to_handle = layer_dimensions_.output_rows; } else { agg_timer_name += "Backward"; + agg_sync_timer_name += "Backward"; num_rows_to_handle = layer_dimensions_.input_rows; } galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), kRegionName); TimerStart(&timer); #ifdef GALOIS_ENABLE_GPU @@ -617,8 +628,10 @@ void galois::SAGELayer::AggregateAll( TimerStop(&timer); // aggregate sync + aggregate_all_sync_timer.start(); graph_.AggregateSync(aggregate_output, column_length, is_backward, num_rows_to_handle); + aggregate_all_sync_timer.stop(); #ifdef GALOIS_ENABLE_GPU } #endif @@ -728,7 +741,8 @@ void galois::SAGELayer::AggregateAllCPU( } } }, - galois::chunk_size<1>(), galois::steal()); + galois::chunk_size<1>(), galois::steal(), + galois::loopname("SAGEAggregateAll")); } void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, @@ -854,10 +868,13 @@ void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { + galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); + total_gradient_timer.start(); optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, trainable_layer_number); if (!sage_config_.disable_concat) { second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, p_layer_weights_2_, 0); } + total_gradient_timer.stop(); } From 5ab5c10238c49e5e1d0b15a14e6a5a998ab784cf Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Tue, 25 Jul 2023 03:35:18 -0500 Subject: [PATCH 602/660] WMD CSV-based graph ingestion in Galois (#3) This commit enables Galois/Gluon to read WMD CSV-based graphs. This is temporary code that reads the whole graph into memory on each machine, which does not scale. It will be updated to scalable graph ingestion with new dynamic graph data types (e.g., Log-structured CSR). For now, users can test the WFs and ISBs with the WMD inputs through this.
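A minimal usage sketch, adapted from the libcusp/test/shad-dist-graph.cpp test added in this patch (the CSV path and the partitioning-policy template argument below are placeholders; WMD node and edge types are both stored as uint64_t):

    #include "galois/graphs/CuSPPartitioner.h"

    // Partition a WMD CSV graph: the new useShad flag switches CuSP to the
    // SHAD ingestion path, and the WMD graph is treated as symmetric.
    auto graph = galois::cuspPartitionGraph<SomePartitionPolicy, uint64_t, uint64_t>(
        "path/to/wmd-data.csv", galois::CUSP_CSR, galois::CUSP_CSR,
        /*useShad*/ true, /*symmetricGraph*/ true);

The test additionally constructs a shad::ShadGraphConverter and calls readSHADFile() on the same file to cross-check the global node and edge counts against the partitioned graph.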
--- CMakeLists.txt | 7 +- README_SHAD.md | 57 ++ libcusp/CMakeLists.txt | 2 + .../include/galois/graphs/CuSPPartitioner.h | 8 +- .../include/galois/graphs/DistributedGraph.h | 258 ++++++ libcusp/include/galois/graphs/NewGeneric.h | 180 ++++- libcusp/test/CMakeLists.txt | 2 + libcusp/test/shad-dist-graph.cpp | 118 +++ libdeepgalois/include/deepgalois/types.h | 1 + .../include/galois/graphs/BufferedGraph.h | 43 +- libgalois/include/shad/DataTypes.h | 734 ++++++++++++++++++ libgalois/include/shad/Graph.h | 169 ++++ libgalois/include/shad/GraphTypes.h | 71 ++ libgalois/include/shad/ShadGraphConverter.h | 712 +++++++++++++++++ libgnn/CMakeLists.txt | 7 +- libgnn/include/galois/graphs/GNNGraph.h | 5 +- libgnn/src/graphs/GNNGraph.cpp | 19 +- lonestar/analytics/distributed/CMakeLists.txt | 1 - lonestar/gnn/include/DistributedGraphLoader.h | 13 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 5 + .../libdistbench/include/DistBench/Input.h | 67 +- lonestar/libdistbench/src/Input.cpp | 5 + lonestar/libgnnbench/src/Input.cpp | 8 +- .../scientific/cpu/longestedge/test/catch.hpp | 7 + 24 files changed, 2418 insertions(+), 81 deletions(-) create mode 100644 README_SHAD.md create mode 100644 libcusp/test/CMakeLists.txt create mode 100644 libcusp/test/shad-dist-graph.cpp create mode 100644 libgalois/include/shad/DataTypes.h create mode 100644 libgalois/include/shad/Graph.h create mode 100644 libgalois/include/shad/GraphTypes.h create mode 100644 libgalois/include/shad/ShadGraphConverter.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 1eaa1e1e0a..88eaa64d74 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -38,7 +38,7 @@ set(USE_ARCH native CACHE STRING "Optimize for a specific processor architecture set(USE_DEEPGALOIS OFF CACHE BOOL "Use gnn apps as well as the DeepGalois library") set(USE_MKL_BLAS OFF CACHE BOOL "Use MKL for BLAS") # TODO; this is GNN related; find better way to do than hardcode -SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) +#SET(CUDA_HOME /org/centers/cdgc/cuda/cuda-10.2) # This option is automatically handled by CMake. # It makes add_library build a shared lib unless STATIC is explicitly specified. @@ -141,8 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(INTEL_ROOT /opt/apps/sysnet/intel/20.0) - SET(MKL_ROOT ${INTEL_ROOT}/mkl) + SET(MKL_ROOT /home/hochan/intel/oneapi/mkl/2023.1.0) find_package(MKL REQUIRED) message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") if (MKL_FOUND) @@ -151,7 +150,7 @@ if(USE_MKL_BLAS) endif() endif() -SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) +#SET(OPENBLAS_ROOT /org/centers/cdgc/openblas/gcc8.1) if(USE_OPENBLAS) find_package(OpenBLAS) message(STATUS "OpenBLAS: ${OPENBLAS_INCLUDE_DIRS}") diff --git a/README_SHAD.md b/README_SHAD.md new file mode 100644 index 0000000000..4253bb0e55 --- /dev/null +++ b/README_SHAD.md @@ -0,0 +1,57 @@ +README related to SHAD input graph ingestion +(Including some notes for other workflows) +This README is for internal use. +This README will be refined with more concrete information later. + +1. CMakeLists paths: +The current CMake in Galois uses hard-coded paths for CUDA_HOME, +OPENBLAS_ROOT, INTEL_COMPILER_LIBRARIES, and MKL_LIBRARIES. +Please set those variables based on your environment. + + +2. Assumptions regarding SHAD WMD graph formats: +We assume that in SHAD WMD graph formats, each node and edge has a single type, +and those types are ALWAYS uint64_t.
+The current Galois does not support node/edge properties (programmers can +implement a struct containing multiple fields, but that is not per-property +access like getData(n), etc.), and so we store those SHAD types in the node +and edge data. +If you need types other than uint64_t, you should add new execution paths for +them. + + +3. Limitations of the current SHAD graph ingestion module: +In the original CuSP, each host reads parts of the .gr graph file and constructs +the in-memory format. In this case, each host does not need to load the full graph +in its memory space. This is possible because the .gr file is CSR and each component, +such as the outgoing edge indices, outgoing edge destinations, and outgoing edge +data, is stored consecutively. + +However, in the SHAD graph format, the components are not stored consecutively. +They are unsorted; for example, edges and nodes can be stored in an interleaved +manner. Therefore, it is not possible to read partial graphs with +the original method. + +The current SHAD graph ingestion does not aim for a scalable method, but rather +to make SHAD graphs work in Galois so that the workflows can proceed; +each host therefore reads the FULL graph into memory. This should NOT be the final +artifact, since our long-run target graphs should exceed a single machine's memory. +But for the immediate goal and the target data sets, I assume that it is fine +for now. + +The UT team is currently working on new graph formats for dynamic graphs, and on +scalable SHAD graph ingestion across hosts. + +4. TODO: +CuSP marks training/test/validation nodes while it is partitioning a graph. +This is not implemented yet for a SHAD graph. +It will be added in a GNN/feature construction branch. + +5. Requirements: +Galois-GNN requires the additional packages listed below on top of the requirements of Galois.
+You can use older/newer versions but let me (hochan) also list the versions that I have used: +1) Intel MKL: 2023.1.0 +2) Intel Compiler (including runtime libraries): 2023.0.0 +3) Intel Onedpl-devel library: 2023.1.0 +4) Intel OpenMP: 2023.0.0 + diff --git a/libcusp/CMakeLists.txt b/libcusp/CMakeLists.txt index 2cc6e1714d..67b603019e 100644 --- a/libcusp/CMakeLists.txt +++ b/libcusp/CMakeLists.txt @@ -27,3 +27,5 @@ install(TARGETS galois_cusp COMPONENT lib INCLUDES DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}" ) + +add_subdirectory(test) diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6df9707a27..6b7fef6dab 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,6 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in + * @param useShad "true" if the passed graph file format is a SHAD WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -83,7 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useShad = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -126,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, cuspAsync, cuspStateRounds, useTranspose, + inputToUse, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 0e3e5fa43c..415afba33d 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -347,6 +347,203 @@ class DistGraph { increment_evilPhase(); } + /** + * Given the number of global nodes, compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host. Considers + * ONLY nodes and not edges. + * + * @param numGlobalNodes The number of global nodes to divide + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + void computeMastersBlockedNodes(uint64_t numGlobalNodes, + const std::vector& scalefactor, + unsigned DecomposeFactor = 1) { + uint64_t numNodes_to_divide = numGlobalNodes; + if (scalefactor.empty() || (numHosts * DecomposeFactor == 1)) { + for (unsigned i = 0; i < numHosts * DecomposeFactor; ++i) + gid2host.push_back(galois::block_range(uint64_t{0}, numNodes_to_divide, + i, numHosts * DecomposeFactor)); + return; + } + + // TODO: not compatible with DecomposeFactor. + assert(scalefactor.size() == numHosts); + + unsigned numBlocks = 0; + + for (unsigned i = 0; i < numHosts; ++i) { + numBlocks += scalefactor[i]; + } + + std::vector> blocks; + for (unsigned i = 0; i < numBlocks; ++i) { + blocks.push_back( + galois::block_range(uint64_t{0}, numNodes_to_divide, i, numBlocks)); + } + + std::vector prefixSums; + prefixSums.push_back(0); + + for (unsigned i = 1; i < numHosts; ++i) { + prefixSums.push_back(prefixSums[i - 1] + scalefactor[i - 1]); + } + + for (unsigned i = 0; i < numHosts; ++i) { + unsigned firstBlock = prefixSums[i]; + unsigned lastBlock = prefixSums[i] + scalefactor[i] - 1; + gid2host.push_back( + std::make_pair(blocks[firstBlock].first, blocks[lastBlock].second)); + } + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by + * evenly (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the only edges of the node to get + * even blocks. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + void computeMastersBalancedEdges(uint64_t numGlobalNodes, + uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t edgeWeight, + unsigned DecomposeFactor = 1) { + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + + gid2host.resize(numHosts * DecomposeFactor); + for (unsigned d = 0; d < DecomposeFactor; ++d) { + // TODO(hc): + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), + numHosts * DecomposeFactor, outIndices, scalefactor); + gid2host[id + d * numHosts].first = *(r.first.first); + gid2host[id + d * numHosts].second = *(r.first.second); + } + + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) { + continue; + } + galois::runtime::SendBuffer b; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gSerialize(b, gid2host[id + d * numHosts]); + } + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + for (unsigned d = 0; d < DecomposeFactor; ++d) { + galois::runtime::gDeserialize(b, gid2host[p->first + d * numHosts]); + } + ++received; + } + increment_evilPhase(); + +#ifndef NDEBUG + // TODO(hc): + for (unsigned h = 0; h < numHosts; h++) { + if (h == 0) { + assert(gid2host[h].first == 0); + } else if (h == numHosts - 1) { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == numGlobalNodes); + } else { + assert(gid2host[h].first == gid2host[h - 1].second); + assert(gid2host[h].second == gid2host[h + 1].first); + } + } +#endif + } + + /** + * Given the number of global nodes and edges, + * compute the masters for each node by evenly + * (or unevenly as specified by scale factor) + * blocking the nodes off to assign to each host while taking + * into consideration the edges of the node AND the node itself. + * + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. Ignored + * in this function currently. 
+ * + * @todo make this function work with decompose factor + */ + void computeMastersBalancedNodesAndEdges( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, const std::vector& scalefactor, + uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + if (nodeWeight == 0) { + nodeWeight = numGlobalEdges / numGlobalNodes; // average degree + } + if (edgeWeight == 0) { + edgeWeight = 1; + } + + auto& net = galois::runtime::getSystemNetworkInterface(); + gid2host.resize(numHosts); + auto r = galois::graphs::divideNodesBinarySearch( + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, + id, numHosts, outIndices, scalefactor); + gid2host[id].first = *r.first.first; + gid2host[id].second = *r.first.second; + for (unsigned h = 0; h < numHosts; ++h) { + if (h == id) + continue; + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, gid2host[id]); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + unsigned received = 1; + while (received < numHosts) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + assert(p->first != id); + auto& b = p->second; + galois::runtime::gDeserialize(b, gid2host[p->first]); + ++received; + } + increment_evilPhase(); + } + + protected: /** * Wrapper call that will call into more specific compute masters @@ -401,6 +598,67 @@ class DistGraph { return numNodes_to_divide; } + /** + * Wrapper call that will call into more specific compute masters + * functions that compute masters based on nodes, edges, or both. + * + * @param masters_distribution method of masters distribution to use + * @param numGlobalNodes The number of global nodes to divide + * @param numGlobalEdges The number of global edges to divide + * @param outIndices A complete outgoing edge range array of CSR to calculate + * range + * @param scalefactor A vector that specifies if a particular host + * should have more or less than other hosts + * @param nodeWeight weight to give nodes when computing balance + * @param edgeWeight weight to give edges when computing balance + * @param DecomposeFactor Specifies how decomposed the blocking + * of nodes should be. For example, a factor of 2 will make 2 blocks + * out of 1 block had the decompose factor been set to 1. 
+ */ + uint64_t computeMasters(MASTERS_DISTRIBUTION masters_distribution, + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint64_t* outIndices, + const std::vector& scalefactor, + uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, + unsigned DecomposeFactor = 1) { + galois::Timer timer; + timer.start(); + uint64_t numNodes_to_divide = numGlobalNodes; + + // compute masters for all nodes + switch (masters_distribution) { + case BALANCED_MASTERS: + computeMastersBlockedNodes( + numGlobalNodes, scalefactor, DecomposeFactor); + break; + case BALANCED_MASTERS_AND_EDGES: + computeMastersBalancedNodesAndEdges( + numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, nodeWeight, edgeWeight, DecomposeFactor); + break; + case BALANCED_EDGES_OF_MASTERS: + default: + computeMastersBalancedEdges( + numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); + break; + } + + timer.stop(); + + galois::runtime::reportStatCond_Tmax( + GRNAME, "MasterDistTime", timer.get()); + +#if 0 + galois::gDebug( + "[", id, "] Master distribution time : ", timer.get_usec() / 1000000.0f, + " seconds to read ", g.num_bytes_read(), " bytes in ", g.num_seeks(), + " seeks (", g.num_bytes_read() / (float)timer.get_usec(), " MBPS)"); +#endif + return numNodes_to_divide; + } + + //! reader assignment from a file //! corresponds to master assignment if using an edge cut void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 4ff7832f3e..49c96a965c 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -29,6 +29,9 @@ #include "galois/graphs/DistributedGraph.h" #include "galois/DReducible.h" + +#include "shad/ShadGraphConverter.h" + #include #include @@ -220,7 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool cuspAsync = true, uint32_t stateRounds = 100, bool transpose = false, + bool useShad = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -240,17 +244,65 @@ class NewDistGraphGeneric : public DistGraph { return; } - galois::graphs::OfflineGraph g(filename); + galois::graphs::OfflineGraph* offlineGraph{nullptr}; + + shad::ShadGraphConverter shadConverter; + galois::graphs::BufferedGraph bufGraph; + bufGraph.resetReadCounters(); - base_DistGraph::numGlobalNodes = g.size(); - base_DistGraph::numGlobalEdges = g.sizeEdges(); std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - base_DistGraph::computeMasters(md, g, dummy, nodeWeight, edgeWeight); + if (useShad) { + std::cout << "Construct a distributed graph from SHAD WMD format.\n"; + uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + // Read and load the whole SHAD WMD dataset to memory. + // TODO(hc): Note that this reads the entire graph. + // We will improve this to read partial graphs + // on each host later. For now, the main focus is + // to enable WMD dataset for the workflows. 
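+        // In outline, the SHAD path below proceeds as follows:
+        //   1. readSHADFile() parses the whole CSV and returns the global
+        //      node and edge counts;
+        //   2. constructNodeArrays() builds the global out-index and node
+        //      data arrays;
+        //   3. computeMasters() assigns a contiguous node range to each
+        //      host; the edge arrays and the BufferedGraph for the local
+        //      partition are constructed further below once that range is
+        //      known.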
+ shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + base_DistGraph::numGlobalNodes = numGlobalNodes; + base_DistGraph::numGlobalEdges = numGlobalEdges; + // Construct node data/outgoing index range arrays + // for a GLOBAL array, not a local array. + // Later, parts for the local graph partition will be + // extracted and be used after graph partitioning is done. + // Basically, the idea that is used here is to mimic + // the BufferedGraph. BufferedGraph does not load the whole arrays + // to memory, but only reads and loads parts of the arrays from + // an input file. It is possible since the .gr files are stored + // in a CSR format, and in a consecutive manner. We can know + // offset for each data in advance. + // However, we cannot achieve it from a SHAD graph file since + // it is not consecutive, but edges and nodes are mixed. + // Due to this, we construct nodes' array for a global graph + // here. This array will be restructured after CuSP decides + // local nodes. + // TODO(hc): UT will improve and redesign this part to + // get scalability. + shadConverter.constructNodeArrays( + 0, numGlobalNodes, numGlobalNodes); + + // Compute master proxies by using the number of global nodes + // and edges. + base_DistGraph::computeMasters( + md, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, + edgeWeight); + } else { + offlineGraph = new galois::graphs::OfflineGraph(filename); + base_DistGraph::numGlobalNodes = offlineGraph->size(); + base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, edgeWeight); + } } else { + if (useShad) { + GALOIS_DIE("SHAD graph format does not support master block file"); + } galois::gInfo("Getting reader assignment from file"); - base_DistGraph::readersFromFile(g, masterBlockFile); + base_DistGraph::readersFromFile(*offlineGraph, masterBlockFile); } graphPartitioner = std::make_unique( @@ -261,17 +313,18 @@ class NewDistGraphGeneric : public DistGraph { // get training nodes and split evenly among hosts std::vector trainPoints = this->getGNNBreakpoints(filename); + // TODO(hc) if (!trainPoints.empty()) { std::vector testDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, g, trainPoints[0], trainPoints[1]); + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], trainPoints[1]); std::vector restDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, g, trainPoints[1], g.size()); + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], offlineGraph->size()); // create global distribution of edges - std::vector mappings(g.size()); + std::vector mappings(offlineGraph->size()); galois::do_all( galois::iterate((size_t)0, (size_t)base_DistGraph::numHosts), [&](size_t h) { @@ -294,13 +347,6 @@ class NewDistGraphGeneric : public DistGraph { } } - uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = - g.edge_begin(nodeBegin); - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = - g.edge_begin(nodeEnd); - // signifies how many outgoing edges a particular host should expect from // this host std::vector> numOutgoingEdges; @@ -321,13 +367,59 @@ class NewDistGraphGeneric : public DistGraph { // phase 0 galois::gDebug("[", base_DistGraph::id, "] Starting 
graph reading."); - galois::graphs::BufferedGraph bufGraph; - bufGraph.resetReadCounters(); galois::StatTimer graphReadTimer("GraphReading", GRNAME); graphReadTimer.start(); - bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, - *edgeEnd, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges); + + uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + + if (!useShad) { + // If the input graph is not SHAD WMD format, + // construct a buffered graph from the file directly, as ordinary. + typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = + offlineGraph->edge_begin(nodeBegin); + typename galois::graphs::OfflineGraph::edge_iterator edgeEnd = + offlineGraph->edge_begin(nodeEnd); + bufGraph.loadPartialGraph(filename, nodeBegin, nodeEnd, *edgeBegin, + *edgeEnd, base_DistGraph::numGlobalNodes, + base_DistGraph::numGlobalEdges); + } else { + // Now construct arrays for in-memory CSR. + // In case of the node out-going edge range array and + // the node data array, it will extract parts corresponding to + // local graph paritition from the arrays holding the global + // array information. + // Edge destination and data arrays are constructed based on + // unrefined maps constructed from SHAD graph reading. + // NOTE that those arrays all store GLOBAL node ids. + // For example, edge destination array's size is equal + // to the number of local edges, but its destination ID is + // global node IDs, not local node IDs. + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. + uint64_t *outIndexBuffer = shadConverter.getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = + (nodeBegin == 0)? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter.extractLocalOutIndexArray( + nodeBegin, nodeEnd); + + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter.constructEdgeArrays( + nodeBegin, edgeBegin, numLocalNodes, numLocalEdges); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter.constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, &bufGraph); + } + graphReadTimer.stop(); galois::gDebug("[", base_DistGraph::id, "] Reading graph complete."); @@ -455,6 +547,15 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.stop(); galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); + if (useShad) { + // Different from the gr format file that has been used by Galois + // and does not contain node data in the file, + // a SHAD graph file has a single type for each node, and it + // is considered as node data. + // This function constructs and sets node data (type). + assignNodeDataFromSHADProp(&shadConverter); + } + // report state rounds if (base_DistGraph::id == 0) { galois::runtime::reportStat_Single(GRNAME, "CuSPStateRounds", @@ -503,6 +604,43 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /** + * @brief Assign a SHAD node type to a node data. 
+ * + * @detail Different from the gr format file that has been used by Galois + * and does not contain node data in the file, + * a SHAD graph file has a single type for each node, and it + * considered as node data. This function constructs and sets node + * data based on that. + * This function assumes that the node type's data type is always + * uint64_t. + * + * @tparam T Node data type + * + * @param shadConverter SHAD graph converter holding node data from a + * SHAD file. + */ + template >* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); + uint64_t* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + std::cout << "lid :" << lid << " is set to " << + this->getData(lid) << "\n"; + }); + } + + template >* = nullptr> + void assignNodeDataFromSHADProp( + [[maybe_unused]] shad::ShadGraphConverter* shadConverter) {} + /** * For each other host, determine which nodes that this host needs to get * info from diff --git a/libcusp/test/CMakeLists.txt b/libcusp/test/CMakeLists.txt new file mode 100644 index 0000000000..710627302c --- /dev/null +++ b/libcusp/test/CMakeLists.txt @@ -0,0 +1,2 @@ +add_executable(shad_dist_graph shad-dist-graph.cpp) +target_link_libraries(shad_dist_graph galois_gnn) diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp new file mode 100644 index 0000000000..fe71231295 --- /dev/null +++ b/libcusp/test/shad-dist-graph.cpp @@ -0,0 +1,118 @@ +/* + * This file belongs to the Galois project, a C++ library for exploiting + * parallelism. The code is being released under the terms of the 3-Clause BSD + * License (a copy is located in LICENSE.txt at the top-level directory). + * + * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. + * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS + * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF + * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF + * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH + * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances + * shall University be liable for incidental, special, indirect, direct or + * consequential damages or loss of profits, interruption of business, or + * related expenses which may arise from use of Software or Documentation, + * including but not limited to those resulting from defects in Software and/or + * Documentation, or loss or inaccuracy of data of any kind. 
+ */ + +#include + +#include "galois/Galois.h" +#include "galois/graphs/CuSPPartitioner.h" +#include "shad/ShadGraphConverter.h" + +int main() { + galois::DistMemSys G; + unsigned M = galois::substrate::getThreadPool().getMaxThreads(); + //M = 1; + galois::setActiveThreads(M); + + shad::ShadGraphConverter shadConverter; + size_t numNodes{0}, numEdges{0}; + + std::string filename = "/home/hochan/data.csv"; + shadConverter.readSHADFile(filename, &numNodes, &numEdges); + std::unique_ptr> + graph = galois::cuspPartitionGraph( + filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + + galois::DGAccumulator sumGlobalNodes; + galois::DGAccumulator sumGlobalEdges; + + sumGlobalNodes.reset(); + sumGlobalEdges.reset(); + + sumGlobalNodes += graph->numMasters(); + sumGlobalEdges += graph->sizeEdges(); + + uint64_t reducedSumGlobalNodes = sumGlobalNodes.reduce(); + uint64_t reducedSumGlobalEdges = sumGlobalEdges.reduce(); + + assert(reducedSumGlobalNodes == numNodes); + assert(reducedSumGlobalNodes == graph->globalSize()); + assert(reducedSumGlobalEdges == numEdges); + assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; + { + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n"; + } + } + fp.close(); + } + + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { continue; } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); + for (uint32_t i = 0; i < graph->size(); ++i) { + fp << i << ", " << graph->getGID(i) << ", " << + graph->getData(i) << "\n"; + } + fp.close(); + } + } + { + for (uint32_t host = 0; host < numHosts; ++host) { + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".mirror"); + for (uint32_t i = 0; + i < graph->getMirrorNodes()[host].size(); ++i) { + uint64_t srcglobal = graph->getMirrorNodes()[host][i]; + uint32_t src = graph->getLID(srcglobal); + fp << "src:" << src << ", global:" << srcglobal << ", node data:" << + graph->getData(src) << "\n" << std::flush; + + assert(shadConverter.checkNode(srcglobal, graph->getData(src))); + fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + //if (std::distance(graph->edge_begin(src), graph->edge_end(src)) > 0) { + for (auto e : graph->edges(src)) { + uint32_t dst = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dst); + assert(shadConverter.checkNode(dstglobal, graph->getData(dst))); + assert(shadConverter.checkEdge(srcglobal, dstglobal, + std::distance(graph->edge_begin(src), e), + graph->getEdgeData(e))); + fp << "\t edge dst " << dstglobal << ", type: " << + graph->getEdgeData(e) << "\n" << std::flush; + } + } + fp.close(); + } + } + + return 0; +} diff --git a/libdeepgalois/include/deepgalois/types.h b/libdeepgalois/include/deepgalois/types.h index 43d55eb331..17dd05b15d 100644 --- a/libdeepgalois/include/deepgalois/types.h +++ b/libdeepgalois/include/deepgalois/types.h @@ -3,6 +3,7 @@ #include #include #include +#include // TODO namespace diff --git 
a/libgalois/include/galois/graphs/BufferedGraph.h b/libgalois/include/galois/graphs/BufferedGraph.h index 22cc10cc11..956c9d7d7a 100644 --- a/libgalois/include/galois/graphs/BufferedGraph.h +++ b/libgalois/include/galois/graphs/BufferedGraph.h @@ -277,6 +277,46 @@ class BufferedGraph { */ BufferedGraph() { resetReadCounters(); } + /** + * @brief Construct a buffered graph from parameters paseed. + * The array parameters should be constructed outside. + * + * @param _outIndexBuffer Outgoing neighbors range for each node + * @param _edgeDestBuffer Outgoing edge destination nodes + * @param _edgeDataBuffer Outgoing edge data + * @param _globalsize The number of global nodes + * @param _globalEdgeSize The number of global edges + * @param _numLocalNodes The number of local nodes + * @param _numLocalEdges The number of local edges + * @param _nodeOffset Node offsets on the global node space of + * the current host + * @param _edgeOffset Edge offsets on the global edge space of + * the current host + */ + void constructFrom(uint64_t* _outIndexBuffer, uint32_t* _edgeDestBuffer, + EdgeDataType* _edgeDataBuffer, uint32_t _globalSize, + uint64_t _globalEdgeSize, uint32_t _numLocalNodes, + uint64_t _numLocalEdges, uint64_t _nodeOffset, + uint64_t _edgeOffset) { + assert(_outIndexBuffer != nullptr); + assert(_edgeDestBuffer != nullptr); + assert(_edgeDataBuffer != nullptr); + outIndexBuffer = _outIndexBuffer; + edgeDestBuffer = _edgeDestBuffer; + edgeDataBuffer = _edgeDataBuffer; + globalSize = _globalSize; + globalEdgeSize = _globalEdgeSize; + numLocalNodes = _numLocalNodes; + numLocalEdges = _numLocalEdges; + nodeOffset = _nodeOffset; + edgeOffset = _edgeOffset; + resetReadCounters(); + graphLoaded = true; + numBytesReadOutIndex += sizeof(uint64_t); + numBytesReadEdgeDest += sizeof(uint64_t); + numBytesReadEdgeData += sizeof(uint64_t); + } + /** * On destruction, free allocated buffers (if necessary). */ @@ -430,10 +470,9 @@ class BufferedGraph { } assert(nodeOffset <= globalNodeID); assert(globalNodeID < (nodeOffset + numLocalNodes)); - numBytesReadOutIndex += sizeof(uint64_t); - uint64_t localNodeID = globalNodeID - nodeOffset; + return EdgeIterator(outIndexBuffer[localNodeID]); } diff --git a/libgalois/include/shad/DataTypes.h b/libgalois/include/shad/DataTypes.h new file mode 100644 index 0000000000..84dc770bee --- /dev/null +++ b/libgalois/include/shad/DataTypes.h @@ -0,0 +1,734 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// SHAD +// +// The Scalable High-performance Algorithms and Data Structure Library +// +//===----------------------------------------------------------------------===// +// +// Copyright 2018 Battelle Memorial Institute +// +// Licensed under the Apache License, Version 2.0 (the "License"); you may not +// use this file except in compliance with the License. You may obtain a copy +// of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, WITHOUT +// WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the +// License for the specific language governing permissions and limitations +// under the License. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_DATATYPES_H_ +#define LIBGALOIS_INCLUDE_SHAD_DATATYPES_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +namespace shad { + +/// @brief Data conversion utilities. +/// +/// Please refer to methods specialization to check +/// which data types are supported. +namespace data_types { + + /// @brief Enumeration of supported data types. + /// + /// The enumeration is meant to be used when parsing data + /// (i.e. type information is not known at compile time). + enum data_t { + STRING = 0, // string support is currenlty limited + CHARS, // sequence of characters + UINT, // unsigned, binds by default to uint64_t + INT, // int, binds by default to int64_t + FLOAT, // float, binds by default to float + DOUBLE, // double, binds by default to double + BOOL, // bool, binds by default to bool + DATE, // date in "%y-%m-%d" format, binds by default to time_t + USDATE, // date in "%m/%d/%y" format, binds by default to time_t + DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, + // binds by default to time_t + IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t + LIST_UINT, // Sequence of unsigneds, support currently limited + LIST_INT, // Sequence of integers, support currently limited + LIST_DOUBLE, // Sequence of doubles, support currently limited + NONE + }; + + /// @brief Data structures for storing schema information. + /// Given a tuple of data, it associates elements labels and data types + /// to their position in the tuple. + using schema_t = std::vector>; + + /// @brief Encoded null value. + /// @tparam ENC_t encoding type. + /// @return Encoded null value for ENC_t. + template + constexpr ENC_t kNullValue = ENC_t(); + + /// @brief Encoded null value for uint64_t. + /// @return Null encoded value for uint64_t. + template <> + constexpr uint64_t kNullValue = std::numeric_limits::max(); + + /// @brief Encoded null value for time_t (same as long). + /// @return Null encoded value for time_t (same as long). + template <> + constexpr time_t kNullValue = std::numeric_limits::max(); + + /// @brief Encoded null value for double. + /// @return Null encoded value for double. + template <> + constexpr double kNullValue = std::numeric_limits::max(); + + /// @brief Encode Function + /// Available specializations: + /// ENC_t = uint64_t, IN_t = std::string + /// @tparam ENC_t The type to encode to. + /// @tparam IN_t The type (format) of the data to encode. + /// @tparam DT data_types::data_t of the data to encode. + /// @param in Data to encode. + /// @return Encoded data. + template + ENC_t encode(IN_t &in); + + /// @brief Encode Function + /// Available specializations: + /// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t + /// @tparam ENC_t The type to encode to. + /// @tparam IN_t The type of the data to encode. + /// @param in Data to encode. + /// @return Encoded data. 
+ template + ENC_t encode(IN_t &in); + + template + ENC_t encode(IN_t &in, data_t dt); + + template + std::array encode(std::string &str) { + std::array res; + if (str.size() > 0) { + memcpy(res.data(), str.data(), sizeof(ENC_t)*MAX_s); + } else { + res.fill('\0'); + } + return res; + } + + template + typename std::enable_if<(std::is_arithmetic::value or (sizeof(DEC_t) == sizeof(ENC_t))), DEC_t>::type + decode(ENC_t encvalue) { + DEC_t val; + memcpy(&val, &encvalue, sizeof(DEC_t)); + return val; + } + + template + DEC_t decode(ENC_t value); + + template + typename std::enable_if<(ST==data_t::INT), int64_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::UINT), uint64_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::FLOAT), float>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::DOUBLE), double>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::BOOL), bool>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + typename std::enable_if<(ST==data_t::DATE), std::time_t>::type + decode(ENC_t encvalue) { + return decode(encvalue); + } + + template + std::string decode(std::array &val) { + return std::string(reinterpret_cast(val.data())); + } +} // namespace data_types + + +// ENCODE METHODS SPECIALIZATION FOR UINT64 ENC_t +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value; + try { value = std::stoull(str); } + catch(...) { value = kNullValue; } + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + int64_t value; + try { value = stoll(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval; + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + uint64_t encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) 
{ + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +uint64_t data_types::encode(std::string &str) { + uint64_t value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +// ENCODE METHODS SPECIALIZATION FOR DOUBLE ENC_t + +template<> inline +double data_types::encode(std::string &str) { + double encval; + uint64_t value; + try { value = std::stoull(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double encval; + int64_t value; + try { value = stoll(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + double encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +double data_types::encode(std::string &str) { + double encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + uint64_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) { + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + double encval; + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) 
{ + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +template<> inline +double data_types::encode(std::string &str) { + double value = 0; + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + memcpy(&value, &t, sizeof(value)); + return value; +} + +// ENCODE METHODS SPECIALIZATION FOR TIME_T ENC_t (same as long) +template<> inline +time_t data_types::encode(std::string &str) { + time_t value; + try { value = std::stoul(str); } + catch(...) { value = kNullValue; } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + int64_t value; + try { value = stol(str); } + catch(...) { return kNullValue; } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval; + float value; + try { value = stof(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval; + double value; + try { value = stod(str); } + catch(...) { return kNullValue; } + memcpy(&encval, &value, sizeof(value)); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + if (str.size() == 0) return kNullValue; + time_t encval = 1; + if ((str == "F") || (str == "f") || (str == "FALSE") + || (str == "false") || (str == "0")) encval = 0; + return encval; +} + + +template<> inline +time_t data_types::encode(std::string &str) { + time_t encval = 0; + memset(&encval, '\0', sizeof(encval)); + memcpy(&encval, str.c_str(), sizeof(encval)-1); + return encval; +} + +template<> inline +time_t data_types::encode(std::string &str) { + time_t val, value = 0; + std::string::iterator start = str.begin(); + for (unsigned i = 0; i < 4; i ++) { + std::string::iterator end = std::find(start, str.end(), '.'); + try { + val = std::stoull(std::string(start, end)); + } catch(...) { + return kNullValue; + } + if (val < 256) { + value = (value << 8) + val; start = end + 1; + } else { + return kNullValue; + } + } + return value; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%d", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + return t; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%m/%d/%y", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) { + return kNullValue; + } + return t; +} + +template<> inline +time_t data_types::encode(std::string &str) { + struct tm date{}; + date.tm_isdst = -1; + strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); + time_t t; + try { + t = mktime(&date); + } + catch(...) 
{ + return kNullValue; + } + return t; +} + +template +ENC_t data_types::encode(IN_t &in, data_types::data_t dt) { + switch (dt) { +// case data_types::STRING : +// return data_types::encode(in); +// case data_types::CHARS : +// return data_types::encode(in); + case data_types::UINT : + return data_types::encode(in); + case data_types::INT : + return data_types::encode(in); + case data_types::FLOAT : + return data_types::encode(in); + case data_types::DOUBLE : + return data_types::encode(in); + case data_types::BOOL : + return data_types::encode(in); + case data_types::DATE : + return data_types::encode(in); + case data_types::USDATE : + return data_types::encode(in); + case data_types::DATE_TIME : + return data_types::encode(in); + case data_types::IP_ADDRESS : + return data_types::encode(in); + } + return data_types::kNullValue; +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + return std::to_string(value); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + int64_t v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + float v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + double v; + memcpy(&v, &value, sizeof(v)); + return std::to_string(v); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + std::string ipAddr = ""; + uint64_t octets[4]; + for (uint64_t k = 0; k < 4; k ++) {octets[k] = value & 255; value = value >> 8;} + for (uint64_t k = 3; k >= 1; k --) ipAddr += std::to_string(octets[k]) + '.'; + return ipAddr + std::to_string(octets[0]); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + if (value == kNullValue) return ""; + return std::to_string(value); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + time_t t = data_types::decode(value); + char dateString[11]; + strftime(dateString, 11, "%Y-%m-%d", std::localtime(&t)); + return std::string(dateString); +} + +template<> inline +std::string data_types::decode(uint64_t value) { + const char* c = reinterpret_cast(&value); + return std::string(c); +} + +template <> inline +uint64_t data_types::decode(uint64_t encvalue) { + return encvalue; +} +} // namespace shad + +#endif // LIBGALOIS_INCLUDE_SHAD_DATA_TYPES_H_ diff --git a/libgalois/include/shad/Graph.h b/libgalois/include/shad/Graph.h new file mode 100644 index 0000000000..9029b1ef32 --- /dev/null +++ b/libgalois/include/shad/Graph.h @@ -0,0 +1,169 @@ +//TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan +// to make this public. + +//===------------------------------------------------------------*- C++ -*-===// +// +// The AGILE Workflows +// +//===----------------------------------------------------------------------===// +// ** Pre-Copyright Notice +// +// This computer software was prepared by Battelle Memorial Institute, +// hereinafter the Contractor, under Contract No. DE-AC05-76RL01830 with the +// Department of Energy (DOE). All rights in the computer software are reserved +// by DOE on behalf of the United States Government and the Contractor as +// provided in the Contract. 
You are authorized to use this computer software +// for Governmental purposes but it is not to be released or distributed to the +// public. NEITHER THE GOVERNMENT NOR THE CONTRACTOR MAKES ANY WARRANTY, EXPRESS +// OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. This +// notice including this sentence must appear on any copies of this computer +// software. +// +// ** Disclaimer Notice +// +// This material was prepared as an account of work sponsored by an agency of +// the United States Government. Neither the United States Government nor the +// United States Department of Energy, nor Battelle, nor any of their employees, +// nor any jurisdiction or organization that has cooperated in the development +// of these materials, makes any warranty, express or implied, or assumes any +// legal liability or responsibility for the accuracy, completeness, or +// usefulness or any information, apparatus, product, software, or process +// disclosed, or represents that its use would not infringe privately owned +// rights. Reference herein to any specific commercial product, process, or +// service by trade name, trademark, manufacturer, or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or any agency thereof, or Battelle Memorial +// Institute. The views and opinions of authors expressed herein do not +// necessarily state or reflect those of the United States Government or any +// agency thereof. +// +// PACIFIC NORTHWEST NATIONAL LABORATORY +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_GRAPH_H_ +#define LIBGALOIS_INCLUDE_SHAD_GRAPH_H_ + +#include +#include +#include + +#include "DataTypes.h" +#include "GraphTypes.h" + +#define UINT shad::data_types::UINT +#define DOUBLE shad::data_types::DOUBLE +#define USDATE shad::data_types::USDATE +#define ENCODE shad::data_types::encode + +namespace shad { + +class Vertex { + public: + // Vertex id; initially it is set + // to a local node id while CuSP reads a file and constructs + // this vertex. After each host finishes and synchronizes it to construct + // a full CSR graph, it is updated to a global node id. + uint64_t id; + TYPES type; + uint64_t shadKey; + // Number of edges. + // This is incremented while reads a graph. 
+ uint64_t numEdges{0}; + + Vertex () { + this->id = shad::data_types::kNullValue; + this->type = TYPES::NONE; + this->shadKey = shad::data_types::kNullValue; + } + + Vertex (uint64_t id_, TYPES type_, uint64_t shadKey_) { + this->id = id_; + this->type = type_; + this->shadKey = shadKey_; + } + + void incrNumEdges() { + this->numEdges += 1; + } + + uint64_t getNumEdges() { + return this->numEdges; + } +}; + +class Edge { + public: + uint64_t src; // vertex id of src + uint64_t dst; // vertex id of dst + TYPES type; + TYPES src_type; + TYPES dst_type; + uint64_t src_glbid; + uint64_t dst_glbid; + + Edge () { + src = shad::data_types::kNullValue; + dst = shad::data_types::kNullValue; + type = TYPES::NONE; + src_type = TYPES::NONE; + dst_type = TYPES::NONE; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + + Edge (std::vector & tokens) { + if (tokens[0] == "Sale") { + src = ENCODE(tokens[1]); + dst = ENCODE(tokens[2]); + type = TYPES::SALE; + src_type = TYPES::PERSON; + dst_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "Author") { + src = ENCODE(tokens[1]); + type = TYPES::AUTHOR; + src_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") dst = ENCODE(tokens[3]); + else if (tokens[4] != "") dst = ENCODE(tokens[4]); + else if (tokens[5] != "") dst = ENCODE(tokens[5]); + if (tokens[3] != "") dst_type = TYPES::FORUM; + else if (tokens[4] != "") dst_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") dst_type = TYPES::PUBLICATION; + } else if (tokens[0] == "Includes") { + src = ENCODE(tokens[3]); + dst = ENCODE(tokens[4]); + type = TYPES::INCLUDES; + src_type = TYPES::FORUM; + dst_type = TYPES::FORUMEVENT; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "HasTopic") { + dst = ENCODE(tokens[6]); + type = TYPES::HASTOPIC; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") src = ENCODE(tokens[3]); + else if (tokens[4] != "") src = ENCODE(tokens[4]); + else if (tokens[5] != "") src = ENCODE(tokens[5]); + if (tokens[3] != "") src_type = TYPES::FORUM; + else if (tokens[4] != "") src_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") src_type = TYPES::PUBLICATION; + } else if (tokens[0] == "HasOrg") { + src = ENCODE(tokens[5]); + dst = ENCODE(tokens[6]); + type = TYPES::HASORG; + src_type = TYPES::PUBLICATION; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + } +}; + +} // namespace agile::workflow1 + +#endif // GRAPH_H diff --git a/libgalois/include/shad/GraphTypes.h b/libgalois/include/shad/GraphTypes.h new file mode 100644 index 0000000000..eb84e123c2 --- /dev/null +++ b/libgalois/include/shad/GraphTypes.h @@ -0,0 +1,71 @@ +//===------------------------------------------------------------*- C++ -*-===// +// +// The AGILE Workflows +// +//===----------------------------------------------------------------------===// +// ** Pre-Copyright Notice +// +// This computer software was prepared by Battelle Memorial Institute, +// hereinafter the Contractor, under Contract No. DE-AC05-76RL01830 with the +// Department of Energy (DOE). 
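// ---------------------------------------------------------------------------
// Illustrative example (hypothetical row, not part of this patch) of how the
// Edge constructor in Graph.h above maps one WMD CSV record onto a typed
// edge. Only the columns the constructor actually reads (tokens[0] through
// tokens[6]) matter:
//
//   "Sale,1001,2002,,,,,,,"  ->  tokens[0] == "Sale"
//       src  = ENCODE(tokens[1])      // person key "1001"
//       dst  = ENCODE(tokens[2])      // person key "2002"
//       type = TYPES::SALE,  src_type = dst_type = TYPES::PERSON
//
// The reader further below (ShadGraphConverter::readSHADFile) symmetrizes
// every such record by also inserting the mirrored edge with endpoints (and
// endpoint types) swapped and the reverse relation: SALE/PURCHASE,
// AUTHOR/WRITTENBY, INCLUDES/INCLUDEDIN, HASTOPIC/TOPICIN, HASORG/ORGIN.
// ---------------------------------------------------------------------------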
All rights in the computer software are reserved +// by DOE on behalf of the United States Government and the Contractor as +// provided in the Contract. You are authorized to use this computer software +// for Governmental purposes but it is not to be released or distributed to the +// public. NEITHER THE GOVERNMENT NOR THE CONTRACTOR MAKES ANY WARRANTY, EXPRESS +// OR IMPLIED, OR ASSUMES ANY LIABILITY FOR THE USE OF THIS SOFTWARE. This +// notice including this sentence must appear on any copies of this computer +// software. +// +// ** Disclaimer Notice +// +// This material was prepared as an account of work sponsored by an agency of +// the United States Government. Neither the United States Government nor the +// United States Department of Energy, nor Battelle, nor any of their employees, +// nor any jurisdiction or organization that has cooperated in the development +// of these materials, makes any warranty, express or implied, or assumes any +// legal liability or responsibility for the accuracy, completeness, or +// usefulness or any information, apparatus, product, software, or process +// disclosed, or represents that its use would not infringe privately owned +// rights. Reference herein to any specific commercial product, process, or +// service by trade name, trademark, manufacturer, or otherwise does not +// necessarily constitute or imply its endorsement, recommendation, or favoring +// by the United States Government or any agency thereof, or Battelle Memorial +// Institute. The views and opinions of authors expressed herein do not +// necessarily state or reflect those of the United States Government or any +// agency thereof. +// +// PACIFIC NORTHWEST NATIONAL LABORATORY +// operated by +// BATTELLE +// for the +// UNITED STATES DEPARTMENT OF ENERGY +// under Contract DE-AC05-76RL01830 +//===----------------------------------------------------------------------===// + +#ifndef LIBGALOIS_INCLUDE_SHAD_GRAPHTYPES_H_ +#define LIBGALOIS_INCLUDE_SHAD_GRAPHTYPES_H_ + +namespace shad { + +enum class TYPES { + PERSON, + FORUMEVENT, + FORUM, + PUBLICATION, + TOPIC, + PURCHASE, + SALE, + AUTHOR, + WRITTENBY, + INCLUDES, + INCLUDEDIN, + HASTOPIC, + TOPICIN, + HASORG, + ORGIN, + NONE +}; + +} // namespace agile::workflow1 + +#endif // GRAPHTYPES_H diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h new file mode 100644 index 0000000000..5162fc8dfb --- /dev/null +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -0,0 +1,712 @@ +#ifndef LIBGALOIS_INCLUDE_SHAED_GRAPH_READER_H_ +#define LIBGALOIS_INCLUDE_SHAED_GRAPH_READER_H_ + +#include +#include + +#include "galois/graphs/BufferedGraph.h" + +#include "shad/DataTypes.h" +#include "shad/Graph.h" +#include "shad/GraphTypes.h" + +namespace shad { + +/** + * TODO(hc): This is a shared-memory version. + * Later, a distributed-memory version in libgluon will reuse this code. + */ +template +class ShadGraphConverter { + +public: + ShadGraphConverter() : + nodeDataBuffer(nullptr) {} + + ~ShadGraphConverter() { + // BufferedGraph holds these arrays. + outIndexBuffer = nullptr; + nodeDataBuffer = nullptr; + edgeDestBuffer = nullptr; + edgeDataBuffer = nullptr; + } + + /** + * @brief Flush a graph topology to a file for debugging. 
+ */ + void flushGraphTopology() { + std::ofstream fp("shad_graph.out"); + for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { + uint64_t key = this->verticeIdKeyMapping[i]; + Vertex v = this->vertices[key]; + fp << "node " << i << ", type: " << to_underlying(v.type) << "\n"; + auto edgeRange = this->edges.equal_range(key); + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + Vertex dst = this->vertices[edge.dst]; + fp << "\t edge dst " << dst.id << ", type: " << + to_underlying(edge.type) << "\n"; + } + } + fp.close(); + } + + /** + * @brief Read a input graph file and inspect the number of nodes and edges. + * @detail In order to construct a dense LC_CSR_Graph, we need to know how + * many edges and nodes exist. This method reads one line by one line, and + * counts those information. + * Note that this method assumes that the types of {"Person", "ForumEvent", + * "Forum", "Publication", "Topic"} are nodes, and the types of + * {"SALE", "Author", "Includes", "HasTopic", "HasOrg"} are edges. + * + * @param filename file name to read + * @param numNodes number of nodes that this method reads + * @param numEdges number of edges that this method reads + */ + void InspectGraph(const std::string& filename, size_t* numNodes, + size_t* numEdges) { + // TODO(hc): Get the number of nodes and edges from file + // For example, it reads {SALE, Author, Includes, HasTopic, HasOrg} as + // edges. So we just count how many they exist in the file. + + std::string line; + std::ifstream file(filename); + if (!file.is_open()) { + std::cerr << "Cannot open file " << filename << "\n"; + exit(-1); + } + while (!file.eof()) { + getline(file, line); + // Skip comments. + if (line[0] == '#') continue; + // Delimiter and # tokens set for WMD data file. + std::vector tokens = splitTokens(line, ',', 10); + + if (this->isTokenNodeType(tokens[0])) { + ++(*numNodes); + } else if (this->isTokenEdgeType(tokens[0])) { + *numEdges += 2; + } + } + + std::cout << "Number of nodes:" << *numNodes << ", number of edges:" << + *numEdges << "\n"; + } + + /** + * @brief Construct a buffered graph from existing arrays constructed + * by constructNodeArrays() and constructEdgeArrays(). + * + * @param numGlobalNodes The number of global nodes + * @param numGlobalEdges The number of global edges + * @param nodeBegin Global node ID of the first local node + * @param nodeEnd (Global node ID of the last local node) + 1 + * @param edgeBegin Global edge ID of the first local edge + * @param edgeEnd (Global edge ID of the last local edge) + 1 + * @param bufferedGraph Buffered graph for CuSP + */ + void constructBufferedGraph( + uint64_t numGlobalNodes, uint64_t numGlobalEdges, + uint32_t nodeBegin, uint32_t nodeEnd, + uint64_t edgeBegin, uint64_t edgeEnd, + [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + // TODO(hc): Each of these functions first construct graphs in the SHAD + // format as this file is written in not binary, but string, and also + // nodes or edges are not sorted. 
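// ---------------------------------------------------------------------------
// Quick sanity example (hypothetical input, not part of this patch) for
// InspectGraph() above: a file with 3 "Person" rows, 1 "Forum" row and
// 2 "Sale" rows yields numNodes == 4 and numEdges == 4, because every edge
// record is counted twice -- once for itself and once for the reverse edge
// that readSHADFile() later inserts to symmetrize the graph.
// ---------------------------------------------------------------------------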
So, until we preprocess the input graph + // file, we should first read it in memory, and reconstruct this to Galois + // compatible + + uint32_t numLocalNodes = nodeEnd - nodeBegin; + uint64_t numLocalEdges = edgeEnd - edgeBegin; + + bufferedGraph->constructFrom( + outIndexBuffer, edgeDestBuffer, edgeDataBuffer, + numGlobalNodes, numGlobalEdges, numLocalNodes, numLocalEdges, + nodeBegin, edgeBegin); +#if 0 + TODO(hc): This verification should be fixed since it tests + a shared-memory execution that one host loads the whole + graph. It should not work on distributed-memory machine + since a CSR graph should be partitioned but tepmorary + maps reading and holding SHAD graphs are for global graph. +#ifndef NDEBUG + std::cout << "CSR verification starts.." << std::endl << std::flush; + this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, + edgeDestBuffer, edgeDataBuffer); + std::cout << "CSR verification starts.. [done]" << std::endl << std::flush; +#endif +#endif + // TODO(hc): Construct `buffer_graph`. + } + + /** + * @brief Read SHAD graph file and construct in-memory buffer SHAD graph. + * + * @param filename SHAD graph file name + */ + // TODO(hc): We can assign a disjointed range of file for each host. + // For now, let all hosts read the whole file. + void readSHADFile( + const std::string& filename, uint64_t* numGlobalNodes, + uint64_t *numGlobalEdges) { + std::ifstream graphFile(filename.c_str()); + uint64_t vertexId{0}; + std::string line; + uint64_t numNodes{0}, numEdges{0}; + // TODO(hc): We can parallelize it by assigning disjointed + // ranges with some inspection. + // But this would be the future work as + while (!graphFile.eof()) { + getline(graphFile, line); + // Skip comments. + if (line[0] == '#') continue; + // Delimiter and # tokens set for WMD data file. 
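// ---------------------------------------------------------------------------
// Note on the tokenizer used here (splitTokens(), defined further below in
// this class): it returns a fixed-size vector -- 10 entries for these WMD
// rows -- and keeps empty columns as empty strings, which is what lets the
// branches below test tokens[3]/tokens[4]/tokens[5] to tell Forum, ForumEvent
// and Publication endpoints apart. For a hypothetical row:
//
//   splitTokens("HasTopic,,,77,,,42,,,", ',', 10)
//     -> {"HasTopic", "", "", "77", "", "", "42", "", "", ""}
// ---------------------------------------------------------------------------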
+ std::vector tokens = splitTokens(line, ',', 10); + + if (tokens[0] == "Person") { + insertSHADVertex(ENCODE(tokens[1]), + TYPES::PERSON, vertexId); + ++numNodes; + } else if (tokens[0] == "ForumEvent") { + insertSHADVertex(ENCODE(tokens[4]), + TYPES::FORUMEVENT, vertexId); + ++numNodes; + } else if (tokens[0] == "Forum") { + insertSHADVertex(ENCODE(tokens[3]), + TYPES::FORUM, vertexId); + ++numNodes; + } else if (tokens[0] == "Publication") { + insertSHADVertex(ENCODE(tokens[5]), + TYPES::PUBLICATION, vertexId); + ++numNodes; + } else if (tokens[0] == "Topic") { + insertSHADVertex(ENCODE(tokens[6]), + TYPES::TOPIC, vertexId); + ++numNodes; + } else if (tokens[0] == "Sale") { + Edge sale(tokens); + insertSHADEdge(sale.src, sale); + + Edge purchase = sale; + purchase.type = TYPES::PURCHASE; + std::swap(purchase.src, purchase.dst); + insertSHADEdge(purchase.src, purchase); + numEdges += 2; + } else if (tokens[0] == "Author") { + Edge authors(tokens); + insertSHADEdge(authors.src, authors); + + Edge writtenBY = authors; + writtenBY.type = TYPES::WRITTENBY; + std::swap(writtenBY.src, writtenBY.dst); + std::swap(writtenBY.src_type, writtenBY.dst_type); + insertSHADEdge(writtenBY.src, writtenBY); + numEdges += 2; + } else if (tokens[0] == "Includes") { + Edge includes(tokens); + insertSHADEdge(includes.src, includes); + + Edge includedIN = includes; + includedIN.type = TYPES::INCLUDEDIN; + std::swap(includedIN.src, includedIN.dst); + std::swap(includedIN.src_type, includedIN.dst_type); + insertSHADEdge(includedIN.src, includedIN); + numEdges += 2; + } else if (tokens[0] == "HasTopic") { + Edge hasTopic(tokens); + insertSHADEdge(hasTopic.src, hasTopic); + + Edge topicIN = hasTopic; + topicIN.type = TYPES::TOPICIN; + std::swap(topicIN.src, topicIN.dst); + std::swap(topicIN.src_type, topicIN.dst_type); + insertSHADEdge(topicIN.src, topicIN); + numEdges += 2; + } else if (tokens[0] == "HasOrg") { + Edge hasOrg(tokens); + insertSHADEdge(hasOrg.src, hasOrg); + + Edge orgIN = hasOrg; + orgIN.type = TYPES::ORGIN; + std::swap(orgIN.src, orgIN.dst); + std::swap(orgIN.src_type, orgIN.dst_type); + insertSHADEdge(orgIN.src, orgIN); + numEdges += 2; + } + } + + // After the above loop, vertices and edges are complete. + this->CountNumEdgesForEachVertex(numNodes, numEdges); + *numGlobalNodes = numNodes; + *numGlobalEdges = numEdges; + +#ifndef NDEBUG + this->VerifySHADGraphRead(filename); +#endif + } + + /** + * @brief Return node data array. + * Note that this can be either of global graph or local graph. + */ + uint64_t* getNodeDataBuffer() { + return nodeDataBuffer; + } + + /** + * @brief Return node outgoing edge index array + * Note that this can be either of global graph or local graph. + */ + uint64_t* getOutIndexBuffer() { + return outIndexBuffer; + } + + /** + * @brief Construct vertex outgoing edge range buffer and + * vertex data buffer. + * + * @detail Extract local vertices' outgoing edge ranges and + * data from a temprory buffer of vertex map that is read and constructed + * from a SHAD CSV graph file. Note that these arrays are for local graph + * partition and their indices should be corresponding to local node ids. + * + * @param nodeBegin Global node ID of the first local node + * @param nodeEnd (Global node ID of the last local node + 1) + * @param numLocalNodes The number of local nodes + * + */ + void constructNodeArrays( + uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { + // 1) Construct an edge index array (size == number of nodes). 
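// ---------------------------------------------------------------------------
// Worked example (illustrative, not part of this patch) of the out-index
// construction that follows: the array is first filled with per-vertex
// degrees and then turned into an inclusive prefix sum, so local degrees
// {2, 0, 3, 1} become outIndexBuffer = {2, 2, 5, 6}, and vertex i owns the
// edge slots [outIndexBuffer[i-1], outIndexBuffer[i]) (lower bound 0 when
// i == 0). When the stored offsets are global edge ids, the
// constructEdgeArrays() overloads below subtract the partition's first
// global edge id (edgeBegin) to map them back into edgeDestBuffer.
// ---------------------------------------------------------------------------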
+ this->outIndexBuffer = new uint64_t[numLocalNodes]; + this->nodeDataBuffer = new uint64_t[numLocalNodes]; + + // TODO(hc): for now, only consider a single host, but need to add offset later. + galois::do_all(galois::iterate(this->vertices), + [&](auto element) { + Vertex& vertex = element.second; + uint64_t vertexId = vertex.id; + if (vertexId >= nodeBegin && vertexId < nodeEnd) { + this->outIndexBuffer[vertexId - nodeBegin] = + vertex.getNumEdges(); + // Fill vertex data too; This assumes that a SHAD graph + // has a type, which is considered as a vertex data. + this->nodeDataBuffer[vertexId - nodeBegin] = + this->to_underlying(vertex.type); + } + }); + // 2) Perform parallel prefix sum to finalize outgoing edge index + // array construction. + galois::ParallelSTL::partial_sum( + outIndexBuffer, &(outIndexBuffer[numLocalNodes]), + outIndexBuffer); + } + + /** + * @brief Construct edge destination and data arrays. + * + * @detail Extract local edge destination and data from a + * temprory buffer of edge map that is read and constructed + * from a SHAD CSV graph file. Note that these arrays are for local graph + * partition and their indices should be corresponding to local node ids. + * + * @tparam T Edge data type; if this is not void, edge data array is + * constructed + * + * @param nodeBegin Global node ID of the first local node + * @param edgeBegin Global edge ID of the first local edge + * @param numLocalNodes The number of local nodes + * @param numLocalEdges The number of local edges + * + */ + template >* = nullptr> + void constructEdgeArrays( + uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, + uint64_t numLocalEdges) { + this->edgeDestBuffer = new uint32_t[numLocalEdges]; + this->edgeDataBuffer = new EdgeDataTy[numLocalEdges]; + std::vector edgeIndexPointers(numLocalNodes, 0); + galois::on_each([&](uint32_t tid, uint32_t numThreads) { + // 1) Find disjointed node range for each thread. + auto thread_work_range = + galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); + // 2) Each thread iterates the whole edges. + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertexId = vertex.id; + // 3) Each thread fills edge destination for the assigned nodes. + if (srcVertexId >= thread_work_range.first + nodeBegin && + srcVertexId < thread_work_range.second + nodeBegin) { + uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; + // OutIndexBuffer now contains global edge range. + // So we need to subtract edge offset to get the local edge id. + uint64_t nodeBaseOffset = + ((srcVertexId - nodeBegin) == 0)? + 0 : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; + edgeDestBuffer[edgeIdx + nodeBaseOffset] = + this->vertices[edgeElem.second.dst].id; + edgeDataBuffer[edgeIdx + nodeBaseOffset] = + to_underlying(edgeElem.second.type); + } + } + }); + // Or inspector/executor model + // But that might be more expensive. + } + + /** + * @brief Construct edge destination array + * + * @detail Extract local edge destination from a + * temprory buffer of edge map that is read and constructed + * from a SHAD CSV graph file. Note that this array is for local graph + * partition and their indices should be corresponding to local node ids. 
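// ---------------------------------------------------------------------------
// Note (illustrative, not part of this patch) on the threading pattern used
// by constructEdgeArrays() above, its void-edge-data overload that follows,
// and CountNumEdgesForEachVertex() further below: every thread scans the
// whole edge multimap, but galois::block_range() hands each thread a
// disjoint, contiguous slice of the local vertex ids, e.g. for 8 nodes and
// 4 threads
//
//   tid 0 -> [0, 2)   tid 1 -> [2, 4)   tid 2 -> [4, 6)   tid 3 -> [6, 8)
//
// so no two threads ever write the same outIndexBuffer/edgeDestBuffer slots.
// The exact split points for uneven divisions are an implementation detail
// of galois::block_range().
// ---------------------------------------------------------------------------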
+ * + * @tparam T Edge data type; This function is enabled when + * edge data type is void + * + * @param nodeBegin Global node ID of the first local node + * @param edgeBegin Global edge ID of the first local edge + * @param numLocalNodes The number of local nodes + * @param numLocalEdges The number of local edges + * + */ + template >* = nullptr> + void constructEdgeArrays( + uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, + uint64_t numLocalEdges) { + edgeDestBuffer = new uint32_t[numLocalEdges]; + std::vector edgeIndexPointers(numLocalNodes, 0); + galois::on_each([&](uint32_t tid, uint32_t numThreads) { + // 1) Find disjointed node range for each thread. + auto thread_work_range = + galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); + // 2) Each thread iterates the whole edges. + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertexId = vertex.id; + // 3) Each thread fills edge destination for the assigned nodes. + if (srcVertexId >= thread_work_range.first + nodeBegin && + srcVertexId < thread_work_range.second + nodeBegin) { + uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; + uint64_t nodeBaseOffset = + ((srcVertexId - nodeBegin)== 0)? + 0 : outIndexBuffer[srcVertexId - 1] - edgeBegin; + edgeDestBuffer[edgeIdx + nodeBaseOffset] = + this->vertices[edgeElem.second.dst].id; + } + } + }); + // Or inspector/executor model + // But that might be more expensive. + } + + /** + * @brief Extract outgoing edge index ranges for local vertices + * from the global outgoing edge index range array. + * + * @param nodeBegin Node global id of the first local node + * @param nodeEnd (Node global id for the last local node + 1) + */ + void extractLocalOutIndexArray(uint32_t nodeBegin, uint32_t nodeEnd) { + + uint64_t* newOutIndexBuffer = new uint64_t[nodeEnd - nodeBegin]; + galois::do_all(galois::iterate(nodeBegin, nodeEnd), + [&](uint32_t n) { + newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; + } ); + delete[] this->outIndexBuffer; + this->outIndexBuffer = newOutIndexBuffer; + } + + /** + * @brief Check if a type of a node having the passed id is + * equal to the one in a temporary vertex map constructed from + * SHAD graph file. + * + * @param id Node global id to check + * @param type Node type + * + * @return True if passed information matches to the one in + * a temporary vertex map + */ + bool checkNode(uint64_t id, uint64_t type) { + uint64_t key = this->verticeIdKeyMapping[id]; + Vertex& vertex = this->vertices[key]; + return (this->to_underlying(vertex.type) == type); + } + + /** + * @brief Check if a type of a edge having the passed id is + * equal to the one in a temporary edge map constructed from + * SHAD graph file. 
+ * + * @param snid Global node ID of the source node of an edge + * @param dnid Global node ID of the destination node of an edge + * @param type Edge type + * @param type Edge type + * + * @return True if passed information matches to the one in + * a temporary edge map + */ + bool checkEdge(uint64_t snid, uint64_t dnid, + uint64_t eid, uint64_t type) { + uint64_t skey = this->verticeIdKeyMapping[snid]; + uint64_t dkey = this->verticeIdKeyMapping[dnid]; + + Vertex& vertex = this->vertices[skey]; + auto edgeRange = this->edges.equal_range(skey); + uint64_t eidx{0}; + Edge edge; + bool found{false}; + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei, ++eidx) { + edge = ei->second; + // Multiple edges having the same source and destination could + // exist. So we repeat until find the one that has the same type to + // the passed one. + if (this->vertices[edge.dst].id == dnid && + this->to_underlying(edge.type) == type) { + found = true; + break; + } + } + return found; + } + +private: + /** + * @brief Return true if a token is a node type. + * + * @param token Token parsed from a graph file to check + */ + bool isTokenNodeType(std::string token) { + if (token == "Person" || token == "ForumEvent" || token == "Forum" || + token == "Publication" || token == "Topic") { + return true; + } else { + return false; + } + } + + /** + * @brief Return true if a token is an edge type. + * + * @param token Token parsed from a graph file to check + */ + bool isTokenEdgeType(std::string token) { + if (token == "Sale" || token == "Author" || token == "Includes" || + token == "HasTopic" || token == "HasOrg") { + return true; + } else { + return false; + } + } + + std::vector splitTokens( + std::string& line, char delim, uint64_t size = 0) { + uint64_t ndx = 0, start = 0, end = 0; + std::vector tokens(size); + + for ( ; end < line.length(); end ++) { + if ((line[end] == delim) || (line[end] == '\n')) { + tokens[ndx] = line.substr(start, end - start); + start = end + 1; + ndx ++; + } + } + + // Flush the last token. + tokens[size - 1] = line.substr(start, end - start); + return tokens; + } + + void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t numEdges) { + //galois::on_each([this, numNodes, numEdges]( + galois::on_each([&]( + uint32_t tid, uint32_t numThreads) { + // Each thread is assigned disjointed range of nodes. + // Each thread iterates edges and accumulates edges for only + // the nodes assigned to that. + auto thread_work_range = + galois::block_range(uint64_t{0}, numNodes, tid, numThreads); + for (auto edgeElem : this->edges) { + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; + if (vertex.id >= thread_work_range.first && + vertex.id < thread_work_range.second) { + vertex.incrNumEdges(); + } + } + }); + +#ifndef NDEBUG + this->VerifyNumEdgesPerVertex(numEdges); +#endif + } + + /** + * @brief Insert SHAD vertex to a vertex map. + * + * @param key SHAD token key + * @param type SHAD vertex type + * @param id Vertex id; Local vertex id until it is synchronized + */ + void insertSHADVertex(const uint64_t& key, const TYPES& type, uint64_t& id) { + auto found = this->vertices.find(key); + if (found == this->vertices.end()) { + this->vertices[key] = Vertex(id, type, key); + this->verticeIdKeyMapping[id] = key; + id++; + } else { + std::cerr << "[error] There is no reason to have duplicated vertices\n"; + } + } + + /** + * @brief Insert SHAD edge to a edge map. 
+ * @detail Edges + * + * @param vertexKey Source vertex's SHAD token key + * @param edge Adjacent edge of the vertex + */ + void insertSHADEdge(const uint64_t& vertexKey, const Edge& edge) { + this->edges.insert({vertexKey, edge}); + } + + /* + uint64_t edge_begin(uint32_t n) { + return this->verticeIdKeyMapping[n] + */ + +#ifndef NDEBUG + /** + * @brief Verify in-meomry SHAD graph. + * + * @param filename SHAD graph file name + */ + // TODO(hc): This function can be parallelized but + // let me stick with sequential execution until the whole + // implementation works correctly. + void VerifySHADGraphRead(const std::string& filename) { + size_t numNodes{0}, numEdges{0}; + this->InspectGraph(filename, &numNodes, &numEdges); + // 1) Check the number of vertices and edges. + assert(this->vertices.size() == numNodes); + // Note that edges are doubled to symmetrize a graph. + assert(this->edges.size() == numEdges); + for ([[maybe_unused]] auto& element : this->edges) { + // 2) Check if a source node key of the edges map is equal to a source + // of an edge. + assert(element.first == element.second.src); + // 3) Check if vertex information in the edges map is equal to the one + // in the vertex map. + assert(element.second.src_type == + this->vertices[element.second.src].type); + assert(element.second.dst_type == + this->vertices[element.second.dst].type); + } + } + + void VerifyNumEdgesPerVertex([[maybe_unused]] uint64_t numEdges) { + // 4) Check if the total number of edges of each vertex is equal to + // the number of total edges counted during inspection. + uint64_t numAccumulatedEdges{0}; + for (auto& element : this->vertices) { + numAccumulatedEdges += element.second.getNumEdges(); + } + assert(numAccumulatedEdges == numEdges); + } + + void VerifyCSRConstruction( + [[maybe_unused]] uint64_t* outIndexBuffer, + [[maybe_unused]] uint64_t* nodeDataBuffer, + [[maybe_unused]] uint32_t* edgeDestBuffer, + [[maybe_unused]] void* edgeDataBuffer) {} + + template >* = nullptr> + void VerifyCSRConstruction( + uint64_t* outIndexBuffer, [[maybe_unused]] uint64_t* nodeDataBuffer, + uint32_t* edgeDestBuffer, EdgeDataTy* edgeDataBuffer) { + // 1) Iterate edge index array. + // 2) Compare each verteices' edge range with SHAD vertex + for (size_t i = 0; i < this->vertices.size(); ++i) { + Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; + uint64_t srcShadKey = srcV.shadKey; + assert(this->verticeIdKeyMapping[i] == srcV.shadKey); + uint64_t edgeBegin = (i == 0)? 0 : outIndexBuffer[i - 1]; + uint64_t edgeEnd = outIndexBuffer[i]; + assert(srcV.numEdges == edgeEnd - edgeBegin); + assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i])); + assert(srcV.id == i); + galois::do_all(galois::iterate(edgeBegin, edgeEnd), + [&](size_t j) { + uint32_t dstV = edgeDestBuffer[j]; + [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; + + [[maybe_unused]] bool found{false}; + auto edgeRange = this->edges.equal_range(srcShadKey); + size_t cnt{0}; + for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + if (this->vertices[edge.dst].id == dstV) { + // Multiple edges between vertices are possible. 
+ if (this->to_underlying(edge.type) == int(edgeData)) { + assert(this->vertices[edge.src].id == i); + assert(this->vertices[edge.src].id == srcV.id); + found = true; + } + } + cnt++; + } + assert((edgeEnd - edgeBegin) == cnt); + /* + for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { + std::cout << srcId << " vs " << i->first << "\n"; + } + */ + assert(found); + }, galois::steal()); + } + } +#endif + + /** + * @brief Cast a type to an underlying type; in case of scoped enum, + * this should be an integral type. + * + * @param e + */ + template + constexpr typename std::underlying_type::type to_underlying(E e) noexcept { + return static_cast::type>(e); + } + + // This holds the whole global vertices and their + // information such as its type. A key is globla node ID, and its value + // is the information. + std::unordered_map vertices; + // This holds the whole global edges and their information + // such as its type. The key is global source node ID, and its + // value is an edge iterator pointing to adjacent edges to the source. + std::unordered_multimap edges; + // Key is global node id and value is corresponding key of that node + std::unordered_map verticeIdKeyMapping; + // TODO(hc): Always assume uint64_t node data type + uint64_t* nodeDataBuffer; + uint64_t* outIndexBuffer; + uint32_t* edgeDestBuffer; + EdgeDataTy* edgeDataBuffer; +}; + +}; // shad namespace + +#endif diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 22a18c7fdf..98df493175 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -15,15 +15,20 @@ set(sources src/layers/SoftmaxLayer.cpp ) +## TODO(hc): Note that these libraries should be hard-coded +## based on your own system. +## These should be automatic library linking. set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) +set(INTEL_COMPILER_LIBRARIES /home/hochan/intel/oneapi/compiler/2023.1.0/linux/compiler/lib/intel64_lin) set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn galois_shmem) +target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn ${INTEL_LIBS}) +target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index fff1d03ed4..447facef39 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -44,10 +44,11 @@ class GNNGraph { // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label); + bool has_single_class_label, bool useShad = false); //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, - GNNPartitionScheme partition_scheme, bool has_single_class_label); + GNNPartitionScheme partition_scheme, bool has_single_class_label, + bool useShad = false); //! 
Returns host id size_t host_id() const { return host_id_; } diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 4a83753670..8fc68ea193 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -11,7 +11,8 @@ namespace { std::unique_ptr LoadPartition(const std::string& input_directory, const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme) { + galois::graphs::GNNPartitionScheme partition_scheme, + bool useShad) { // XXX input path std::string input_file = input_directory + dataset_name + ".csgr"; GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); @@ -20,13 +21,13 @@ LoadPartition(const std::string& input_directory, switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, true, "", "", false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -65,14 +66,16 @@ unsigned layer_number_to_sync; galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) + bool has_single_class_label, + bool useShad) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label) {} + has_single_class_label, useShad) {} galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label) + bool has_single_class_label, + bool useShad) : input_directory_(input_directory) { GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); @@ -84,7 +87,7 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, std::string("] "); // load partition partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme); + LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); // reverse edges partitioned_graph_->ConstructIncomingEdges(); diff --git a/lonestar/analytics/distributed/CMakeLists.txt b/lonestar/analytics/distributed/CMakeLists.txt index fa3046c679..546937cbda 100644 --- a/lonestar/analytics/distributed/CMakeLists.txt +++ b/lonestar/analytics/distributed/CMakeLists.txt @@ -6,6 +6,5 @@ add_subdirectory(connected-components) add_subdirectory(k-core) add_subdirectory(pagerank) add_subdirectory(partition) -add_subdirectory(matrixcompletion) add_subdirectory(sssp) add_subdirectory(triangle-counting) diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index ac9cf02060..87b12de63d 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -107,6 +107,8 @@ namespace cll = llvm::cl; extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; +//! 
true if input graph file format is SHAD WMD +extern cll::opt useShad; // @todo command line argument for read balancing across hosts @@ -130,27 +132,26 @@ template std::unique_ptr> constructSymmetricGraph(std::vector&) { std::string inputFile = deepgalois::path + dataset + ".csgr"; galois::gInfo("File to read is ", inputFile); - switch (partitionScheme) { case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case GNN_OEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); case GNN_CVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index e18340fe82..611a7c3e50 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -41,3 +41,8 @@ cll::opt partitionScheme( clEnumValN(GNN_CVC, "g-cvc", "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); + +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); diff --git a/lonestar/libdistbench/include/DistBench/Input.h b/lonestar/libdistbench/include/DistBench/Input.h index 088bc82444..396b01a983 100644 --- a/lonestar/libdistbench/include/DistBench/Input.h +++ b/lonestar/libdistbench/include/DistBench/Input.h @@ -99,6 +99,8 @@ extern cll::opt inputFileTranspose; extern cll::opt symmetricGraph; //! partitioning scheme to use extern cll::opt partitionScheme; +//! true if input graph file format is SHAD WMD +extern cll::opt useShad; ////! path to vertex id map for custom edge cut // extern cll::opt vertexIDMapFileName; //! 
true if you want to read graph structure from a file @@ -143,18 +145,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case OEC: case IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, inputFileTranspose, - mastersFile); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFileTranspose, mastersFile); case HOVC: case HIVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); case CART_VCUT: case CART_VCUT_IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); // case CEC: @@ -164,18 +166,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: case GINGER_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad ,true, inputFileTranspose); case FENNEL_O: case FENNEL_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, inputFileTranspose); default: GALOIS_DIE("partition scheme specified is invalid: ", partitionScheme); @@ -204,19 +206,19 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { auto& net = galois::runtime::getSystemNetworkInterface(); if (net.Num == 1) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); } switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("incoming edge cut requires transpose graph"); @@ -225,12 +227,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("incoming hybrid cut requires transpose graph"); @@ -239,13 +241,13 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("cvc incoming cut requires 
transpose graph"); @@ -258,12 +260,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -272,12 +274,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -286,7 +288,7 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, inputFileTranspose); default: @@ -318,7 +320,7 @@ DistGraphPtr constructGraph(std::vector&) { if (net.Num == 1) { if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { fprintf(stderr, "WARNING: Loading transpose graph through in-memory " @@ -326,7 +328,7 @@ DistGraphPtr constructGraph(std::vector&) { "graph with -graphTranspose to avoid unnecessary " "overhead.\n"); return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); } } @@ -334,12 +336,12 @@ DistGraphPtr constructGraph(std::vector&) { switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("iec requires transpose graph"); @@ -348,12 +350,12 @@ DistGraphPtr constructGraph(std::vector&) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("hivc requires transpose graph"); @@ -362,13 +364,14 @@ DistGraphPtr constructGraph(std::vector&) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph(inputFile, 
galois::CUSP_CSC, - galois::CUSP_CSC, false, + galois::CUSP_CSC, useShad, + false, inputFileTranspose); } else { GALOIS_DIE("cvc requires transpose graph"); @@ -377,12 +380,12 @@ DistGraphPtr constructGraph(std::vector&) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -391,12 +394,12 @@ DistGraphPtr constructGraph(std::vector&) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -405,7 +408,7 @@ DistGraphPtr constructGraph(std::vector&) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, inputFileTranspose); default: diff --git a/lonestar/libdistbench/src/Input.cpp b/lonestar/libdistbench/src/Input.cpp index 495f68c0c5..04321bd14e 100644 --- a/lonestar/libdistbench/src/Input.cpp +++ b/lonestar/libdistbench/src/Input.cpp @@ -60,6 +60,11 @@ cll::opt partitionScheme( "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); + cll::opt readFromFile("readFromFile", cll::desc("Set this flag if graph is to be " "constructed from file (file must be " diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 7719340224..d1685b8e2b 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,6 +26,11 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); +cll::opt useShad("useShad", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), + cll::init(false)); + llvm::cl::opt num_layers( "numLayers", cll::desc( @@ -341,7 +346,8 @@ std::vector CreateFanOutVector() { std::unique_ptr InitializeGraphNeuralNetwork() { // partition/load graph auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, !multiclass_labels); + input_directory, input_name, partition_scheme, !multiclass_labels, + useShad); // create layer types vector std::vector layer_types = CreateLayerTypesVector(); diff --git a/lonestar/scientific/cpu/longestedge/test/catch.hpp b/lonestar/scientific/cpu/longestedge/test/catch.hpp index 6c1756a6ce..841b9c8128 100644 --- a/lonestar/scientific/cpu/longestedge/test/catch.hpp +++ b/lonestar/scientific/cpu/longestedge/test/catch.hpp @@ -10723,6 +10723,13 @@ PVOID FatalConditionHandler::exceptionHandlerHandle = nullptr; #elif defined( CATCH_CONFIG_POSIX_SIGNALS ) +// MINSIGSTKSZ is not constexpr in the recent Linux, and so, +// requires manual declaration for backward compatibility. 
+// This number is from +// https://stackoverflow.com/questions/71454588/minsigstksz-error-after-update-in-my-manjaro-linux` +#undef MINSIGSTKSZ +#define MINSIGSTKSZ 16384 + namespace Catch { struct SignalDefs { From d9a1c69839a74d0ffd8cfba2d854c1199e87366c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 2 Aug 2023 14:17:18 -0500 Subject: [PATCH 603/660] Use a separate vector to specify sampled nodes instead of a node label to extend its usage (#4) Co-authored-by: Hochan Lee --- libgnn/include/galois/graphs/GNNGraph.h | 14 ++++++++------ libgnn/src/graphs/GNNGraph.cpp | 2 ++ 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 447facef39..6dbfdfbcf1 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -582,29 +582,29 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! Makes a node "sampled"; used for debugging/testing - void SetSampledNode(size_t node) { partitioned_graph_->getData(node) = 1; } + void SetSampledNode(size_t node) { mark_sampled_nodes_[node] = 1; } //! Makes a node "not sampled"; used for debugging/testing - void UnsetSampledNode(size_t node) { partitioned_graph_->getData(node) = 0; } + void UnsetSampledNode(size_t node) { mark_sampled_nodes_[node] = 0; } //! Returns true if a particular node is currently considered "in" a sampled //! graph bool IsInSampledGraph(const NodeIterator& ni) const { // TODO(loc) GPU assert(*ni < size()); - return partitioned_graph_->getData(*ni); + return mark_sampled_nodes_[*ni]; } bool IsInSampledGraph(size_t node_id) const { // TODO(loc) GPU assert(node_id < size()); - return partitioned_graph_->getData(node_id); + return mark_sampled_nodes_[node_id]; } bool IsInSampledGraphSubgraph(size_t node_id) const { // TODO(loc) GPU assert(node_id < size()); if (use_subgraph_) { - return partitioned_graph_->getData(ConvertToLID(node_id)); + return mark_sampled_nodes_[ConvertToLID(node_id)]; } else { - return partitioned_graph_->getData(node_id); + return mark_sampled_nodes_[node_id]; } } @@ -850,6 +850,8 @@ class GNNGraph { DGAccumulator local_false_positive_; DGAccumulator local_false_negative_; + std::vector mark_sampled_nodes_; + bool use_timer_{true}; }; diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index 8fc68ea193..b0ed03d34c 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -90,6 +90,8 @@ galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); // reverse edges partitioned_graph_->ConstructIncomingEdges(); + // mark a node if it is sampled + mark_sampled_nodes_.resize(partitioned_graph_->size()); galois::gInfo(host_prefix_, "Number of local proxies is ", partitioned_graph_->size()); From 01cdb4ccfecebfa8ec32aa23663662b2ec9481f4 Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 30 Aug 2023 21:51:54 -0500 Subject: [PATCH 604/660] Templatizing graphs and layers, feature construction and training/testing/validation vertices marking This commit contains three updates. First, it templatizes graphs and layers to support various node and edge data types, other than char and void. Second, this commit performs histogram-based feature construction like AGILE does. 
Third, this commit selects training/testing/validation vertices like AGILE does. This commit passed a correctness check on data10.csv, 4hosts, and CVC partitioning policy. --- .../include/galois/graphs/CuSPPartitioner.h | 12 +- .../include/galois/graphs/DistributedGraph.h | 28 +- libcusp/include/galois/graphs/NewGeneric.h | 186 +- libcusp/test/shad-dist-graph.cpp | 22 +- libgalois/include/shad/ShadGraphConverter.h | 54 +- .../include/galois/graphs/GluonSubstrate.h | 2 +- .../include/galois/runtime/SyncStructures.h | 8 +- libgnn/CMakeLists.txt | 11 +- libgnn/include/galois/GraphNeuralNetwork.h | 848 +++++++- .../galois/graphs/DegreeSyncStructures.h | 28 +- libgnn/include/galois/graphs/GNNGraph.h | 1882 ++++++++++++++++- libgnn/include/galois/graphs/GNNSubgraph.h | 456 +++- .../graphs/GraphAggregationSyncStructures.h | 161 +- libgnn/include/galois/layers/DenseLayer.h | 154 +- libgnn/include/galois/layers/GNNLayer.h | 563 ++++- .../galois/layers/GraphConvolutionalLayer.h | 485 ++++- libgnn/include/galois/layers/L2NormLayer.h | 133 +- libgnn/include/galois/layers/SAGELayer.h | 946 ++++++++- libgnn/include/galois/layers/SigmoidLayer.h | 123 +- libgnn/include/galois/layers/SoftmaxLayer.h | 156 +- libgnn/src/DistributedMinibatchTracker.cpp | 4 +- libgnn/src/GNNMath.cpp | 4 +- libgnn/src/GraphNeuralNetwork.cpp | 818 ------- libgnn/src/graphs/GNNGraph.cpp | 1539 +------------- libgnn/src/graphs/GNNSubgraph.cpp | 440 ---- libgnn/src/layers/DenseLayer.cpp | 144 -- libgnn/src/layers/GNNLayer.cpp | 557 ----- libgnn/src/layers/GraphConvolutionalLayer.cpp | 459 ---- libgnn/src/layers/L2NormLayer.cpp | 124 -- libgnn/src/layers/SAGELayer.cpp | 879 -------- libgnn/src/layers/SigmoidLayer.cpp | 113 - libgnn/src/layers/SoftmaxLayer.cpp | 139 -- libgnn/test/CMakeLists.txt | 2 + libgnn/test/accuracy-test.cpp | 6 +- libgnn/test/aggregate-sync-test.cpp | 20 +- libgnn/test/back-conv-test.cpp | 8 +- libgnn/test/convlayer-test.cpp | 16 +- libgnn/test/epoch-test.cpp | 6 +- libgnn/test/f1-test.cpp | 4 +- libgnn/test/gnnconstruct-test.cpp | 8 +- libgnn/test/gnnfb-test.cpp | 12 +- libgnn/test/gnngraph-test.cpp | 8 +- libgnn/test/gpu-adam-test.cpp | 4 +- libgnn/test/gpu-aggregate-sync-test.cpp | 12 +- libgnn/test/gpu-back-conv-test.cpp | 8 +- libgnn/test/gpu-convlayer-test.cpp | 16 +- libgnn/test/gpu-epoch-test.cpp | 6 +- libgnn/test/gpu-sage-layer-test.cpp | 12 +- libgnn/test/gpu-softmaxlayer-test.cpp | 6 +- libgnn/test/l2norm-layer-test.cpp | 6 +- libgnn/test/multilabel-epoch-test.cpp | 6 +- libgnn/test/multilabel-read.cpp | 4 +- libgnn/test/sage-layer-test.cpp | 12 +- libgnn/test/sample-bit-test.cpp | 4 +- libgnn/test/sample-test.cpp | 16 +- libgnn/test/sigmoidlayer-test.cpp | 6 +- libgnn/test/softmaxlayer-test.cpp | 6 +- lonestar/gnn/distributed/gcn/gcn-dist.cpp | 5 +- lonestar/gnn/include/DistributedGraphLoader.h | 12 +- lonestar/gnn/src/DistributedGraphLoader.cpp | 6 +- .../libdistbench/include/DistBench/Input.h | 64 +- .../libdistbench/include/DistBench/Output.h | 1 + lonestar/libdistbench/src/Input.cpp | 6 +- lonestar/libgnnbench/include/GNNBench/Input.h | 161 +- lonestar/libgnnbench/src/Input.cpp | 144 +- lonestar/libgnnbench/src/Start.cpp | 27 + .../shad-gnn/check_feature_construction.py | 51 + 67 files changed, 6328 insertions(+), 5841 deletions(-) delete mode 100644 libgnn/src/GraphNeuralNetwork.cpp delete mode 100644 libgnn/src/layers/SoftmaxLayer.cpp create mode 100644 scripts/shad-gnn/check_feature_construction.py diff --git a/libcusp/include/galois/graphs/CuSPPartitioner.h 
b/libcusp/include/galois/graphs/CuSPPartitioner.h index 6b7fef6dab..5541be426d 100644 --- a/libcusp/include/galois/graphs/CuSPPartitioner.h +++ b/libcusp/include/galois/graphs/CuSPPartitioner.h @@ -50,7 +50,7 @@ using DistGraphPtr = * to the partitioner * @param outputType Specifies the output format (CSR or CSC) that each * partition will be created in - * @param useShad "true" if the passed graph file format is a SHAD WMD graph + * @param useWMD "true" if the passed graph file format is a WMD graph * @param symmetricGraph This should be "true" if the passed in graphFile * is a symmetric graph * @param transposeGraphFile Transpose graph of graphFile in Galois binary @@ -84,8 +84,8 @@ template DistGraphPtr cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, - CUSP_GRAPH_TYPE outputType, bool useShad = false, - bool symmetricGraph = false, + CUSP_GRAPH_TYPE outputType, bool useWMD = false, + bool symmetricGraph = false, std::string transposeGraphFile = "", std::string masterBlockFile = "", bool cuspAsync = true, uint32_t cuspStateRounds = 100, @@ -128,13 +128,13 @@ cuspPartitionGraph(std::string graphFile, CUSP_GRAPH_TYPE inputType, } return std::make_unique( - inputToUse, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, useTranspose, - readPolicy, nodeWeight, edgeWeight, masterBlockFile); + inputToUse, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, + useTranspose, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } else { // symmetric graph path: assume the passed in graphFile is a symmetric // graph; output is also symmetric return std::make_unique( - graphFile, net.ID, net.Num, useShad, cuspAsync, cuspStateRounds, false, + graphFile, net.ID, net.Num, useWMD, cuspAsync, cuspStateRounds, false, readPolicy, nodeWeight, edgeWeight, masterBlockFile); } } diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 415afba33d..540b25e120 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -436,7 +436,7 @@ class DistGraph { // TODO(hc): auto r = galois::graphs::divideNodesBinarySearch( numGlobalNodes, numGlobalEdges, 0, edgeWeight, (id + d * numHosts), - numHosts * DecomposeFactor, outIndices, scalefactor); + numHosts * DecomposeFactor, outIndices, scalefactor); gid2host[id + d * numHosts].first = *(r.first.first); gid2host[id + d * numHosts].second = *(r.first.second); } @@ -504,9 +504,9 @@ class DistGraph { * @todo make this function work with decompose factor */ void computeMastersBalancedNodesAndEdges( - uint64_t numGlobalNodes, uint64_t numGlobalEdges, - uint64_t* outIndices, const std::vector& scalefactor, - uint32_t nodeWeight, uint32_t edgeWeight, unsigned) { + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint64_t* outIndices, + const std::vector& scalefactor, uint32_t nodeWeight, + uint32_t edgeWeight, unsigned) { if (nodeWeight == 0) { nodeWeight = numGlobalEdges / numGlobalNodes; // average degree } @@ -517,8 +517,8 @@ class DistGraph { auto& net = galois::runtime::getSystemNetworkInterface(); gid2host.resize(numHosts); auto r = galois::graphs::divideNodesBinarySearch( - numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, - id, numHosts, outIndices, scalefactor); + numGlobalNodes, numGlobalEdges, nodeWeight, edgeWeight, id, numHosts, + outIndices, scalefactor); gid2host[id].first = *r.first.first; gid2host[id].second = *r.first.second; for (unsigned h = 0; h < numHosts; ++h) { @@ -543,7 +543,6 @@ class DistGraph { 
increment_evilPhase(); } - protected: /** * Wrapper call that will call into more specific compute masters @@ -628,19 +627,17 @@ class DistGraph { // compute masters for all nodes switch (masters_distribution) { case BALANCED_MASTERS: - computeMastersBlockedNodes( - numGlobalNodes, scalefactor, DecomposeFactor); + computeMastersBlockedNodes(numGlobalNodes, scalefactor, DecomposeFactor); break; case BALANCED_MASTERS_AND_EDGES: - computeMastersBalancedNodesAndEdges( - numGlobalNodes, numGlobalEdges, outIndices, - scalefactor, nodeWeight, edgeWeight, DecomposeFactor); + computeMastersBalancedNodesAndEdges(numGlobalNodes, numGlobalEdges, + outIndices, scalefactor, nodeWeight, + edgeWeight, DecomposeFactor); break; case BALANCED_EDGES_OF_MASTERS: default: - computeMastersBalancedEdges( - numGlobalNodes, numGlobalEdges, outIndices, - scalefactor, edgeWeight, DecomposeFactor); + computeMastersBalancedEdges(numGlobalNodes, numGlobalEdges, outIndices, + scalefactor, edgeWeight, DecomposeFactor); break; } @@ -658,7 +655,6 @@ class DistGraph { return numNodes_to_divide; } - //! reader assignment from a file //! corresponds to master assignment if using an edge cut void readersFromFile(galois::graphs::OfflineGraph& g, std::string filename) { diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 49c96a965c..9fa37159f1 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -223,8 +223,8 @@ class NewDistGraphGeneric : public DistGraph { */ NewDistGraphGeneric( const std::string& filename, unsigned host, unsigned _numHosts, - bool useShad = false, bool cuspAsync = true, uint32_t stateRounds = 100, - bool transpose = false, + bool useWMD = false, bool cuspAsync = true, uint32_t stateRounds = 100, + bool transpose = false, galois::graphs::MASTERS_DISTRIBUTION md = BALANCED_EDGES_OF_MASTERS, uint32_t nodeWeight = 0, uint32_t edgeWeight = 0, std::string masterBlockFile = "", bool readFromFile = false, @@ -246,24 +246,36 @@ class NewDistGraphGeneric : public DistGraph { galois::graphs::OfflineGraph* offlineGraph{nullptr}; - shad::ShadGraphConverter shadConverter; + std::string host_prefix = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + + shad::ShadGraphConverter shadConverter; galois::graphs::BufferedGraph bufGraph; bufGraph.resetReadCounters(); std::vector dummy; // not actually getting masters, but getting assigned readers for nodes if (masterBlockFile == "") { - if (useShad) { - std::cout << "Construct a distributed graph from SHAD WMD format.\n"; + if (useWMD) { uint64_t numGlobalNodes{0}, numGlobalEdges{0}; + galois::gInfo(host_prefix, "Starts reading SHAD graph file"); // Read and load the whole SHAD WMD dataset to memory. // TODO(hc): Note that this reads the entire graph. // We will improve this to read partial graphs // on each host later. For now, the main focus is // to enable WMD dataset for the workflows. 
shadConverter.readSHADFile(filename, &numGlobalNodes, &numGlobalEdges); + galois::gInfo(host_prefix, "Completes reading SHAD graph file"); base_DistGraph::numGlobalNodes = numGlobalNodes; base_DistGraph::numGlobalEdges = numGlobalEdges; + + galois::gInfo(host_prefix, + "Read graph # nodes:", std::to_string(numGlobalNodes), + " # edges:", std::to_string(numGlobalEdges)); + galois::gInfo(host_prefix, "Starts node array construction from SHAD" + " graph"); // Construct node data/outgoing index range arrays // for a GLOBAL array, not a local array. // Later, parts for the local graph partition will be @@ -281,24 +293,23 @@ class NewDistGraphGeneric : public DistGraph { // local nodes. // TODO(hc): UT will improve and redesign this part to // get scalability. - shadConverter.constructNodeArrays( - 0, numGlobalNodes, numGlobalNodes); - + shadConverter.constructNodeArrays(0, numGlobalNodes, numGlobalNodes); + galois::gInfo(host_prefix, "Completes node array construction from SHAD" + " graph"); // Compute master proxies by using the number of global nodes // and edges. base_DistGraph::computeMasters( - md, base_DistGraph::numGlobalNodes, - base_DistGraph::numGlobalEdges, - shadConverter.getOutIndexBuffer(), dummy, nodeWeight, - edgeWeight); + md, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + shadConverter.getOutIndexBuffer(), dummy, nodeWeight, edgeWeight); } else { offlineGraph = new galois::graphs::OfflineGraph(filename); base_DistGraph::numGlobalNodes = offlineGraph->size(); base_DistGraph::numGlobalEdges = offlineGraph->sizeEdges(); - base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, edgeWeight); + base_DistGraph::computeMasters(md, *offlineGraph, dummy, nodeWeight, + edgeWeight); } } else { - if (useShad) { + if (useWMD) { GALOIS_DIE("SHAD graph format does not support master block file"); } galois::gInfo("Getting reader assignment from file"); @@ -317,11 +328,13 @@ class NewDistGraphGeneric : public DistGraph { if (!trainPoints.empty()) { std::vector testDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, *offlineGraph, trainPoints[0], trainPoints[1]); + base_DistGraph::numHosts, *offlineGraph, trainPoints[0], + trainPoints[1]); std::vector restDistribution = galois::graphs::determineUnitRangesFromPrefixSum( - base_DistGraph::numHosts, *offlineGraph, trainPoints[1], offlineGraph->size()); + base_DistGraph::numHosts, *offlineGraph, trainPoints[1], + offlineGraph->size()); // create global distribution of edges std::vector mappings(offlineGraph->size()); @@ -371,9 +384,9 @@ class NewDistGraphGeneric : public DistGraph { graphReadTimer.start(); uint64_t nodeBegin = base_DistGraph::gid2host[base_DistGraph::id].first; - uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; + uint64_t nodeEnd = base_DistGraph::gid2host[base_DistGraph::id].second; - if (!useShad) { + if (!useWMD) { // If the input graph is not SHAD WMD format, // construct a buffered graph from the file directly, as ordinary. typename galois::graphs::OfflineGraph::edge_iterator edgeBegin = @@ -384,40 +397,8 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); } else { - // Now construct arrays for in-memory CSR. - // In case of the node out-going edge range array and - // the node data array, it will extract parts corresponding to - // local graph paritition from the arrays holding the global - // array information. 
- // Edge destination and data arrays are constructed based on - // unrefined maps constructed from SHAD graph reading. - // NOTE that those arrays all store GLOBAL node ids. - // For example, edge destination array's size is equal - // to the number of local edges, but its destination ID is - // global node IDs, not local node IDs. - uint32_t numLocalNodes = nodeEnd - nodeBegin; - // So, this holds outgoing edge array of a whole (global) graph. - uint64_t *outIndexBuffer = shadConverter.getOutIndexBuffer(); - // Global edge id range assigned to the current host. - uint64_t edgeBegin = - (nodeBegin == 0)? 0 : outIndexBuffer[nodeBegin - 1]; - // This is the last local node's edge range end. - // So, [edgeBegin, edgeEnd) is for this current host. - uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; - // Extract node out-going range and data arrays of local nodes. - // From now on, those arrays store local node information - // as a dense memory representation. - shadConverter.extractLocalOutIndexArray( - nodeBegin, nodeEnd); - - uint64_t numLocalEdges = edgeEnd - edgeBegin; - shadConverter.constructEdgeArrays( - nodeBegin, edgeBegin, numLocalNodes, numLocalEdges); - // Construct a buffered graph that is used by CuSP to partition - // a graph. - shadConverter.constructBufferedGraph( - base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, - nodeBegin, nodeEnd, edgeBegin, edgeEnd, &bufGraph); + constructCSRFromSHADGraph( + &bufGraph, &shadConverter, nodeBegin, nodeEnd, host_prefix); } graphReadTimer.stop(); @@ -547,9 +528,9 @@ class NewDistGraphGeneric : public DistGraph { Tgraph_construct.stop(); galois::gDebug("[", base_DistGraph::id, "] Graph construction complete."); - if (useShad) { + if (useWMD) { // Different from the gr format file that has been used by Galois - // and does not contain node data in the file, + // and does not contain node data in the file, // a SHAD graph file has a single type for each node, and it // is considered as node data. // This function constructs and sets node data (type). @@ -604,11 +585,79 @@ class NewDistGraphGeneric : public DistGraph { return toReturn; } + /// Construct arrays for in-memory CSR. + /// In case of the node out-going edge range array and + /// the node data array, it will extract parts corresponding to + /// local graph paritition from the arrays holding the global + /// array information. + /// Edge destination and data arrays are constructed based on + /// unordered maps constructed from SHAD graph reading. + /// NOTE that those arrays for CSR all store GLOBAL node ids. + /// For example, edge destination array's size is equal + /// to the number of local edges, but its destination ID is + /// global node IDs, not local node IDs. + /// + /// @tparam T Graph node data type + /// + /// @param bufGraph Buffered graph to construct + /// @param shadConverter Shad graph ingestor which ingested + /// a SHAD graph in memory to an unordered node/edge map + /// @param nodeBegin Global id of the first local node range + /// @param nodeEnd Global id of the last local node range + /// @param host_prefix Log prefix string for this host + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void constructCSRFromSHADGraph( + galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, std::string host_prefix) { + uint32_t numLocalNodes = nodeEnd - nodeBegin; + // So, this holds outgoing edge array of a whole (global) graph. 
+ uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); + // Global edge id range assigned to the current host. + uint64_t edgeBegin = (nodeBegin == 0) ? 0 : outIndexBuffer[nodeBegin - 1]; + // This is the last local node's edge range end. + // So, [edgeBegin, edgeEnd) is for this current host. + uint64_t edgeEnd = outIndexBuffer[nodeEnd - 1]; + galois::gInfo(host_prefix, "Starts local out index array construction"); + // Extract node out-going range and data arrays of local nodes. + // From now on, those arrays store local node information + // as a dense memory representation. + shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); + galois::gInfo(host_prefix, + "Completes local out index array construction"); + + galois::gInfo(host_prefix, "Starts edge destination/data " + "array construction"); + uint64_t numLocalEdges = edgeEnd - edgeBegin; + shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, + numLocalEdges); + + galois::gInfo(host_prefix, "Completes edge destination/data " + "array construction"); + // Construct a buffered graph that is used by CuSP to partition + // a graph. + shadConverter->constructBufferedGraph( + base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges, + nodeBegin, nodeEnd, edgeBegin, edgeEnd, bufGraph); + galois::gInfo(host_prefix, "Completes buffered graph construction from" + " SHAD graph"); + } + + // Disable this method for non-SHAD graph construction. + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void constructCSRFromSHADGraph( + galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, std::string) {} + /** * @brief Assign a SHAD node type to a node data. * * @detail Different from the gr format file that has been used by Galois - * and does not contain node data in the file, + * and does not contain node data in the file, * a SHAD graph file has a single type for each node, and it * considered as node data. This function constructs and sets node * data based on that. @@ -620,26 +669,23 @@ class NewDistGraphGeneric : public DistGraph { * @param shadConverter SHAD graph converter holding node data from a * SHAD file. 
*/ - template >* = nullptr> - void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { + template < + typename T = NodeTy, + typename std::enable_if_t>* = nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter* shadConverter) { galois::gPrint("[", base_DistGraph::id, "] Graph node data is assigned."); - uint64_t* nodeDataBuffer = shadConverter->getNodeDataBuffer(); + shad::ShadNodeTy* nodeDataBuffer = shadConverter->getNodeDataBuffer(); galois::do_all(galois::iterate(base_DistGraph::allNodesRange()), - [&](uint32_t lid) { - uint64_t gid = this->getGID(lid); - this->getData(lid) = nodeDataBuffer[gid]; - std::cout << "lid :" << lid << " is set to " << - this->getData(lid) << "\n"; - }); + [&](uint32_t lid) { + uint64_t gid = this->getGID(lid); + this->getData(lid) = nodeDataBuffer[gid]; + }); } template >* = nullptr> - void assignNodeDataFromSHADProp( - [[maybe_unused]] shad::ShadGraphConverter* shadConverter) {} + typename std::enable_if_t>* = + nullptr> + void assignNodeDataFromSHADProp(shad::ShadGraphConverter*) {} /** * For each other host, determine which nodes that this host needs to get diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp index fe71231295..dedc3c34cb 100644 --- a/libcusp/test/shad-dist-graph.cpp +++ b/libcusp/test/shad-dist-graph.cpp @@ -29,15 +29,20 @@ int main() { //M = 1; galois::setActiveThreads(M); - shad::ShadGraphConverter shadConverter; + shad::ShadGraphConverter shadConverter; size_t numNodes{0}, numEdges{0}; - std::string filename = "/home/hochan/data.csv"; + // TODO(hc): This path should be properly set based on user's environment. + // Later, this test dataset will be included in the Galois repository, and + // will use a relative path. + std::string filename = "/home/hochan/data.01.csv"; shadConverter.readSHADFile(filename, &numNodes, &numEdges); - std::unique_ptr> - graph = galois::cuspPartitionGraph( + std::unique_ptr> + graph = galois::cuspPartitionGraph( filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); + std::cout << "Test starts...\n"; + galois::DGAccumulator sumGlobalNodes; galois::DGAccumulator sumGlobalEdges; @@ -55,13 +60,16 @@ int main() { assert(reducedSumGlobalEdges == numEdges); assert(reducedSumGlobalEdges == graph->globalSizeEdges()); + std::cout << "Num. 
nodes/edges tests has been passed\n"; + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; { std::ofstream fp(std::to_string(id) + ".master"); for (uint32_t src = 0; src < graph->numMasters(); ++src) { uint64_t srcglobal = graph->getGID(src); - fp << "node " << srcglobal << ", type: " << graph->getData(src) << "\n"; + fp << "node " << srcglobal << ", type: " << graph->getData(src).type << + ", key: " << graph->getData(src).key << "\n"; for (auto e : graph->edges(src)) { uint32_t dstlocal = graph->getEdgeDst(e); uint64_t dstglobal = graph->getGID(dstlocal); @@ -78,11 +86,12 @@ int main() { std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); for (uint32_t i = 0; i < graph->size(); ++i) { fp << i << ", " << graph->getGID(i) << ", " << - graph->getData(i) << "\n"; + graph->getData(i).type << ", " << graph->getData(i).key << "\n"; } fp.close(); } } +#if 0 { for (uint32_t host = 0; host < numHosts; ++host) { if (host == id) { @@ -113,6 +122,7 @@ int main() { fp.close(); } } +#endif return 0; } diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h index 5162fc8dfb..4b1c0351db 100644 --- a/libgalois/include/shad/ShadGraphConverter.h +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -12,11 +12,16 @@ namespace shad { +struct ShadNodeTy { + int type; + uint64_t key; +}; +using ShadEdgeTy = uint64_t; + /** * TODO(hc): This is a shared-memory version. * Later, a distributed-memory version in libgluon will reuse this code. */ -template class ShadGraphConverter { public: @@ -39,13 +44,14 @@ class ShadGraphConverter { for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { uint64_t key = this->verticeIdKeyMapping[i]; Vertex v = this->vertices[key]; - fp << "node " << i << ", type: " << to_underlying(v.type) << "\n"; + fp << "node " << i << ", type: " << to_underlying(v.type) << ", key: " << + key << "\n"; auto edgeRange = this->edges.equal_range(key); for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { Edge& edge = ei->second; Vertex dst = this->vertices[edge.dst]; fp << "\t edge dst " << dst.id << ", type: " << - to_underlying(edge.type) << "\n"; + to_underlying(edge.type) << ", key: " << dst.shadKey << "\n"; } } fp.close(); @@ -110,7 +116,7 @@ class ShadGraphConverter { uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint32_t nodeBegin, uint32_t nodeEnd, uint64_t edgeBegin, uint64_t edgeEnd, - [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { // TODO(hc): Each of these functions first construct graphs in the SHAD // format as this file is written in not binary, but string, and also // nodes or edges are not sorted. So, until we preprocess the input graph @@ -250,7 +256,7 @@ class ShadGraphConverter { * @brief Return node data array. * Note that this can be either of global graph or local graph. */ - uint64_t* getNodeDataBuffer() { + ShadNodeTy* getNodeDataBuffer() { return nodeDataBuffer; } @@ -280,7 +286,7 @@ class ShadGraphConverter { uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { // 1) Construct an edge index array (size == number of nodes). this->outIndexBuffer = new uint64_t[numLocalNodes]; - this->nodeDataBuffer = new uint64_t[numLocalNodes]; + this->nodeDataBuffer = new ShadNodeTy[numLocalNodes]; // TODO(hc): for now, only consider a single host, but need to add offset later. 
galois::do_all(galois::iterate(this->vertices), @@ -292,8 +298,13 @@ class ShadGraphConverter { vertex.getNumEdges(); // Fill vertex data too; This assumes that a SHAD graph // has a type, which is considered as a vertex data. - this->nodeDataBuffer[vertexId - nodeBegin] = + this->nodeDataBuffer[vertexId - nodeBegin].type = this->to_underlying(vertex.type); + this->nodeDataBuffer[vertexId - nodeBegin].key = + vertex.shadKey; + //std::cout << vertexId - nodeBegin << " is set to " + //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << + //this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; } }); // 2) Perform parallel prefix sum to finalize outgoing edge index @@ -320,14 +331,14 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> void constructEdgeArrays( uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, uint64_t numLocalEdges) { this->edgeDestBuffer = new uint32_t[numLocalEdges]; - this->edgeDataBuffer = new EdgeDataTy[numLocalEdges]; + this->edgeDataBuffer = new ShadEdgeTy[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); galois::on_each([&](uint32_t tid, uint32_t numThreads) { // 1) Find disjointed node range for each thread. @@ -375,7 +386,7 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> void constructEdgeArrays( @@ -437,7 +448,7 @@ class ShadGraphConverter { * @return True if passed information matches to the one in * a temporary vertex map */ - bool checkNode(uint64_t id, uint64_t type) { + bool checkNode(uint64_t id, int type) { uint64_t key = this->verticeIdKeyMapping[id]; Vertex& vertex = this->vertices[key]; return (this->to_underlying(vertex.type) == type); @@ -457,11 +468,8 @@ class ShadGraphConverter { * a temporary edge map */ bool checkEdge(uint64_t snid, uint64_t dnid, - uint64_t eid, uint64_t type) { + uint64_t /*eid*/, int type) { uint64_t skey = this->verticeIdKeyMapping[snid]; - uint64_t dkey = this->verticeIdKeyMapping[dnid]; - - Vertex& vertex = this->vertices[skey]; auto edgeRange = this->edges.equal_range(skey); uint64_t eidx{0}; Edge edge; @@ -527,7 +535,7 @@ class ShadGraphConverter { return tokens; } - void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t numEdges) { + void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t /*numEdges*/) { //galois::on_each([this, numNodes, numEdges]( galois::on_each([&]( uint32_t tid, uint32_t numThreads) { @@ -626,16 +634,16 @@ class ShadGraphConverter { void VerifyCSRConstruction( [[maybe_unused]] uint64_t* outIndexBuffer, - [[maybe_unused]] uint64_t* nodeDataBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, [[maybe_unused]] uint32_t* edgeDestBuffer, [[maybe_unused]] void* edgeDataBuffer) {} - template >* = nullptr> void VerifyCSRConstruction( - uint64_t* outIndexBuffer, [[maybe_unused]] uint64_t* nodeDataBuffer, - uint32_t* edgeDestBuffer, EdgeDataTy* edgeDataBuffer) { + uint64_t* outIndexBuffer, [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + uint32_t* edgeDestBuffer, ShadEdgeTy* edgeDataBuffer) { // 1) Iterate edge index array. // 2) Compare each verteices' edge range with SHAD vertex for (size_t i = 0; i < this->vertices.size(); ++i) { @@ -645,7 +653,7 @@ class ShadGraphConverter { uint64_t edgeBegin = (i == 0)? 
0 : outIndexBuffer[i - 1]; uint64_t edgeEnd = outIndexBuffer[i]; assert(srcV.numEdges == edgeEnd - edgeBegin); - assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i])); + assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i].type)); assert(srcV.id == i); galois::do_all(galois::iterate(edgeBegin, edgeEnd), [&](size_t j) { @@ -701,10 +709,10 @@ class ShadGraphConverter { // Key is global node id and value is corresponding key of that node std::unordered_map verticeIdKeyMapping; // TODO(hc): Always assume uint64_t node data type - uint64_t* nodeDataBuffer; + ShadNodeTy* nodeDataBuffer; uint64_t* outIndexBuffer; uint32_t* edgeDestBuffer; - EdgeDataTy* edgeDataBuffer; + ShadEdgeTy* edgeDataBuffer; }; }; // shad namespace diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index dc834357b5..ec24bf2ce6 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -1443,7 +1443,7 @@ class GluonSubstrate : public galois::runtime::GlobalObject { typename FnTy::ValTy::value_type* location_to_write) { if (syncType == syncReduce) { FnTy::ExtractDirect(lid, location_to_write); - char dummy = 0; + typename FnTy::NodeTy dummy{}; FnTy::reset(lid, dummy); } else { FnTy::ExtractDirect(lid, location_to_write); diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 75398c4f02..56cf8dd311 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1985,14 +1985,16 @@ class FieldFlags { #define GALOIS_SYNC_STRUCTURE_GNN_LAYER(fieldname, cuda_ctx_for_sync, \ gnn_matrix_to_sync_column_length_, \ layer_number_to_sync) \ + template \ struct GNNSumAggregate_##fieldname { \ + using NodeTy = NTy; \ using ValTy = GNNFloat; \ \ - static ValTy extract(uint32_t, char&) { return 0.f; } \ + static ValTy extract(uint32_t, NodeTy&) { return 0.f; } \ \ - static bool reduce(uint32_t, char&, ValTy) { return false; } \ + static bool reduce(uint32_t, NodeTy&, ValTy) { return false; } \ \ - static void reset(uint32_t, char&) {} \ + static void reset(uint32_t, NodeTy&) {} \ \ static void setVal(uint32_t, char&, ValTy) {} \ \ diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index 98df493175..ca799c34b4 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -2,17 +2,8 @@ set(sources src/DistributedMinibatchTracker.cpp src/GNNMath.cpp src/GNNOptimizers.cpp - src/GraphNeuralNetwork.cpp src/MinibatchGenerator.cpp src/graphs/GNNGraph.cpp - src/graphs/GNNSubgraph.cpp - src/layers/DenseLayer.cpp - src/layers/GNNLayer.cpp - src/layers/GraphConvolutionalLayer.cpp - src/layers/L2NormLayer.cpp - src/layers/SAGELayer.cpp - src/layers/SigmoidLayer.cpp - src/layers/SoftmaxLayer.cpp ) ## TODO(hc): Note that these libraries should be hard-coded @@ -26,8 +17,8 @@ set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") add_library(galois_gnn STATIC ${sources}) target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn ${INTEL_LIBS}) +target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_link_libraries(galois_gnn galois_shmem) target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC diff --git a/libgnn/include/galois/GraphNeuralNetwork.h 
b/libgnn/include/galois/GraphNeuralNetwork.h index 7aa859c84c..c63175f65e 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -9,6 +9,14 @@ #include "galois/graphs/GNNGraph.h" #include "galois/layers/GNNLayer.h" #include "galois/DistributedMinibatchTracker.h" +#include "galois/GNNMath.h" +#include "galois/GraphNeuralNetwork.h" +#include "galois/layers/DenseLayer.h" +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/L2NormLayer.h" +#include "galois/layers/SAGELayer.h" +#include "galois/layers/SigmoidLayer.h" +#include "galois/layers/SoftmaxLayer.h" #ifdef GALOIS_ENABLE_GPU #include "galois/GraphNeuralNetwork.cuh" @@ -139,19 +147,195 @@ class GraphNeuralNetworkConfig { //! Class representing the graph neural network: contains the graph to train as //! well as all the layers that comprise it +template class GraphNeuralNetwork { public: //! Construct the graph neural network given the graph to train on as well as //! a configuration object - GraphNeuralNetwork(std::unique_ptr graph, + GraphNeuralNetwork(std::unique_ptr> graph, std::unique_ptr optimizer, - GraphNeuralNetworkConfig&& config); + GraphNeuralNetworkConfig&& config) + : graph_(std::move(graph)), optimizer_(std::move(optimizer)), + config_(std::move(config)) { + if (config_.do_sampling_ && config_.use_train_subgraph_) { + GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " + "(sampling uses training subgraph already)"); + } + // max number of rows that can be passed as inputs; allocate space for it as + // this will be the # of rows for each layer + size_t max_rows = graph_->size(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); + } +#endif + // used for chaining layers together; begins as nullptr + PointerWithSize prev_output_layer(nullptr, 0); + num_graph_user_layers_ = 0; + + // create the intermediate layers + for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { + GNNLayerType layer_type = config_.intermediate_layer_type(i); + size_t prev_layer_columns; + + if (i != 0) { + // grab previous layer's size + prev_layer_columns = config_.intermediate_layer_size(i - 1); + } else { + // first layer means the input columns are # features in graph + prev_layer_columns = graph_->node_feature_length(); + } + + // max dims + GNNLayerDimensions layer_dims = {.input_rows = max_rows, + .input_columns = prev_layer_columns, + .output_columns = + config_.intermediate_layer_size(i), + .output_rows = max_rows}; + + // test minibatch size: if it's not enabled, then currently the full + // graph is used (should really only subgraph the test nodes, though; + // that's a TODO) + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + galois::gInfo("Not allocating rows"); + // set to 0 here to make it allocate nothing + layer_dims.input_rows = 0; + layer_dims.output_rows = 0; + } + + switch (layer_type) { + case GNNLayerType::kGraphConvolutional: + gnn_layers_.push_back( + std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); + break; + case GNNLayerType::kSAGE: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + 
gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); +#ifdef GALOIS_ENABLE_GPU + // TODO(loc/hochan) sage layer gpu +#endif + break; + case GNNLayerType::kL2Norm: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; + case GNNLayerType::kDense: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + + // update output layer for next layer + prev_output_layer = gnn_layers_.back()->GetForwardOutput(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + graph_->InitLayerVectorMetaObjects( + i, galois::runtime::getSystemNetworkInterface().Num, + layer_dims.input_columns, layer_dims.output_columns); + } +#endif + } + + // loop backward and find last GCN/SAGE (main) layer to disable activation + for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); + back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + galois::gDebug("Disabling activation on layer ", + (*back_iter)->layer_number(), "\n"); + (*back_iter)->DisableActivation(); + break; + } + } + + if (config_.do_sampling() || config_.use_train_subgraph_ || + config.train_minibatch_size() || config.test_minibatch_size()) { + // output layer not included; it will never involve sampling + graph_->InitializeSamplingData(num_graph_user_layers_, + config_.use_train_subgraph_); + } + + num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; + if (config_.train_minibatch_size()) { + graph_->SetupTrainBatcher(config_.train_minibatch_size()); + // size_t local_num = + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_ = + // std::make_unique( + // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, + // local_num, config_.train_minibatch_size()); + //} + } + + if (config_.test_minibatch_size()) { + graph_->SetupTestBatcher(config_.test_minibatch_size()); + } + + // create the output layer + GNNLayerDimensions output_dims = { + .input_rows = max_rows, + // get last intermediate layer column size + .input_columns = config_.intermediate_layer_size( + config_.num_intermediate_layers() - 1), + .output_columns = config_.output_layer_size(), + .output_rows = max_rows}; + + if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && + config_.test_minibatch_size()) { + output_dims.input_rows = 0; + output_dims.output_rows = 0; + } + + switch (config_.output_layer_type()) { + case (GNNOutputLayerType::kSoftmax): + gnn_layers_.push_back(std::move(std::make_unique>( + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); + break; + case (GNNOutputLayerType::kSigmoid): + gnn_layers_.push_back(std::move(std::make_unique>( + config_.num_intermediate_layers(), *graph_, &prev_output_layer, + output_dims))); + break; + default: + GALOIS_LOG_FATAL("Invalid layer type during network construction"); + } + + // sanity checking multi-class + output layer + if (!graph_->is_single_class_label() && + (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { + GALOIS_LOG_WARN( + "Using a non-sigmoid output layer with a multi-class label!"); + // if debug mode just kill program + assert(false); + } + + // flip sampling on layers + if (config_.use_train_subgraph_ || 
config_.do_sampling() || + config_.train_minibatch_size()) { + for (std::unique_ptr>& ptr : gnn_layers_) { + ptr->EnableSampling(); + } + } + } //! Number of intermediate layers (DOES NOT INCLUDE OUTPUT LAYER) size_t num_intermediate_layers() { return gnn_layers_.size() - 1; } //! Returns pointer to intermediate layer i - galois::GNNLayer* GetIntermediateLayer(size_t i) { + galois::GNNLayer* GetIntermediateLayer(size_t i) { if (i < gnn_layers_.size() - 1) { return gnn_layers_[i].get(); } else { @@ -162,43 +346,669 @@ class GraphNeuralNetwork { //! Set the phases of all layers at once as well as this network void SetLayerPhases(galois::GNNPhase phase) { phase_ = phase; - for (std::unique_ptr& ptr : gnn_layers_) { + for (std::unique_ptr>& ptr : gnn_layers_) { ptr->SetLayerPhase(phase); } } //! Set weights on all layers to 1; should be used for debugging only void SetAllLayerWeightsTo1() { - for (std::unique_ptr& ptr : gnn_layers_) { + for (std::unique_ptr>& ptr : gnn_layers_) { ptr->InitAllWeightsTo1(); } } //! Returns the output layer - galois::GNNLayer* GetOutputLayer() { return gnn_layers_.back().get(); } + galois::GNNLayer* GetOutputLayer() { + return gnn_layers_.back().get(); + } + + float MinibatchedTesting() { + galois::gDebug("Minibatched Testing"); + graph_->DisableSubgraph(); + graph_->ResetTestMinibatcher(); + SetLayerPhases(galois::GNNPhase::kBatch); + + bool choose_all_status = graph_->SubgraphChooseAllStatus(); + + uint32_t correct = 0; + uint32_t total = 0; + while (true) { + work_left_.reset(); + // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); + graph_->PrepareNextTestMinibatch(); + // last layer input size/output rows becomes seed node size + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); + size_t num_sampled_layers = 0; + + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + // size_t current_sample_size; + graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, + num_sampled_layers + 1); + // resize this layer, change seed node count + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, seed_node_count); + // seed_node_count = current_sample_size; + + num_sampled_layers++; + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + } + } + + // resize layer matrices + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + graph_->EnableSubgraphChooseAll(); + CorrectBackwardLinks(); + + const PointerWithSize batch_pred = DoInference(); + std::pair correct_total = + graph_->GetBatchAccuracy(batch_pred); + + correct += correct_total.first; + total += correct_total.second; + + work_left_ += graph_->MoreTestMinibatches(); + char global_work_left = work_left_.reduce(); + if (!global_work_left) { + break; + } + } + + galois::gInfo("Minibatching Correct / Total ", correct, " ", total); + + if (choose_all_status) { + graph_->EnableSubgraphChooseAll(); + } else { + graph_->DisableSubgraphChooseAll(); + } - float MinibatchedTesting(); + return (1.0 * correct) / (1.0 * total); + } //! Do training for a specified # of epochs and return test accuracy at the //! 
end of it - float Train(size_t num_epochs); + float Train(size_t num_epochs) { + EnableTimers(); + const size_t this_host = graph_->host_id(); + float train_accuracy{0.f}; + std::vector subgraph_layer_sizes; + // this subgraph only needs to be created once + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + galois::StatTimer total_subgraph_construction_timer( + "TotalSubGraphConstruction", kRegionName); + galois::StatTimer setup_neighborhood_sample_timer( + "SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + total_subgraph_construction_timer.start(); + + setup_neighborhood_sample_timer.start(); + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + + subgraph_layer_sizes.emplace_back(local_seed_node_count); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + edge_sampling_timer.start(); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for train subgraph for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + // resizing + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + subgraph_layer_sizes.emplace_back(local_seed_node_count); + num_sampled_layers++; + } + } + edge_sampling_timer.stop(); + subgraph_construction_timer.start(); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + total_subgraph_construction_timer.stop(); + } + + galois::StatTimer epoch_timer("TrainingTime", kRegionName); + galois::StatTimer validation_timer("ValidationTime", kRegionName); + galois::StatTimer epoch_test_timer("TestTime", kRegionName); + + for (size_t epoch = 0; epoch < num_epochs; epoch++) { + epoch_timer.start(); + // swap to train subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { + graph_->EnableSubgraph(); + // TODO(loc) this doesn't actually function as expected anymore + // with the numerous changes to the system; this commenting + // out is more of a hack for the train subgraph option (which + // probably shouldn't be used anyways) + + // size_t l_count = 0; + // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); + // for (auto back_iter = gnn_layers_.rbegin(); + // back_iter != gnn_layers_.rend(); back_iter++) { + // GNNLayerType layer_type = (*back_iter)->layer_type(); + // if (layer_type == GNNLayerType::kGraphConvolutional || + // layer_type == GNNLayerType::kSAGE) { + // (*back_iter) + // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], + // subgraph_layer_sizes[l_count]); + // l_count++; + // } + //} + CorrectBackwardLinks(); + } + + // beginning of epoch sampling (no minibatches) + if (config_.do_sampling() && 
!config_.train_minibatch_size()) { + galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + galois::StatTimer setup_neighborhood_sample_timer( + "SetupNeighborhoodSample", kRegionName); + galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); + mb_timer.start(); + + setup_neighborhood_sample_timer.start(); + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + setup_neighborhood_sample_timer.stop(); + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + + edge_sampling_timer.start(); + // work backwards on GCN/SAGE layers + // loop backward and find last GCN/SAGE (main) layer to disable + // activation + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + edge_sampling_timer.stop(); + // resize layer matrices + subgraph_construction_timer.start(); + CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + mb_timer.stop(); + } + + if (!config_.train_minibatch_size()) { + // no minibatching, full batch + const PointerWithSize predictions = DoInference(); + // have to get accuracy here because gradient prop destroys the + // predictions matrix + train_accuracy = GetGlobalAccuracy(predictions); + GradientPropagation(); + } else { + graph_->ResetTrainMinibatcher(); + // if (num_hosts_ > 1) { + // dist_minibatch_tracker_->ResetEpoch(); + //} + + SetLayerPhases(galois::GNNPhase::kBatch); + + size_t batch_num = 0; + + // create mini batch graphs and loop until minibatches on all hosts done + while (true) { + galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); + galois::StatTimer sample_time("MinibatchSampling", kRegionName); + galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); + galois::StatTimer subgraph_construction_timer("SubGraphConstruction", + kRegionName); + mb_timer.start(); + + galois::Timer batch_timer; + batch_timer.start(); + work_left_.reset(); + galois::gInfo("Epoch ", epoch, " batch ", batch_num++); + // break when all hosts are done with minibatches + prep_timer.start(); + size_t seed_node_count; + // if (num_hosts_ > 1) { + // size_t num_for_next_batch = + // dist_minibatch_tracker_->GetNumberForNextMinibatch(); + // galois::gInfo(graph_->host_prefix(), "Sampling ", + // num_for_next_batch, + // " for this minibatch"); + // seed_node_count = + // graph_->PrepareNextTrainMinibatch(num_for_next_batch); + //} else { + //} + seed_node_count = graph_->PrepareNextTrainMinibatch(); + + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is for batch is ", + seed_node_count); + 
prep_timer.stop(); + + // last layer input size/output rows becomes seed node size + // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, + // seed_node_count); + + sample_time.start(); + // +1 later in call because 0 is already taken + size_t num_sampled_layers = 0; + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + // you can minibatch with sampling or minibatch and grab all + // relevant neighbors + size_t current_sample_size; + + if (config_.do_sampling()) { + current_sample_size = graph_->SampleEdges( + (*back_iter)->graph_user_layer_number(), + config_.fan_out_vector_[num_sampled_layers], + config_.inductive_subgraph_, num_sampled_layers + 1); + } else { + current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + } + + galois::gDebug(graph_->host_prefix(), + "Number of local nodes for layer ", + (*back_iter)->graph_user_layer_number(), " is ", + current_sample_size); + + // resize this layer, change seed node count + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // seed_node_count); + seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + sample_time.stop(); + + // resize layer matrices + subgraph_construction_timer.start(); + CorrectRowCounts( + graph_->ConstructSampledSubgraph(num_sampled_layers)); + subgraph_construction_timer.stop(); + CorrectBackwardLinks(); + + // XXX resizes above only work for SAGE layers; will break if other + // layers are tested + + mb_timer.stop(); + + const PointerWithSize batch_pred = DoInference(); + train_accuracy = GetGlobalAccuracy(batch_pred); + GradientPropagation(); + + work_left_ += graph_->MoreTrainMinibatches(); + char global_work_left = work_left_.reduce(); + batch_timer.stop(); + epoch_timer.stop(); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + + bool test_eval = + config_.minibatch_test_interval_ + ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 + : false; + + if (test_eval) { + DisableTimers(); + float test_acc; + if (!config_.test_minibatch_size()) { + // TODO something about this path breaks accuracy + GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " + "run for some reason"); + bool f = graph_->SubgraphChooseAllStatus(); + graph_->DisableSubgraph(); + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + graph_->EnableSubgraphChooseAll(); + const PointerWithSize test_pred = DoInference(); + test_acc = GetGlobalAccuracy(test_pred); + graph_->SetSubgraphChooseAll(f); + } else { + test_acc = MinibatchedTesting(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Test accuracy is ", test_acc, "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); + } + + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + kRegionName, + "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + + std::to_string(batch_num - 1), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + EnableTimers(); + } + + epoch_timer.start(); + + if (!global_work_left) { + // if (num_hosts_ > 1) { + // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); + //} + break; + } + } + } + epoch_timer.stop(); + + if (this_host == 0) { + const std::string t_name_acc = + "TrainEpoch" + std::to_string(epoch) + "Accuracy"; + galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", + train_accuracy, "\n"); + galois::runtime::reportStat_Single(kRegionName, t_name_acc, + train_accuracy); + } + + bool do_validate = config_.validation_interval_ + ? epoch % config_.validation_interval_ == 0 + : false; + bool do_test = + config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; + + bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); + + if (do_validate || do_test) { + DisableTimers(); + // disable subgraph + graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + } + + if (do_validate) { + // XXX induced subgraph here + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + + CorrectBackwardLinks(); + validation_timer.start(); + SetLayerPhases(galois::GNNPhase::kValidate); + const PointerWithSize val_pred = DoInference(); + validation_timer.stop(); + + float val_acc = GetGlobalAccuracy(val_pred); + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, + "\n"); + const std::string v_name_acc = + "ValEpoch" + std::to_string(epoch) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); + } + } + + if (do_test) { + epoch_test_timer.start(); + float test_acc; + + if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // nuclear resize + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize test_pred = DoInference(); + epoch_test_timer.stop(); + test_acc = GetGlobalAccuracy(test_pred); + } else { + test_acc = MinibatchedTesting(); + epoch_test_timer.stop(); + } + + if (this_host == 0) { + galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, + "\n"); + const std::string test_name_acc = + "TestEpoch" + std::to_string(epoch) + "Accuracy"; + galois::runtime::reportStat_Single(kRegionName, test_name_acc, + test_acc); + } + } + + if (do_validate || do_test) { + // report the training time elapsed at this point in time + galois::runtime::reportStat_Single( + kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), + epoch_timer.get()); + // revert to training phase for next epoch + SetLayerPhases(galois::GNNPhase::kTrain); + graph_->SetSubgraphChooseAll(subgraph_choose_all_status); + + // TODO too much code dupe + // Resconstruct the train subgraph since it was replaced by test + // subgraph + if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && + config_.test_minibatch_size() && do_test) { + // Setup the subgraph to only be the training graph + size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); + galois::gDebug(graph_->host_prefix(), + "Number of local seed nodes is ", + local_seed_node_count); + size_t num_sampled_layers = 0; + // gnn_layers_.back()->ResizeRows(local_seed_node_count); + for (auto back_iter = gnn_layers_.rbegin(); + back_iter != gnn_layers_.rend(); back_iter++) { + GNNLayerType layer_type = (*back_iter)->layer_type(); + if (layer_type == GNNLayerType::kGraphConvolutional || + layer_type == GNNLayerType::kSAGE) { + size_t current_sample_size = graph_->SampleAllEdges( + (*back_iter)->graph_user_layer_number(), + config_.inductive_subgraph_, num_sampled_layers + 1); + // resizing + //(*back_iter) + // ->ResizeInputOutputRows(current_sample_size, + // local_seed_node_count); + local_seed_node_count = current_sample_size; + num_sampled_layers++; + } + } + CorrectRowCounts( + graph_->ConstructSampledSubgraph(num_sampled_layers)); + CorrectBackwardLinks(); + } + + EnableTimers(); + } + } + + uint64_t average_epoch_time = epoch_timer.get() / num_epochs; + galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", + average_epoch_time); 
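+  // Descriptive comment (editorial, summarizing the code below): training
+  // epochs are complete at this point, so any sampled subgraph is torn down
+  // and one final test pass runs (full graph, or minibatched testing) to
+  // compute and report FinalTestAccuracy.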
+ // DisableTimers(); + // disable subgraph + graph_->DisableSubgraph(); + graph_->EnableSubgraphChooseAll(); + + // check test accuracy + galois::StatTimer test_timer("FinalTestRun", kRegionName); + float global_accuracy; + + test_timer.start(); + + if (!config_.test_minibatch_size()) { + for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); + layer++) { + // TODO nuclear resize; this is **ridiculously** inefficient + // because full graph will be used even if not included in test + // k-hop neighborhood for eval + (*layer)->ResizeRows(graph_->size()); + } + CorrectBackwardLinks(); + SetLayerPhases(galois::GNNPhase::kTest); + const PointerWithSize predictions = DoInference(); + global_accuracy = GetGlobalAccuracy(predictions); + } else { + global_accuracy = MinibatchedTesting(); + } + + test_timer.stop(); + + if (this_host == 0) { + galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); + galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", + global_accuracy); + } + + return global_accuracy; + } //! Propogates the graph's feature vectors through the network to get a new //! vector representation. //! Also known as the forward phase in most literature //! @returns Output layer's output - const PointerWithSize DoInference(); + const PointerWithSize DoInference() { + galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + + // start with graph features and pass it through all layers of the network + galois::PointerWithSize layer_input = + graph_->GetLocalFeatures(); + + for (std::unique_ptr>& ptr : gnn_layers_) { + layer_input = ptr->ForwardPhase(layer_input); + } + + if (timers_on_) { + timer.stop(); + } + + return layer_input; + } //! Returns classification accuracy for single class label or micro F1 score //! for multi-class predictions; this calls into GNNGraph's accuracy call - float GetGlobalAccuracy(const PointerWithSize predictions); + float GetGlobalAccuracy(const PointerWithSize predictions) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (cpu_pred_.size() != predictions.size()) { + cpu_pred_.resize(predictions.size()); + } + + // TODO get rid of CPU copy here if possible + AdamOptimizer* adam = static_cast(optimizer_.get()); + adam->CopyToVector(cpu_pred_, predictions); + return graph_->GetGlobalAccuracy(cpu_pred_, phase_, + config_.do_sampling()); + } else { +#endif + return graph_->GetGlobalAccuracy(predictions, phase_, + config_.do_sampling()); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + float GetGlobalAccuracy(const PointerWithSize predictions, bool sampling); //! Backpropagate gradients from the output layer backwards through the //! network to update the layer weights. Also known as a backward phase in //! 
most literature - void GradientPropagation(); + void GradientPropagation() { + galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); + if (timers_on_) { + timer.start(); + } + + // from output layer get initial gradients + std::vector dummy; + std::unique_ptr>& output_layer = + gnn_layers_.back(); + galois::PointerWithSize current_gradients = + output_layer->BackwardPhase(dummy, nullptr); + // loops through intermediate layers in a backward fashion + // -1 to ignore output layer which was handled above + for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { + // note this assumes you have at least 2 layers (including output) + size_t layer_index = gnn_layers_.size() - 2 - i; + + // get the input to the layer before this one + galois::PointerWithSize prev_layer_input; + if (layer_index != 0) { + prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); + } else { + prev_layer_input = graph_->GetLocalFeatures(); + } + + // backward prop and get a new set of gradients + current_gradients = gnn_layers_[layer_index]->BackwardPhase( + prev_layer_input, ¤t_gradients); + // if not output do optimization/gradient descent + // at this point in the layer the gradients exist; use the gradients to + // update the weights of the layer + gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); + } + + if (timers_on_) { + timer.stop(); + } + } //! # nodes may change in distributed setting due to dead mirrors; //! given the # of nodes at each layer, fix the input/output rows @@ -227,7 +1037,17 @@ class GraphNeuralNetwork { } //! Call whenever resize occurs to correct reuse of pointers for layers - void CorrectBackwardLinks(); + void CorrectBackwardLinks() { + // layer chain pointer + PointerWithSize prev_output_layer(nullptr, 0); + for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { + // first layer is nullptr so can be ignored + if (layer_num != 0) { + gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); + } + prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); + } + } private: static const constexpr char* kRegionName = "GraphNeuralNetwork"; @@ -251,13 +1071,13 @@ class GraphNeuralNetwork { } //! Underlying graph to train - std::unique_ptr graph_; + std::unique_ptr> graph_; //! Optimizer object for weight updates std::unique_ptr optimizer_; //! Configuration object used to construct this GNN GraphNeuralNetworkConfig config_; //! GNN layers including the output - std::vector> gnn_layers_; + std::vector>> gnn_layers_; //! Current phase of the GNN: train, validation, test GNNPhase phase_{GNNPhase::kTrain}; //! Number of layers that use the graph (e.g. SAGE, GCN) diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index 659541570d..a104f18bff 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -10,17 +10,19 @@ extern uint32_t* gnn_degree_vec_2_; extern galois::DynamicBitSet bitset_sampled_degrees_; extern std::vector>* gnn_sampled_out_degrees_; +template struct InitialDegreeSync { - using ValTy = std::pair; + using NodeTy = NTy; + using ValTy = std::pair; //! return a vector of floats to sync - static ValTy extract(uint32_t lid, char&) { + static ValTy extract(uint32_t lid, NodeTy&) { return std::make_pair(gnn_degree_vec_1_[lid], gnn_degree_vec_2_[lid]); } //! reduction is addition in this case; add received vector to //! 
own vector - static bool reduce(uint32_t lid, char&, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { gnn_degree_vec_1_[lid] += y.first; gnn_degree_vec_2_[lid] += y.second; if (y.first || y.second) { @@ -31,13 +33,13 @@ struct InitialDegreeSync { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t lid, char&) { + static void reset(uint32_t lid, NodeTy&) { gnn_degree_vec_1_[lid] = 0; gnn_degree_vec_2_[lid] = 0; } //! element wise set - static void setVal(uint32_t lid, char&, ValTy y) { + static void setVal(uint32_t lid, NodeTy&, ValTy y) { gnn_degree_vec_1_[lid] = y.first; gnn_degree_vec_2_[lid] = y.second; } @@ -58,12 +60,14 @@ struct InitialDegreeSync { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template struct SubgraphDegreeSync { - using ValTy = galois::gstl::Vector; + using NodeTy = NTy; + using ValTy = galois::gstl::Vector; static size_t FeatVecSize() { return gnn_sampled_out_degrees_->size(); } - static ValTy extract(uint32_t lid, char&) { + static ValTy extract(uint32_t lid, NodeTy&) { ValTy vec_to_send(gnn_sampled_out_degrees_->size()); size_t count = 0; for (galois::LargeArray& layer_degrees : @@ -85,7 +89,7 @@ struct SubgraphDegreeSync { } } - static bool reduce(uint32_t lid, char&, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; @@ -93,7 +97,7 @@ struct SubgraphDegreeSync { return true; } - static bool reduce(uint32_t lid, char&, ValTy::value_type* y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy::value_type* y) { for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] += y[degree_index]; @@ -102,7 +106,7 @@ struct SubgraphDegreeSync { } //! No-op: readAny = overwritten anyways; can probably get away with no-op - static void reset(uint32_t lid, char&) { + static void reset(uint32_t lid, NodeTy&) { for (galois::LargeArray& layer_degrees : *gnn_sampled_out_degrees_) { layer_degrees[lid] = 0; @@ -110,14 +114,14 @@ struct SubgraphDegreeSync { } //! element wise set - static void setVal(uint32_t lid, char&, ValTy y) { + static void setVal(uint32_t lid, NodeTy&, ValTy y) { assert(y.size() == gnn_sampled_out_degrees_->size()); for (size_t degree_index = 0; degree_index < y.size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; } } - static void setVal(uint32_t lid, char&, ValTy::value_type* y) { + static void setVal(uint32_t lid, NodeTy&, ValTy::value_type* y) { for (size_t degree_index = 0; degree_index < gnn_sampled_out_degrees_->size(); degree_index++) { (*gnn_sampled_out_degrees_)[degree_index][lid] = y[degree_index]; diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 6dbfdfbcf1..ad41def334 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -6,6 +6,14 @@ #include "galois/graphs/GluonSubstrate.h" #include "galois/graphs/GraphAggregationSyncStructures.h" #include "galois/MinibatchGenerator.h" +#include "galois/Logging.h" +#include "galois/graphs/ReadGraph.h" +#include "galois/GNNMath.h" +#include "galois/graphs/DegreeSyncStructures.h" + +#include +#include +#include #ifdef GALOIS_ENABLE_GPU #include "galois/graphs/GNNGraph.cuh" @@ -31,24 +39,99 @@ namespace graphs { //! 
Possible partitioning schemes for the GNN graph enum class GNNPartitionScheme { kOEC, kCVC, kOCVC }; +template class GNNGraph { public: - using GNNDistGraph = galois::graphs::DistGraph; - using GraphNode = GNNDistGraph::GraphNode; + using GNNDistGraph = galois::graphs::DistGraph; + using GraphNode = typename GNNDistGraph::GraphNode; // defined as such because dist graph range objects used long unsigned using NodeIterator = boost::counting_iterator; - using EdgeIterator = GNNDistGraph::edge_iterator; + using EdgeIterator = typename GNNDistGraph::edge_iterator; // using GNNEdgeSortIterator = internal::EdgeSortIterator, // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label, bool useShad = false); + bool has_single_class_label, bool useWMD = false) + : GNNGraph(galois::default_gnn_dataset_path, dataset_name, + partition_scheme, has_single_class_label, + useWMD) {} + //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, - bool useShad = false); + bool useWMD = false) + : input_directory_(input_directory) { + GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, + dataset_name); + // save host id + host_id_ = galois::runtime::getSystemNetworkInterface().ID; + host_prefix_ = + std::string("[") + + std::to_string(galois::runtime::getSystemNetworkInterface().ID) + + std::string("] "); + // load partition + partitioned_graph_ = LoadPartition(input_directory_, dataset_name, + partition_scheme, useWMD); + galois::gInfo(host_prefix_, "Loading partition is completed"); + // reverse edges + partitioned_graph_->ConstructIncomingEdges(); + // mark a node if it is sampled + mark_sampled_nodes_.resize(partitioned_graph_->size()); + + galois::gInfo(host_prefix_, "Number of local proxies is ", + partitioned_graph_->size()); + galois::gInfo(host_prefix_, "Number of local edges is ", + partitioned_graph_->sizeEdges()); + + // init gluon from the partitioned graph + sync_substrate_ = + std::make_unique>( + *partitioned_graph_, host_id_, + galois::runtime::getSystemNetworkInterface().Num, false, + partitioned_graph_->cartesianGrid()); + bitset_graph_aggregate.resize(partitioned_graph_->size()); + + // Construct/read additional graph data + if (useWMD) { + galois::gInfo("Feature is constructed by aggregating 2-hop features, " + "instead from feature files"); + this->ConstructFeatureBy2HopAggregation(); + this->ConstructLocalLabels(); + this->SetLocalMasksRandomly(); + } else { + if (dataset_name != "ogbn-papers100M-remap") { + ReadLocalLabels(dataset_name, has_single_class_label); + } else { + galois::gInfo("Remapped ogbn 100M"); + ReadLocalLabelsBin(dataset_name); + } + ReadLocalFeatures(dataset_name); + ReadLocalMasks(dataset_name); + } + + // init norm factors (involves a sync call) + InitNormFactor(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // allocate/copy data structures over to GPU + GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); + InitGPUMemory(); + + // initialize CUDA context + cuda_ctx_ = get_CUDA_context(host_id_); + if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { + GALOIS_DIE("Failed to initialize CUDA context"); + } + PartitionedGraphInfo g_info; + GetPartitionedGraphInfo(g_info); + load_graph_CUDA_GNN(cuda_ctx_, g_info, + galois::runtime::getSystemNetworkInterface().Num); + } 
+#endif + } //! Returns host id size_t host_id() const { return host_id_; } @@ -127,7 +210,32 @@ class GNNGraph { void InitializeSamplingData() { InitializeSamplingData(1, false); } //! Initialize data required to do graph sampling - void InitializeSamplingData(size_t num_layers, bool is_inductive); + void InitializeSamplingData(size_t num_layers, bool choose_all) { + subgraph_ = std::make_unique(partitioned_graph_->size()); + sample_node_timestamps_.create(partitioned_graph_->size(), + std::numeric_limits::max()); + edge_sample_status_.resize(num_layers); + for (size_t i = 0; i < num_layers; i++) { + edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); + } + sampled_edges_.resize(partitioned_graph_->sizeEdges()); + // this is to hold the degree of a sampled graph considering all hosts; yes, + // memory wise this is slightly problematic possibly, but each layer is its + // own subgraph + if (!choose_all) { + sampled_out_degrees_.resize(num_layers); + for (galois::LargeArray& array : sampled_out_degrees_) { + array.create(partitioned_graph_->size()); + } + } else { + subgraph_choose_all_ = true; + } + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + master_offset_accum_.resize(num_layers + 1); + mirror_offset_accum_.resize(num_layers + 1); + sample_master_offsets_.resize(num_layers + 1, 0); + sample_mirror_offsets_.resize(num_layers + 1, 0); + } ////////////////////////////////////////////////////////////////////////////// // Out Edges @@ -169,7 +277,7 @@ class GNNGraph { }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> edges(GraphNode N) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->edges(N); @@ -247,7 +355,7 @@ class GNNGraph { } } galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> in_edges(GraphNode N) const { if (!use_subgraph_ && !use_subgraph_view_) { return partitioned_graph_->in_edges(N); @@ -306,21 +414,364 @@ class GNNGraph { size_t SetupNeighborhoodSample() { return SetupNeighborhoodSample(GNNPhase::kTrain); } - size_t SetupNeighborhoodSample(GNNPhase seed_phase); + size_t SetupNeighborhoodSample(GNNPhase seed_phase) { + DisableSubgraph(); + + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + bitset_sample_flag_.ParallelReset(); + definitely_sampled_nodes_.ParallelReset(); + + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const NodeIterator& x) { + if (IsValidForPhase(*x, seed_phase)) { + SetSampledNode(*x); + bitset_sample_flag_.set(*x); + definitely_sampled_nodes_.set(*x); + } else { + UnsetSampledNode(*x); + } + }, + galois::loopname("InitialSeedSetting")); + // unsets nodes set in previous iterations; for some reason they get + // synchronized along with everything else even though bitset sample flag + // should prevent it (that, or it's because they don't get sync'd that they + // remain the same) + galois::do_all(galois::iterate(end_owned(), end()), + [&](const NodeIterator& x) { UnsetSampledNode(*x); }); + + // clear node timestamps + galois::StatTimer fill_time("ClearFillTime"); + fill_time.start(); + galois::ParallelSTL::fill(sample_node_timestamps_.begin(), + sample_node_timestamps_.end(), + std::numeric_limits::max()); + galois::ParallelSTL::fill(sample_master_offsets_.begin(), + sample_master_offsets_.end(), 0); + galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), + sample_mirror_offsets_.end(), 0); + fill_time.stop(); + + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + 
master_offset_accum_[i].reset(); + mirror_offset_accum_[i].reset(); + } + + // clear all sampled edges + galois::StatTimer ctime("ClearSampleEdges"); + ctime.start(); + for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { + edge_layer.ParallelReset(); + } + ctime.stop(); + // galois::do_all( + // galois::iterate(edge_sample_status_.begin(), + // edge_sample_status_.end()), + // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, + // galois::loopname("ClearSampleEdges")); + + sampled_edges_.ParallelReset(); + + // reset all degrees + if (!subgraph_choose_all_) { + galois::StatTimer cad_timer("ClearAllDegrees"); + cad_timer.start(); + for (galois::LargeArray& array : sampled_out_degrees_) { + galois::ParallelSTL::fill(array.begin(), array.end(), 0); + } + cad_timer.stop(); + } + + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } + bitset_sampled_degrees_.reset(); + + // Seed nodes sync + SampleNodeSync("SeedNodeSample"); + + galois::GAccumulator local_seed_count; + local_seed_count.reset(); + galois::GAccumulator master_offset; + master_offset.reset(); + galois::GAccumulator mirror_offset; + mirror_offset.reset(); + // count # of seed nodes + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + if (*x < *end_owned()) { + master_offset += 1; + } else { + // mirror + mirror_offset += 1; + } + + // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); + local_seed_count += 1; + // 0 = seed node + sample_node_timestamps_[*x] = 0; + } + }, + galois::loopname("SeedNodeOffsetCounting")); + + sample_master_offsets_[0] = master_offset.reduce(); + sample_mirror_offsets_[0] = mirror_offset.reduce(); + + return local_seed_count.reduce(); + } //! 
Choose all edges from sampled nodes size_t SampleAllEdges(size_t agg_layer_num, bool inductive_subgraph, - size_t timestamp); + size_t timestamp) { + DisableSubgraph(); + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& src_iter) { + // only operate on if sampled + if (IsInSampledGraph(src_iter)) { + // marks ALL edges of nodes that connect to train/other nodes + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + // total += 1; + if (inductive_subgraph) { + if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; + } + } + + MakeEdgeSampled(edge_iter, agg_layer_num); + uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); + if (!IsInSampledGraph(dest)) { + bitset_sample_flag_.set(dest); + } + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(dest); + } + } + }, + galois::steal(), galois::loopname("ChooseAllEdges")); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, + galois::loopname("NeighborhoodSampleSet")); + + SampleNodeSync("SampleFlag"); + + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == + std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + EnableSubgraphChooseAll(); + return local_sample_count.reduce(); + } + //! 
Sample neighbors of nodes that are marked as ready for sampling size_t SampleEdges(size_t sample_layer_num, size_t num_to_sample, - bool inductive_subgraph, size_t timestamp); + bool inductive_subgraph, size_t timestamp) { + use_subgraph_ = false; + use_subgraph_view_ = false; + + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& src_iter) { + // only operate on if sampled + if (IsInSampledGraph(src_iter)) { + // chance of not uniformly choosing an edge of this node + // num_to_sample times (degree norm is 1 / degree) + double probability_of_reject; + if (!inductive_subgraph) { + probability_of_reject = + std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); + } else { + probability_of_reject = std::pow( + 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); + } + + // loop through edges, turn "on" edge with some probability + for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { + if (sample_rng_.DoBernoulli(probability_of_reject)) { + if (inductive_subgraph) { + // only take if node is training node or a node not classified + // into train/test/val + if (!IsValidForPhase( + partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kTrain) && + !IsValidForPhase( + partitioned_graph_->getEdgeDst(edge_iter), + GNNPhase::kOther)) { + continue; + } + } + + uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); + // if here, it means edge accepted; set sampled on, mark + // as part of next set + MakeEdgeSampled(edge_iter, sample_layer_num); + if (!IsInSampledGraph(edge_dst)) { + bitset_sample_flag_.set(edge_dst); + } + bitset_sampled_degrees_.set(*src_iter); + definitely_sampled_nodes_.set(*src_iter); + definitely_sampled_nodes_.set(edge_dst); + // degree increment + sampled_out_degrees_[sample_layer_num][*src_iter]++; + } + } + } + }, + galois::steal(), galois::loopname("NeighborhoodSample")); + + // update nodes, then communicate update to all hosts so that they can + // continue the exploration + galois::do_all( + galois::iterate(size_t{0}, bitset_sample_flag_.size()), + [&](uint32_t new_node_id) { + if (bitset_sample_flag_.test(new_node_id)) { + SetSampledNode(new_node_id); + } + }, + galois::loopname("NeighborhoodSampleSet")); + + // why not read source? even if it doesn't need to sample anything, it needs + // to know that it's active so that subgraph construction can proceed + // correctly + SampleNodeSync("SampleFlag"); + + // count sampled node size + galois::GAccumulator local_sample_count; + local_sample_count.reset(); + // count # of seed nodes + galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { + if (IsInSampledGraph(x)) { + local_sample_count += 1; + if (sample_node_timestamps_[*x] == + std::numeric_limits::max()) { + if (x < end_owned()) { + // owned nodes that are activated on other hosts shoudl always + // be activated because it's responsible for keeping others in + // sync during comms; ignoring it = bad + // TODO(gluon) make it so you don't have to deal with this + // and just use host as a reducer point + definitely_sampled_nodes_.set(*x); + } + sample_node_timestamps_[*x] = timestamp; + } + } + }); + + DisableSubgraphChooseAll(); + return local_sample_count.reduce(); + } std::vector ConstructSampledSubgraph(size_t num_sampled_layers) { return ConstructSampledSubgraph(num_sampled_layers, false); }; //! 
Construct the subgraph from sampled edges and corresponding nodes std::vector ConstructSampledSubgraph(size_t num_sampled_layers, - bool use_view); + bool use_view) { + // false first so that the build process can use functions to access the + // real graph + DisableSubgraph(); + + gnn_sampled_out_degrees_ = &sampled_out_degrees_; + + // first, sync the degres of the sampled edges across all hosts + // read any because destinations need it to for reverse phase + if (use_timer_) { + sync_substrate_->template sync< + writeSource, readAny, SubgraphDegreeSync, SubgraphDegreeBitset>( + "SubgraphDegree"); + } else { + sync_substrate_->template sync< + writeSource, readAny, SubgraphDegreeSync, SubgraphDegreeBitset>( + "Ignore"); + } + + galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); + offsets_n_rows_time.start(); + galois::do_all( + galois::iterate(begin(), end()), + [&](const NodeIterator& x) { + if (IsActiveInSubgraph(*x)) { + if (sample_node_timestamps_[*x] != + std::numeric_limits::max()) { + if (*x < *end_owned()) { + // master + master_offset_accum_[sample_node_timestamps_[*x]] += 1; + } else { + // mirror + mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; + } + } else { + GALOIS_LOG_FATAL( + "should have been timestamped at some point if active"); + } + } + }, + galois::loopname("MasterMirrorOffset")); + + std::vector new_rows(master_offset_accum_.size()); + for (unsigned i = 0; i < master_offset_accum_.size(); i++) { + sample_master_offsets_[i] = master_offset_accum_[i].reduce(); + sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); + new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; + if (i > 0) { + new_rows[i] += new_rows[i - 1]; + } + } + + offsets_n_rows_time.stop(); + + if (!use_view) { + subgraph_->BuildSubgraph(*this, num_sampled_layers); + } else { + // a view only has lid<->sid mappings + subgraph_->BuildSubgraphView(*this, num_sampled_layers); + } + + sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), + use_timer_); + + // after this, this graph is a subgraph + if (!use_view) { + use_subgraph_ = true; + } else { + use_subgraph_view_ = true; + } + + return new_rows; + } unsigned SampleNodeTimestamp(unsigned lid) const { return sample_node_timestamps_[lid]; @@ -410,7 +861,23 @@ class GNNGraph { //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - size_t PrepareNextTrainMinibatch(); + size_t PrepareNextTrainMinibatch() { + train_batcher_->GetNextMinibatch(&local_minibatch_mask_); +#ifndef NDEBUG + size_t count = 0; + // galois::gPrint("Minibatch : "); + for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { + if (local_minibatch_mask_[i]) { + // galois::gPrint(partitioned_graph_->getGID(i), ","); + count++; + } + } + // galois::gPrint("\n"); + galois::gInfo(host_prefix(), "Batched nodes ", count); +#endif + return SetupNeighborhoodSample(GNNPhase::kBatch); + } + // Used with distributed minibatch tracker // size_t PrepareNextTrainMinibatch(size_t num_to_get) { // train_batcher_->GetNextMinibatch(&local_minibatch_mask_, num_to_get); @@ -419,6 +886,169 @@ class GNNGraph { //! 
Returns true if there are still more minibatches in this graph bool MoreTrainMinibatches() { return !train_batcher_->NoMoreMinibatches(); }; + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructFeatureBy2HopAggregation() { + galois::StatTimer timer("ConstructFeatureBy2HopAggregation"); + if (this->use_timer_) { + timer.start(); + } + + // TODO(hc): This constant is from SHAD implementation. + // This will be an user parameter for general/flexible support. + + // The first 15 floats are for the current node feature, + // and the another 15 floats are for the aggregated neighbor's node feature. + // These two 15-dimension features are concateneated to a single feature + // for each node. + this->node_feature_length_ = 30; + this->local_node_features_.resize( + this->partitioned_graph_->size() * this->node_feature_length_, 0.f); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + this->ConstructFeatureBy2HopAggregationGPU(); + } else { +#endif + this->ConstructFeatureBy2HopAggregationCPU(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + if (this->use_timer_) { + timer.stop(); + } + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructFeatureBy2HopAggregation() {} + + void ConstructFeatureBy2HopAggregationGPU() { + // TODO(hc): This might not be used in the future. + // This might be renamed to use "PANDO" instead of "GPU". + // For now, just following the existing code format. + GALOIS_LOG_FATAL( + "ConstructFeatureBy2HopAggregationGPU() is not supported."); + } + + void ConstructFeatureBy2HopAggregationCPU() { + galois::gInfo("Construct an initial feature on CPU by " + "aggregating and concatenating neighbors' features."); + //this->PrintFeatures("0hop"); + // this->FillTestNodeType(); + //this->PrintGraphTopo("before"); + this->Construct1HopFeatureCPU(); + //this->PrintFeatures("1hop"); + this->Construct2HopFeatureCPU(); + this->PrintFeatures("2hop"); + } + + void PrintFeatures(std::string postfix) { + // XXX(hc): Printing code for correctness check. + auto& net = galois::runtime::getSystemNetworkInterface(); + unsigned host_id = net.ID; + std::ofstream fp(postfix + "." + std::to_string(host_id) + ".feat"); + for (size_t lid = 0; lid < this->partitioned_graph_->size(); ++lid) { + /* + size_t gid = this->partitioned_graph_->getGID(lid); + fp << "src:" << gid << ", " << + this->partitioned_graph_->getData(lid).type << ", " << + this->partitioned_graph_->getData(lid).key << "\n"; + for (size_t i = 0; i < this->node_feature_length_; ++i) { + fp << "\t [" << i << "] = " << + this->local_node_features_[lid * this->node_feature_length_ + i] + << "\n"; + } + */ + fp << this->partitioned_graph_->getData(lid).key; + for (size_t i = 0; i < this->node_feature_length_; ++i) { + fp << "," + << this->local_node_features_[lid * this->node_feature_length_ + i]; + } + fp << "\n"; + } + fp.close(); + } + + /// Construct feature from 1-hop neighbors. + /// This method traverses 1-hop outgoing neighbors from each vertex + /// and constructs a histogram of the outgoing edge type and + /// the outgoing neighbor type. + void Construct1HopFeatureCPU() { + auto& graph = *(this->partitioned_graph_); + // Aggregate adjacent node and edge types and construct + // an intermediate feature. 
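+    // Illustrative example (hypothetical values, not from any dataset): a
+    // vertex with three out-neighbors of types {2, 2, 5}, all reached over
+    // 0-typed edges, accumulates
+    //   feature[0] += 3                      (one increment per 0-typed edge)
+    //   feature[2] += 2, feature[5] += 1     (out-neighbor type histogram)
+    // in the first half (slots 0..14) of its 30-float vector;
+    // Construct2HopFeatureCPU() then fills the second half (slots 15..29)
+    // with the element-wise sum of its out-neighbors' first halves.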
+ galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t src_lid) { + bitset_graph_aggregate.set(src_lid); + for (auto edge_iter = graph.edge_begin(src_lid); + edge_iter < graph.edge_end(src_lid); ++edge_iter) { + size_t dst_lid = graph.getEdgeDst(edge_iter); + uint32_t dst_type = graph.getData(dst_lid).type; + uint64_t edge_type = graph.getEdgeData(edge_iter); + // Aggregate out neighbors' types. + ++this->local_node_features_[this->node_feature_length_ * src_lid + + dst_type]; + // TODO(hc): Assume that edge type is always 0. + // So, the 0th feature value of a node should be + // (degree of the node + sum of type-0 neighbors). + ++this->local_node_features_[this->node_feature_length_ * src_lid + + edge_type]; + } + }, + galois::steal(), galois::loopname("Construct1HopFeatureCPU")); + + gnn_matrix_to_sync_ = this->local_node_features_.data(); + gnn_matrix_to_sync_column_length_ = this->node_feature_length_; + // All the source vertices reduce and update proxies' data + // and both the source and destination vertices set those + // updated data to their data. + sync_substrate_->template sync, + Bitset_graph_aggregate>( + "GraphAggregateSync"); + } + + + /// Construct feature from 2-hop neighbors. + /// After `Construct1HopFeatureCPU()`, each vertex aggregates types of + /// the outgoing edges and neighbors, and constructs a histogram for + /// its feature. Now, in this method, each vertex aggregates those + /// histograms from outgoing neighbors and constructs a new histogram. + /// Then, each vertex appends this new histogram to the old histogram + /// as its feature. + void Construct2HopFeatureCPU() { + auto& graph = *(this->partitioned_graph_); + // Aggregate neighbor nodes' features and append (concatenate) it to the + // current node feature. So the first half is the current node and + // the next half is the aggregated node feature. + galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t src_lid) { + // Offset for the second part of the source node feature. + size_t src_foffset = this->node_feature_length_ * src_lid + + this->node_feature_length_ / 2; + bitset_graph_aggregate.set(src_lid); + for (auto edge_iter = graph.edge_begin(src_lid); + edge_iter < graph.edge_end(src_lid); ++edge_iter) { + size_t dst_lid = graph.getEdgeDst(edge_iter); + // Offset for the first part of the destination node feature. + size_t dst_foffset = this->node_feature_length_ * dst_lid; + for (size_t fid = 0; fid < this->node_feature_length_ / 2; ++fid) { + // Aggregate outgoing neighbors' features and, + // construct and append a new histogram to the old one. + this->local_node_features_[src_foffset + fid] += + this->local_node_features_[dst_foffset + fid]; + } + } + }, + galois::steal(), galois::loopname("Construct2HopFeatureCPU")); + this->SHADFeatureAggregateSync(this->local_node_features_.data(), + this->node_feature_length_); + } + ////////////////////////////////////////////////////////////////////////////// void SetupTestBatcher(size_t test_batch_size) { @@ -433,7 +1063,11 @@ class GNNGraph { void ResetTestMinibatcher() { test_batcher_->ResetMinibatchState(); } //! Setup the state for the next minibatch sampling call by using the //! minibatcher to pick up the next set batch of nodes - size_t PrepareNextTestMinibatch(); + size_t PrepareNextTestMinibatch() { + test_batcher_->GetNextMinibatch(&local_minibatch_mask_); + return SetupNeighborhoodSample(GNNPhase::kBatch); + } + //! 
Returns true if there are still more minibatches in this graph bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; @@ -487,12 +1121,47 @@ class GNNGraph { // Get accuracy: sampling is by default false float GetGlobalAccuracy(PointerWithSize predictions, - GNNPhase phase); + GNNPhase phase) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracy(predictions, phase, false); + } + float GetGlobalAccuracy(PointerWithSize predictions, GNNPhase phase, - bool sampling); + bool sampling) { + // No GPU version yet, but this is where it would be + return GetGlobalAccuracyCPU(predictions, phase, sampling); + } std::pair - GetBatchAccuracy(PointerWithSize predictions); + GetBatchAccuracy(PointerWithSize predictions) { + // check owned nodes' accuracy + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + [&](const unsigned node_id) { + if (IsValidForPhase(node_id, GNNPhase::kBatch)) { + total_checked_ += 1; + size_t predicted_label = + galois::MaxIndex(num_label_classes_, + &(predictions[node_id * num_label_classes_])); + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal(), galois::loopname("GlobalAccuracy")); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + return std::make_pair(global_correct, global_checked); + } //! Returns the ground truth label of some local id assuming labels are single //! class labels. @@ -561,6 +1230,49 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// + //! @brief Variant of the plain feature aggregation. + //! @detail This is a variant version of the dense feature aggregation + //! that follows SHAD GNN feature construction. This aggregates features of + //! the neighbor vertices that are from (vertex's feature offset + + //! 1/2 * feature length) to (vertex's feature offset + feature length), + //! to (vertex's feature offset) of the current vertex, from its proxies. + //! + //! @param matrix_to_sync Float pointer pointing to features of the target + //! vertex + //! @param matrix_column_size Feature length to calculate a base offset of + //! each vertex + void SHADFeatureAggregateSync(GNNFloat* matrix_to_sync, + const size_t matrix_column_size) const { + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + + // set globals for the sync substrate + if (use_timer_) { + sync_substrate_->template sync< + writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( + "SHADGraphAggregateSync"); + } else { + sync_substrate_->template sync< + writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( + "Ignore"); + } + } + + void SampleNodeSync(std::string stat_str) { + sampled_nodes_ = &(this->mark_sampled_nodes_); + + // set globals for the sync substrate + if (use_timer_) { + sync_substrate_->template sync, SampleFlagBitset>( + stat_str); + } else { + sync_substrate_->template sync, SampleFlagBitset>( + "Ignore"); + } + } + // TODO(loc) Should not be a default version of this to avoid potential // issues later void AggregateSync(GNNFloat* matrix_to_sync, @@ -575,7 +1287,58 @@ class GNNGraph { //! Note that it's const because the only thing being used is the graph //! 
topology of this object; the thing modified is the passed in matrix void AggregateSync(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - bool is_backward, uint32_t active_row_boundary) const; + bool is_backward, uint32_t active_row_boundary) const { + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + subgraph_size_ = active_size(); + num_active_layer_rows_ = active_row_boundary; + + if (!use_subgraph_ && !use_subgraph_view_) { + // set globals for the sync substrate + if (!is_backward) { + if (use_timer_) { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("GraphAggregateSync"); + } else { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); + } + } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>( + "BackwardGraphAggregateSync"); + clubbed_timer.stop(); + } + } else { + // setup the SID to LID map for the sync substrate to use (SID != LID) + gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); + + if (!is_backward) { + if (use_timer_) { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("GraphAggregateSync"); + } else { + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); + } + } else { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + clubbed_timer.start(); + sync_substrate_ + ->template sync, Bitset_graph_aggregate>( + "BackwardGraphAggregateSync"); + clubbed_timer.stop(); + } + } + } ////////////////////////////////////////////////////////////////////////////// // Sampling related @@ -613,17 +1376,87 @@ class GNNGraph { } //! Calculate norm factor considering the entire graph - void CalculateFullNormFactor(); + void CalculateFullNormFactor() { + // TODO(loc) reset all degrees if this is called multiple times? 
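+    // These totals back GetGlobalDegreeNorm()/GetGlobalTrainDegreeNorm()
+    // (degree norm = 1 / degree); global_train_degrees_ only counts edges
+    // whose destination is a train/other vertex, which the inductive
+    // sampling path uses. In SampleEdges() the chance that a given edge is
+    // never picked across num_to_sample uniform draws is
+    // (1 - 1/degree)^num_to_sample, e.g. degree 4 with a budget of 2 gives
+    // (3/4)^2 ~= 0.56 as the per-edge rejection probability.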
+ // get the norm factor contribution for each node based on the GLOBAL graph + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size()), + [&](size_t src) { + for (auto edge_iter = partitioned_graph_->edge_begin(src); + edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { + // count degrees for all + train/other + size_t dest = GetEdgeDest(edge_iter); + if (IsValidForPhase(dest, GNNPhase::kTrain) || + IsValidForPhase(dest, GNNPhase::kOther)) { + global_train_degrees_[src] += 1; + } + global_degrees_[src] += 1; + } + }, + galois::loopname("CalculateLocalDegrees")); + // degree sync + gnn_degree_vec_1_ = global_train_degrees_.data(); + gnn_degree_vec_2_ = global_degrees_.data(); + sync_substrate_ + ->template sync>( + "InitialDegreeSync"); + } #ifdef GALOIS_ENABLE_GPU void AggregateSyncGPU(GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const; + const unsigned layer_number) const { + size_t layer_input_mtx_column_size = + getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); + size_t layer_output_mtx_column_size = + getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); + // set globals for the sync substrate + gnn_matrix_to_sync_ = matrix_to_sync; + gnn_matrix_to_sync_column_length_ = matrix_column_size; + cuda_ctx_for_sync = cuda_ctx_; + layer_number_to_sync = layer_number; + // TODO bitset setting + // call sync + cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, + size(), layer_number); + + // XXX no timer if use_timer is off + if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { + if (use_timer_) { + sync_substrate_->template sync>( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->template sync>( + "Ignore", gnn_matrix_to_sync_column_length_); + } + } else if (gnn_matrix_to_sync_column_length_ == + layer_output_mtx_column_size) { + if (use_timer_) { + sync_substrate_->template sync>( + "GraphAggregateSync", gnn_matrix_to_sync_column_length_); + } else { + sync_substrate_->template sync>( + "Ignore", gnn_matrix_to_sync_column_length_); + } + } else { + GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" + " match to the column size of the CUDA context"); + } + } void InitLayerVectorMetaObjects(size_t layer_number, unsigned num_hosts, - size_t infl_in_size, size_t infl_out_size); + size_t infl_in_size, size_t infl_out_size) { + init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), + infl_in_size, infl_out_size); + } - void ResizeGPULayerVector(size_t num_layers); + void ResizeGPULayerVector(size_t num_layers) { + resize_CUDA_layer_vector(cuda_ctx_, num_layers); + } const GNNGraphGPUAllocations& GetGPUGraph() const { return gpu_memory_; } @@ -636,7 +1469,64 @@ class GNNGraph { } #endif - void ContiguousRemap(const std::string& new_name); + void ContiguousRemap(const std::string& new_name) { + node_remapping_.resize(partitioned_graph_->size()); + + uint32_t new_node_id = 0; + + // serial loops because new ID needs to be kept consistent + // first, train nodes + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Train nodes are from 0 to ", new_node_id); + + // second, val nodes + uint32_t val_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kValidate)) 
{ + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", + new_node_id - val_start, ")"); + + // third, test nodes + uint32_t test_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kTest)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", + new_node_id - test_start, ")"); + + // last, everything else + uint32_t other_start = new_node_id; + for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); + cur_node++) { + if (IsValidForPhase(cur_node, GNNPhase::kOther)) { + node_remapping_[new_node_id++] = cur_node; + } + } + galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, + "(", new_node_id - other_start, ")"); + GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); + + // save the mapping to a binary file for use by graph convert to deal with + // the gr + std::string label_filename = input_directory_ + new_name + "-mapping.bin"; + std::ofstream label_write_stream; + label_write_stream.open(label_filename, std::ios::binary | std::ios::out); + label_write_stream.write((char*)node_remapping_.data(), + sizeof(uint32_t) * node_remapping_.size()); + label_write_stream.close(); + } void EnableTimers() { use_timer_ = true; @@ -675,46 +1565,909 @@ class GNNGraph { // Initialization ////////////////////////////////////////////////////////////////////////////// - void ReadLocalLabelsBin(const std::string& dataset_name); + //! Partitions a particular dataset given some partitioning scheme + std::unique_ptr LoadPartition( + const std::string& input_directory, const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme, bool useWMD) { + // XXX input path + std::string input_file = input_directory + dataset_name + ".csgr"; + if (useWMD) { + input_file = dataset_name; + } + GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); + + // load partition + switch (partition_scheme) { + case galois::graphs::GNNPartitionScheme::kOEC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + case galois::graphs::GNNPartitionScheme::kCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + case galois::graphs::GNNPartitionScheme::kOCVC: + return galois::cuspPartitionGraph( + input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", + false, 1); + default: + GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); + return nullptr; + } + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructLocalLabels() { + GALOIS_LOG_VERBOSE("[{}] Constructing labels from disk...", host_id_); + auto& graph = *(this->partitioned_graph_); + // For WMD graph, we always assume a single class label. + // allocate memory for labels + // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(graph.size()); + // In WMD graphs, a vertex class is a vertex type. + // As the vertex type is already materialized in a vertex data, + // iterate a graph and extract that. 
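+    // Note that the raw `type` value is stored directly as the class id
+    // while num_label_classes_ only counts distinct types, so this path
+    // implicitly assumes vertex types are already dense in
+    // [0, num_label_classes_); e.g. types {0, 1, 2} line up with the
+    // prediction indices used by the accuracy code, whereas sparse ids such
+    // as {0, 3, 7} would not.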
+ // TODO(hc): Using concurrent set using a finer-grained lock + // is better + std::mutex label_class_set_mtx; + std::unordered_set label_class_set; + galois::do_all( + galois::iterate(size_t{0}, graph.size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = graph.getData(lid).type; + label_class_set_mtx.lock(); + auto found = label_class_set.find(local_ground_truth_labels_[lid]); + if (found == label_class_set.end()) { + label_class_set.emplace(local_ground_truth_labels_[lid]); + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); + + // Exchange found local vertex classes with other hosts to + // calculate the total number of the classes. + // + // Serialize the label class set to a vector to serialize this data + // to galois::runtime::SendBuffer. The current libdist does not + // support std::set and std::unordered_set de/serialization. + // TODO(hc): support this type of serialization. + std::vector label_vec(label_class_set.begin(), label_class_set.end()); + auto &net = galois::runtime::getSystemNetworkInterface(); + for (uint32_t h = 0; h < net.Num; ++h) { + if (h == net.ID) { continue; } + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, label_vec); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + for (uint32_t h = 0; h < net.Num - 1; ++h) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + + std::vector h_label_vec; + galois::runtime::gDeserialize(p->second, h_label_vec); + galois::do_all(galois::iterate(h_label_vec), + [&](int i) { + label_class_set_mtx.lock(); + auto found = label_class_set.find(i); + if (found == label_class_set.end()) { + label_class_set.emplace(i); + // Increaes the number of classes only if + // it was not found in the local host. 
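+                     // e.g. if this host found {0, 3} locally and the
+                     // remote host sends {3, 7}, only 7 is new here, so
+                     // every host ends up with num_label_classes_ == 3
+                     // (the size of the union) once all exchanges finish.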
+ ++num_label_classes_; + } + label_class_set_mtx.unlock(); + } ); + } + increment_evilPhase(); + } + + template < + typename T = VTy, + typename std::enable_if_t>* = nullptr> + void ConstructLocalLabels() {} + + void ReadLocalLabelsBin(const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + + std::ifstream file_stream; + file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", + std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + file_stream.close(); + + std::string filename = input_directory_ + dataset_name + "-labels.bin"; + std::ifstream file_stream_bin; + file_stream_bin.open(filename, std::ios::binary | std::ios::in); + + std::vector all_labels(num_nodes); + // read all labels into a vector + file_stream_bin.read((char*)all_labels.data(), + sizeof(GNNLabel) * num_nodes); + + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + + galois::GAccumulator found_local_vertices; + found_local_vertices.reset(); + + // save only local ones; can do in parallel as well + // assumes -1 already dealt with + galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t lid) { + local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; + found_local_vertices += 1; + }); + + size_t fli = found_local_vertices.reduce(); + galois::gInfo(host_prefix_, "Read ", fli, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); + } + //! Read labels of local nodes only void ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label); + bool has_single_class_label) { + GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); + std::string filename; + if (has_single_class_label) { + filename = input_directory_ + dataset_name + "-labels.txt"; + } else { + filename = input_directory_ + dataset_name + "-mlabels.txt"; + } + + // read file header, save num label classes while at it + std::ifstream file_stream; + file_stream.open(filename, std::ios::in); + size_t num_nodes; + file_stream >> num_nodes >> num_label_classes_ >> std::ws; + assert(num_nodes == partitioned_graph_->globalSize()); + if (host_id_ == 0) { + galois::gInfo("Number of label classes is ", num_label_classes_); + } + + // allocate memory for labels + if (has_single_class_label) { + // single-class (one-hot) label for each vertex: N x 1 + using_single_class_labels_ = true; + local_ground_truth_labels_.resize(partitioned_graph_->size()); + } else { + // multi-class label for each vertex: N x num classes + using_single_class_labels_ = false; + local_ground_truth_labels_.resize(partitioned_graph_->size() * + num_label_classes_); + } + + size_t cur_gid = 0; + size_t found_local_vertices = 0; + // each line contains a set of 0s and 1s + std::string read_line; + + // loop through all labels of the graph + while (std::getline(file_stream, read_line)) { + // only process label if this node is local + if (partitioned_graph_->isLocal(cur_gid)) { + uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); + // read line as bitset of 0s and 1s + std::istringstream label_stream(read_line); + int cur_bit; + // bitset size is # of label classes + for (size_t cur_class = 0; cur_class < num_label_classes_; + ++cur_class) { + // read 
a bit + label_stream >> cur_bit; + + if (has_single_class_label) { + // no label + if (cur_bit == -1) { + local_ground_truth_labels_[cur_lid] = num_label_classes_; + break; + } + + // in single class, only 1 bit is set in bitset; that represents the + // class to take + if (cur_bit != 0) { + // set class and break (assumption is that's the only bit that is + // set) + local_ground_truth_labels_[cur_lid] = cur_class; + break; + } + } else { + // else the entire bitset needs to be copied over to the label array + // TODO this can possibly be saved all at once rather than bit by + // bit? + local_ground_truth_labels_[cur_lid * num_label_classes_ + + cur_class] = cur_bit; + } + } + found_local_vertices++; + } + // always increment cur_gid + cur_gid++; + } + + file_stream.close(); + + galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", + local_ground_truth_labels_.size() * double{4} / (1 << 30), + " GB)"); + GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); + } + //! Read features of local nodes only - void ReadLocalFeatures(const std::string& dataset_str); + void ReadLocalFeatures(const std::string& dataset_name) { + GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); + + // read in dimensions of features, specifically node feature length + size_t num_global_vertices; + + std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; + std::ifstream ifs; + ifs.open(file_dims, std::ios::in); + ifs >> num_global_vertices >> node_feature_length_; + ifs.close(); + + GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); + GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, + node_feature_length_); + + // memory for all features of all nodes in graph + // TODO read features without loading entire feature file into memory; this + // is quite inefficient + std::unique_ptr full_feature_set = std::make_unique( + num_global_vertices * node_feature_length_); + + // read in all features + std::ifstream file_stream; + std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; + file_stream.open(feature_file, std::ios::binary | std::ios::in); + file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * + num_global_vertices * + node_feature_length_); + file_stream.close(); + + // allocate memory for local features + local_node_features_.resize(partitioned_graph_->size() * + node_feature_length_); + + // copy over features for local nodes only + galois::GAccumulator num_kept_vertices; + num_kept_vertices.reset(); + galois::do_all( + galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { + if (partitioned_graph_->isLocal(gid)) { + // copy over feature vector + std::copy(full_feature_set.get() + gid * node_feature_length_, + full_feature_set.get() + (gid + 1) * node_feature_length_, + &local_node_features_[partitioned_graph_->getLID(gid) * + node_feature_length_]); + num_kept_vertices += 1; + } + }); + full_feature_set.reset(); + + galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), + " features (", + local_node_features_.size() * double{4} / (1 << 30), " GB)"); + GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); + } + //! Helper function to read masks from file into the appropriate structures //! 
given a name, mask type, and arrays to save into size_t ReadLocalMasksFromFile(const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, std::vector* masks); + GNNRange* mask_range, + std::vector* masks) { + size_t range_begin; + size_t range_end; + + // read mask range + std::string mask_filename = + input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; + bool train_is_on = false; + if (mask_type == "train") { + train_is_on = true; + } + + std::ifstream mask_stream; + mask_stream.open(mask_filename, std::ios::in); + mask_stream >> range_begin >> range_end >> std::ws; + GALOIS_LOG_ASSERT(range_begin <= range_end); + + // set the range object + mask_range->begin = range_begin; + mask_range->end = range_end; + mask_range->size = range_end - range_begin; + + size_t cur_line_num = 0; + // valid nodes on this host + size_t local_sample_count = 0; + // this tracks TOTAL # of valid nodes in this group (not necessarily valid + // ones on this host) + size_t valid_count = 0; + std::string line; + // each line is a number signifying if mask is set for the vertex + while (std::getline(mask_stream, line)) { + std::istringstream mask_stream(line); + // only examine vertices/lines in range + if (cur_line_num >= range_begin && cur_line_num < range_end) { + unsigned mask = 0; + mask_stream >> mask; + if (mask == 1) { + valid_count++; + if (partitioned_graph_->isLocal(cur_line_num)) { + (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; + local_sample_count++; + } + if (train_is_on) { + global_training_mask_[cur_line_num] = 1; + } + } + } + cur_line_num++; + } + mask_stream.close(); + + if (train_is_on) { + global_training_count_ = valid_count; + } + + if (valid_count != mask_range->size) { + // overlapping masks: need to actually check the masks rather than use + // ranges + if (!incomplete_masks_) { + galois::gInfo( + "Masks are not contained in range: must actually check mask"); + } + incomplete_masks_ = true; + } + + return valid_count; + } + //! Finds nodes that aren't part of the 3 main GNN phase classifications - size_t FindOtherMask(); + size_t FindOtherMask() { + galois::GAccumulator other_accum; + other_accum.reset(); + other_mask_.resize(partitioned_graph_->size()); + + galois::do_all( + galois::iterate(size_t{0}, partitioned_graph_->size()), + [&](size_t local_id) { + if (!IsValidForPhase(local_id, GNNPhase::kTrain) && + !IsValidForPhase(local_id, GNNPhase::kValidate) && + !IsValidForPhase(local_id, GNNPhase::kTest)) { + other_mask_[local_id] = 1; + other_accum += 1; + } + }, + galois::loopname("FindOtherMask")); + return other_accum.reduce(); + } + + //! @brief Choose and set local training/validation/testing vertices + //! consecutively. 
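+  //! For example, with globalSize() == 1000 this marks vertices [0, 250) as
+  //! training, [250, 375) as testing and [375, 500) as validation; the
+  //! remaining vertices are left unmasked.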
+ void SetLocalMasksConsecutively() { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + global_training_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_testing_mask_range_ = { + .begin = global_training_count_, + .end = global_training_count_ + global_testing_count, + .size = global_testing_count + }; + global_validation_mask_range_ = { + .begin = global_training_count_ + global_testing_count, + .end = global_training_count_ + 2 * global_testing_count, + .size = global_testing_count + }; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } + + //! @brief Randomly choose and set local training/validation/testing + //! vertices. This mimics what AGILE GNN does through Pytorch + //! `DistributedRandomSampler`. + void DistributedRandomSampling( + size_t local_sample_size, std::vector* masks) { + // Pytorch's DistributedRandomSampler, + // first materializes an array populated with + // 0 to (num_local_vertices - 1), shuffles this array, and + // extracts 0 to (num_local_shuffle - 1) vertices. + // This method mimics this operation. + // Like Pytorch, all the hosts use the same seed, and so, + // deterministically choose each type of vertices for not only + // the current host, but also others, and mark vertices to + // the corresponding mask array if they are locals. + auto& net = galois::runtime::getSystemNetworkInterface(); + std::vector< + std::pair> num_masters_per_hosts(net.Num); + std::pair master_ranges = + { partitioned_graph_->getGID(0), + partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1) }; + // 1) Exchange node master ranges, and so, each host knows + // the range of vertex sampling. + for (uint32_t h = 0; h < net.Num; ++h) { + if (h == net.ID) { continue; } + galois::runtime::SendBuffer b; + galois::runtime::gSerialize(b, master_ranges); + net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); + } + net.flush(); + for (uint32_t h = 0; h < net.Num - 1; ++h) { + decltype(net.recieveTagged(galois::runtime::evilPhase)) p; + do { + p = net.recieveTagged(galois::runtime::evilPhase); + } while (!p); + + galois::runtime::gDeserialize(p->second, + num_masters_per_hosts[p->first]); + } + increment_evilPhase(); + + // 2) Sample vertices and mark them to the `masks` array + // if a vertex is local. + for (uint32_t h = 0; h < net.Num; ++h) { + size_t h_begin = (h == net.ID)? master_ranges.first : num_masters_per_hosts[h].first; + size_t h_end = (h == net.ID)? 
master_ranges.second : num_masters_per_hosts[h].second; + std::vector h_all_indices(h_end - h_begin); + // Fill global vertex ids to h_global_ids. + galois::do_all(galois::iterate(h_begin, h_end), + [&](size_t i) { + h_all_indices[i - h_begin] = i; + } ); + std::mt19937 rand(0); + std::shuffle(h_all_indices.begin(), h_all_indices.end(), rand); + galois::do_all( + galois::iterate(size_t{0}, local_sample_size), + [&](size_t i) { + // First, it doens't have duplications. + // Second, only mark `masks` if the checking vertex is a local + // vertex. + if (partitioned_graph_->isLocal(h_all_indices[i])) { + (*masks)[partitioned_graph_->getLID(h_all_indices[i])] = 1; + } + } ); + } + } + + void SetLocalMasksRandomly() { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + auto& net = galois::runtime::getSystemNetworkInterface(); + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + size_t num_local_training_samples = global_training_count_ / net.Num; + size_t num_local_testing_samples = global_testing_count / net.Num; + size_t num_local_validating_samples = num_local_testing_samples; + global_training_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_testing_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_validation_mask_range_ = { + .begin = 0, .end = global_training_count_, .size = global_training_count_}; + + incomplete_masks_ = true; + DistributedRandomSampling( + num_local_training_samples, &local_training_mask_); + DistributedRandomSampling( + num_local_testing_samples, &local_testing_mask_); + DistributedRandomSampling( + num_local_validating_samples, &local_validation_mask_); + } + //! Read masks of local nodes only for training, validation, and testing - void ReadLocalMasks(const std::string& dataset_name); - //! Reads the entire graph topology in (but nothing else) - void ReadWholeGraph(const std::string& dataset_name); + void ReadLocalMasks(const std::string& dataset_name) { + // allocate the memory for the local masks + global_training_mask_.resize(partitioned_graph_->globalSize()); + local_training_mask_.resize(partitioned_graph_->size()); + local_validation_mask_.resize(partitioned_graph_->size()); + local_testing_mask_.resize(partitioned_graph_->size()); + + if (dataset_name == "reddit") { + global_training_count_ = 153431; + + // TODO reddit is hardcode handled at the moment; better way to not do + // this? 
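+      // These constants match the Reddit split commonly used by
+      // GraphSAGE-style benchmarks (232,965 vertices in total:
+      // 153,431 train / 23,831 val / 55,703 test). Datasets without a
+      // hardcoded split fall through to ReadLocalMasksFromFile(), which
+      // expects <dataset>-train_mask.txt / -val_mask.txt / -test_mask.txt:
+      // a "begin end" header line followed by one 0/1 flag per global vertex
+      // (lines outside the range are ignored). A toy 5-vertex file whose
+      // training set is vertices 1..3 would look like:
+      //   1 4
+      //   0
+      //   1
+      //   1
+      //   1
+      //   0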
+ global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; + global_validation_mask_range_ = { + .begin = 153431, .end = 153431 + 23831, .size = 23831}; + global_testing_mask_range_ = { + .begin = 177262, .end = 177262 + 55703, .size = 55703}; + + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + } else if (dataset_name == "ogbn-papers100M-remap") { + global_training_count_ = 1207178; + + global_training_mask_range_ = { + .begin = 0, .end = 1207178, .size = 1207178}; + global_validation_mask_range_ = { + .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; + global_testing_mask_range_ = { + .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; + // training + for (size_t i = global_training_mask_range_.begin; + i < global_training_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_training_mask_[partitioned_graph_->getLID(i)] = 1; + } + global_training_mask_[i] = 1; + } + // validation + for (size_t i = global_validation_mask_range_.begin; + i < global_validation_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_validation_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + // testing + for (size_t i = global_testing_mask_range_.begin; + i < global_testing_mask_range_.end; i++) { + if (partitioned_graph_->isLocal(i)) { + local_testing_mask_[partitioned_graph_->getLID(i)] = 1; + } + } + valid_other_ = FindOtherMask(); + GALOIS_LOG_ASSERT(valid_other_ <= 109513177); + } else { + size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", + &global_training_mask_range_, + &local_training_mask_); + size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", + &global_validation_mask_range_, + &local_validation_mask_); + size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", + &global_testing_mask_range_, + &local_testing_mask_); + valid_other_ = FindOtherMask(); + // the "other" set of nodes that don't fall into any classification + if (galois::runtime::getSystemNetworkInterface().ID == 0) { + galois::gInfo("Valid # training nodes is ", valid_train); + galois::gInfo("Valid # validation nodes is ", valid_val); + galois::gInfo("Valid # test nodes is ", valid_test); + galois::gInfo("Valid # other nodes is ", valid_other_); + } + } + } + //! Initializes the norm factors using the entire graph's topology for global //! degree access - void InitNormFactor(); + void InitNormFactor() { + GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); + global_degrees_.resize(partitioned_graph_->size(), 0.0); + global_train_degrees_.resize(partitioned_graph_->size(), 0.0); + CalculateFullNormFactor(); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_memory_.InitNormFactor(partitioned_graph_->size()); + } +#endif + } //! Used if ranges for a mask are complete (if in range, it's part of mask). 
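// InitNormFactor above only sizes the degree arrays and defers the actual math to
// CalculateFullNormFactor. For illustration only: a common choice for GCN-style
// aggregation is the symmetric factor 1 / sqrt(deg(u) * deg(v)). The helper below is
// an assumption about that style of normalization, not a statement of what
// CalculateFullNormFactor implements.
#include <cmath>
#include <cstdint>

// Hypothetical helper: scale a message on edge (u, v) by the symmetric norm factor.
inline float SymmetricNormFactor(uint32_t deg_u, uint32_t deg_v) {
  if (deg_u == 0 || deg_v == 0) {
    return 0.0f; // isolated endpoints contribute nothing
  }
  return 1.0f / std::sqrt(static_cast<float>(deg_u) * static_cast<float>(deg_v));
}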
- bool IsValidForPhaseCompleteRange(const unsigned lid, - const galois::GNNPhase current_phase) const; + bool + IsValidForPhaseCompleteRange(const unsigned lid, + const galois::GNNPhase current_phase) const { + // only use ranges if they're complete + // convert to gid first + size_t gid = partitioned_graph_->getGID(lid); + + // select range to use based on phase + const GNNRange* range_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + range_to_use = &global_training_mask_range_; + break; + case GNNPhase::kValidate: + range_to_use = &global_validation_mask_range_; + break; + case GNNPhase::kTest: + range_to_use = &global_testing_mask_range_; + break; + case GNNPhase::kOther: + GALOIS_LOG_FATAL("no range for other"); + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + range_to_use = nullptr; + } + + // if within range, it is valid + // there is an assumption here that ranges are contiguous; may not + // necessarily be the case in all inputs in which case using the mask is + // required (but less cache efficient) + if (range_to_use->begin <= gid && gid < range_to_use->end) { + return true; + } else { + return false; + } + } //! Used if ranges for a mask are incomplete, meaning I actually have to //! check the mask. bool IsValidForPhaseMasked(const unsigned lid, - const galois::GNNPhase current_phase) const; + const galois::GNNPhase current_phase) const { + // select mask to use based on phase + const GNNMask* mask_to_use; + switch (current_phase) { + case GNNPhase::kTrain: + mask_to_use = &local_training_mask_; + break; + case GNNPhase::kValidate: + mask_to_use = &local_validation_mask_; + break; + case GNNPhase::kTest: + mask_to_use = &local_testing_mask_; + break; + case GNNPhase::kOther: + if (valid_other_ == 0) { + return false; + } + mask_to_use = &other_mask_; + break; + case GNNPhase::kBatch: + mask_to_use = &local_minibatch_mask_; + break; + default: + GALOIS_LOG_FATAL("Invalid phase used"); + mask_to_use = nullptr; + } + return (*mask_to_use)[lid]; + } ////////////////////////////////////////////////////////////////////////////// // Accuracy ////////////////////////////////////////////////////////////////////////////// float GetGlobalAccuracyCPU(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool sampling) { + galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); + galois::StatTimer global_accuracy_for_singleclass_timer( + "GetGlobalAccuracyForSingleClass"); + galois::StatTimer global_accuracy_for_multiclass_timer( + "GetGlobalAccuracyForMultiClass"); + global_accuracy_timer.start(); + float accuracy{0}; + if (is_single_class_label()) { + global_accuracy_for_singleclass_timer.start(); + accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + global_accuracy_for_singleclass_timer.stop(); + } else { + global_accuracy_for_multiclass_timer.start(); + accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); + global_accuracy_for_multiclass_timer.stop(); + } + global_accuracy_timer.stop(); + return accuracy; + } + float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool) { + // check owned nodes' accuracy + num_correct_.reset(); + total_checked_.reset(); + + galois::do_all( + // will only loop over sampled nodes if sampling is on + galois::iterate(begin_owned(), end_owned()), + // this is possibly the subgraph id + [&](const unsigned node_id) { + if (IsValidForPhase(node_id, phase)) { + total_checked_ += 1; + // get prediction by getting 
max + // note the use of node_id here: lid only used to check original + // labels + size_t predicted_label = + galois::MaxIndex(num_label_classes_, + &(predictions[node_id * num_label_classes_])); + // check against ground truth and track accordingly + // TODO static cast used here is dangerous + if (predicted_label == + static_cast(GetSingleClassLabel(node_id))) { + num_correct_ += 1; + } + } + }, + // steal on as some threads may have nothing to work on + galois::steal()); + + size_t global_correct = num_correct_.reduce(); + size_t global_checked = total_checked_.reduce(); + + GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, + global_correct, global_checked); + + return static_cast(global_correct) / + static_cast(global_checked); + } + float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, - GNNPhase phase, bool sampling); + GNNPhase phase, bool sampling) { + const GNNLabel* full_ground_truth = GetMultiClassLabel(0); + assert(predictions.size() == (num_label_classes_ * size())); + + size_t global_true_positive = 0; + size_t global_true_negative = 0; + size_t global_false_positive = 0; + size_t global_false_negative = 0; + size_t global_f1_score = 0; + + // per class check + for (size_t label_class = 0; label_class < num_label_classes_; + label_class++) { + local_true_positive_.reset(); + local_true_negative_.reset(); + local_false_positive_.reset(); + local_false_negative_.reset(); + + // loop through all *owned* nodes (do not want to overcount) + galois::do_all( + galois::iterate(begin_owned(), end_owned()), + [&](const unsigned lid) { + if (IsValidForPhase(lid, phase)) { + if (sampling) { + if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { + return; + } + } + + size_t label_index = lid * num_label_classes_ + label_class; + + GNNLabel true_label = full_ground_truth[label_index]; + GNNLabel prediction_is_positive = + (predictions[label_index] > 0.5) ? 1 : 0; + + if (true_label && prediction_is_positive) { + local_true_positive_ += 1; + } else if (true_label && !prediction_is_positive) { + local_false_negative_ += 1; + } else if (!true_label && prediction_is_positive) { + local_false_positive_ += 1; + } else if (!true_label && !prediction_is_positive) { + local_true_negative_ += 1; + } else { + // all cases should be covered with clauses above, so it should + // NEVER get here; adding it here just for sanity purposes + GALOIS_LOG_FATAL( + "Logic error with true label and prediction label"); + } + } + total_checked_ += 1; + }, + galois::steal(), galois::loopname("GlobalMultiAccuracy")); + + // reduce from accumulators across all hosts for this particular class + size_t class_true_positives = local_true_positive_.reduce(); + size_t class_false_positives = local_false_positive_.reduce(); + size_t class_true_negatives = local_true_negative_.reduce(); + size_t class_false_negatives = local_false_negative_.reduce(); + + // add to global counts + global_true_positive += class_true_positives; + global_false_positive += class_false_positives; + global_true_negative += class_true_negatives; + global_false_negative += class_false_negatives; + + // calculate precision, recall, and f1 score for this class + // ternery op used to avoid division by 0 + double class_precision = + (class_true_positives + class_true_negatives) > 0 + ? static_cast(class_true_positives) / + (class_true_positives + class_false_positives) + : 0.0; + double class_recall = + (class_true_positives + class_false_negatives) > 0 + ? 
static_cast(class_true_positives) / + (class_true_positives + class_false_negatives) + : 0.0; + double class_f1_score = (class_precision + class_recall) > 0 + ? (2.0 * (class_precision * class_recall)) / + (class_precision + class_recall) + : 0.0; + + global_f1_score += class_f1_score; + } // end label class loop + + // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, + // global_true_negative, global_false_positive, global_false_negative); + + // double global_f1_macro_score = global_f1_score / num_label_classes_; + + // micro = considers all classes for precision/recall + double global_micro_precision = + (global_true_positive + global_true_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_positive) + : 0.0; + double global_micro_recall = + (global_true_positive + global_false_negative) > 0 + ? static_cast(global_true_positive) / + (global_true_positive + global_false_negative) + : 0.0; + + double global_f1_micro_score = + (global_micro_precision + global_micro_recall) > 0 + ? (2.0 * (global_micro_precision * global_micro_recall)) / + (global_micro_precision + global_micro_recall) + : 0.0; + + return global_f1_micro_score; + } + + void increment_evilPhase() { + ++galois::runtime::evilPhase; + if (galois::runtime::evilPhase >= + static_cast(std::numeric_limits::max())) { + galois::runtime::evilPhase = 1; + } + } ////////////////////////////////////////////////////////////////////////////// // Vars @@ -838,7 +2591,58 @@ class GNNGraph { GNNGraphGPUAllocations gpu_memory_; //! Call this to setup GPU memory for this graph: allocates necessary GPU //! memory and copies things over - void InitGPUMemory(); + void InitGPUMemory() { + // create int casted CSR + uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); + uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); + + // + 1 because first element is 0 in BLAS CSRs + std::vector e_index(partitioned_graph_->size() + 1); + std::vector e_dest(partitioned_graph_->sizeEdges()); + + // set in parallel + galois::do_all( + galois::iterate(static_cast(0), partitioned_graph_->size() + 1), + [&](size_t index) { + if (index != 0) { + if (e_index_ptr[index - 1] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_index_ptr[index - 1]); + } + e_index[index] = static_cast(e_index_ptr[index - 1]); + } else { + e_index[index] = 0; + } + }, + galois::loopname("GPUEdgeIndexConstruction")); + galois::do_all( + galois::iterate(static_cast(0), + partitioned_graph_->sizeEdges()), + [&](size_t edge) { + if (e_dest_ptr[edge] > + static_cast(std::numeric_limits::max())) { + GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", + e_dest_ptr[edge]); + } + + e_dest[edge] = static_cast(e_dest_ptr[edge]); + }, + galois::loopname("GPUEdgeDestConstruction")); + + gpu_memory_.SetGraphTopology(e_index, e_dest); + e_index.clear(); + e_dest.clear(); + + gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); + gpu_memory_.SetLabels(local_ground_truth_labels_); + gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, + local_testing_mask_); + gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); + gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); + gpu_memory_.SetGlobalDegrees(global_degrees_); + } + #endif //! 
Used to track accurate predictions during accuracy calculation DGAccumulator num_correct_; diff --git a/libgnn/include/galois/graphs/GNNSubgraph.h b/libgnn/include/galois/graphs/GNNSubgraph.h index c7692533ba..9bddc9d313 100644 --- a/libgnn/include/galois/graphs/GNNSubgraph.h +++ b/libgnn/include/galois/graphs/GNNSubgraph.h @@ -1,10 +1,14 @@ +#include "galois/graphs/GNNGraph.h" + +#include + // Note no header guard or anything like that; this file is meant to be // included in the middle of GNNGraph class declaration as a class in a class class GNNSubgraph { public: - using GraphNode = LC_CSR_CSC_Graph::GraphNode; + using GraphNode = typename LC_CSR_CSC_Graph::GraphNode; using NodeIterator = boost::counting_iterator; - using EdgeIterator = LC_CSR_CSC_Graph::edge_iterator; + using EdgeIterator = typename LC_CSR_CSC_Graph::edge_iterator; //! Allocates space for the lid to sid map GNNSubgraph(size_t main_graph_size) { @@ -16,9 +20,35 @@ class GNNSubgraph { } //! Given sampled bits set on gnn_graph, builds an explicit subgraph //! for the sampled bits - size_t BuildSubgraph(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraph(GNNGraph& gnn_graph, + size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraph", kRegionName); + TimerStart(&timer); + for (auto& vec : subgraph_mirrors_) { + vec.clear(); + } + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + if (num_subgraph_nodes_ == 0) { + return 0; + } + DegreeCounting(gnn_graph); + EdgeCreation(gnn_graph); + NodeFeatureCreation(gnn_graph); + // loop over each node, grab out/in edges, construct them in LC_CSR_CSC + // no edge data, just topology + TimerStop(&timer); + return num_subgraph_nodes_; + } - size_t BuildSubgraphView(GNNGraph& gnn_graph, size_t num_sampled_layers); + size_t BuildSubgraphView(GNNGraph& gnn_graph, + size_t num_sampled_layers) { + galois::StatTimer timer("BuildSubgraphView", kRegionName); + TimerStart(&timer); + CreateSubgraphMapping(gnn_graph, num_sampled_layers); + NodeFeatureCreation(gnn_graph); + TimerStop(&timer); + return num_subgraph_nodes_; + } galois::PODResizeableArray& GetLocalFeatures() { return subgraph_node_features_; @@ -52,7 +82,7 @@ class GNNSubgraph { return underlying_graph_.getEdgeDst(out_edge_iterator); }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> edges(GraphNode n) { return internal::make_no_deref_range(edge_begin(n), edge_end(n)); } @@ -67,7 +97,7 @@ class GNNSubgraph { return underlying_graph_.getInEdgeDst(in_edge_iterator); }; galois::runtime::iterable< - galois::NoDerefIterator> + galois::NoDerefIterator> in_edges(GraphNode n) { return internal::make_no_deref_range(in_edge_begin(n), in_edge_end(n)); } @@ -81,12 +111,12 @@ class GNNSubgraph { ////////////////////////////////////////////////////////////////////////////// bool OutEdgeSampled(EdgeIterator out_edge_iterator, size_t layer_num, - const GNNGraph& original_graph) { + const GNNGraph& original_graph) { return original_graph.IsEdgeSampledOriginalGraph( subedge_to_original_edge_[*out_edge_iterator], layer_num); } bool InEdgeSampled(EdgeIterator in_edge_iterator, size_t layer_num, - const GNNGraph& original_graph) { + const GNNGraph& original_graph) { // note that original IsEdgeSampled is called because this object stores the // original edge already return original_graph.IsEdgeSampledOriginalGraph( @@ -119,7 +149,247 @@ class GNNSubgraph { // TODO signature cleanup //! Creates subgraph ID mapping from the number of sampled nodes from the //! original graph. 
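// Before reading the parallel, layer-aware implementation below, the core of the
// mapping can be seen in a minimal serial sketch: compact the locally active nodes
// into a dense subgraph id space and keep both directions of the mapping. Names and
// signature here are illustrative only; the real code additionally orders ids by
// sample timestamp and tracks mirrors per host.
#include <cstdint>
#include <limits>
#include <vector>

void BuildIdMaps(const std::vector<bool>& active,   // one flag per local node
                 std::vector<uint32_t>* lid_to_sid, // local id -> subgraph id
                 std::vector<uint32_t>* sid_to_lid) // subgraph id -> local id
{
  const uint32_t kInvalid = std::numeric_limits<uint32_t>::max();
  lid_to_sid->assign(active.size(), kInvalid);
  sid_to_lid->clear();
  for (size_t lid = 0; lid < active.size(); ++lid) {
    if (active[lid]) {
      (*lid_to_sid)[lid] = static_cast<uint32_t>(sid_to_lid->size());
      sid_to_lid->push_back(static_cast<uint32_t>(lid));
    }
  }
}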
Should be done every epoch when sampled graph changes. - void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t); + void CreateSubgraphMapping(GNNGraph& gnn_graph, size_t) { + galois::StatTimer timer("SIDMapping", kRegionName); + TimerStart(&timer); + + assert(gnn_graph.size() == lid_to_subgraph_id_.size()); + // clear all mappings + galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), + lid_to_subgraph_id_.end(), + std::numeric_limits::max()); + + galois::GAccumulator subgraph_count; + subgraph_count.reset(); + galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + [&](uint32_t node_id) { + if (gnn_graph.IsActiveInSubgraph(node_id)) { + subgraph_count += 1; + } + }); + num_subgraph_nodes_ = subgraph_count.reduce(); + // if no subgraph, get out + if (num_subgraph_nodes_ == 0) { + subgraph_master_boundary_ = 0; + TimerStop(&timer); + return; + } + + // checking sanity + // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), + // [&](uint32_t node_id) { + // if (gnn_graph.IsInSampledGraph(node_id) && + // !gnn_graph.IsActiveInSubgraph(node_id)) { + // // check if any edges are active + // for (auto a = gnn_graph.edge_begin(node_id); a != + // gnn_graph.edge_end(node_id);a++) { + // if (gnn_graph.IsEdgeSampledAny(a)) { + // galois::gWarn("ERROR node ", node_id); + // } + // } + // for (auto a = gnn_graph.in_edge_begin(node_id); a != + // gnn_graph.in_edge_end(node_id);a++) { + // if (gnn_graph.IsInEdgeSampledAny(a)) { + // galois::gWarn("ERROR in node ", node_id); + // } + // } + // } + // }); + + if (subgraph_id_to_lid_.size() < num_subgraph_nodes_) { + // allocate a bit more than necessary to avoid a big realloc + // if node value changes slightly later + subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); + } + + // bitset to mark if a master is outside the "master only" boundary + // and not contiguous; needed to mask out non-masters + galois::DynamicBitSet& non_layer_zero_masters = + gnn_graph.GetNonLayerZeroMasters(); + // init the bitset as necessary + if (non_layer_zero_masters.size() < num_subgraph_nodes_) { + non_layer_zero_masters.resize(num_subgraph_nodes_); + } else { + non_layer_zero_masters.ParallelReset(); + } + + std::vector& master_offsets = gnn_graph.GetMasterOffsets(); + std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); + + ResetSIDThreadOffsets(master_offsets.size()); + + // compute offsets for each layer + galois::PODResizeableArray layer_offsets; + layer_offsets.resize(master_offsets.size() - 1); + for (unsigned i = 0; i < layer_offsets.size(); i++) { + layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; + if (i > 0) { + // prefix summing + layer_offsets[i] += layer_offsets[i - 1]; + } + } + + // all nodes before this SID are master nodes in layer 0; + // NOTE: there are master nodes past this boundary that will + // not be covered by a begin_owned loop, which may cause problems down + // the line; this is handled by the bitset above + subgraph_master_boundary_ = master_offsets[0]; + + size_t last_owned_node = *(gnn_graph.end_owned()); + // compute amount of work each thread needs to do + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned end_node; + // this thread always has a set number of nodes to run; this is it + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + // these arrays track how much work will need to be done by this + // thread + galois::PODResizeableArray& my_offsets = + 
sid_thread_offsets_[thread_id]; + galois::PODResizeableArray& my_mirror_offsets = + subgraph_mirror_offsets_[thread_id]; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + // only bother if node was active + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = + gnn_graph.SampleNodeTimestamp(local_node_id); + // TODO(loc) this check shouldn't even be necessary; active in + // subgraph implies added at somepoint + if (node_timestamp != std::numeric_limits::max()) { + // tracks how many nodes for each timestamp this node will + // work with by incrementing this + my_offsets[node_timestamp]++; + + if (local_node_id >= last_owned_node) { + // this is a mirror node; get the host that the master is located + // on and increment this thread's mirror node count for that host + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + } + } else { + GALOIS_LOG_WARN("shouldn't ever get here right?"); + } + } + } + }); + + // prefix sum the threads + galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), + [&](size_t layer_num) { + for (size_t thread_id = 1; + thread_id < galois::getActiveThreads(); thread_id++) { + sid_thread_offsets_[thread_id][layer_num] += + sid_thread_offsets_[thread_id - 1][layer_num]; + } + }); + + for (unsigned i = 0; i < master_offsets.size() - 1; i++) { + if (i > 0) { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] + + layer_offsets[i - 1] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } else { + GALOIS_LOG_VASSERT( + sid_thread_offsets_[galois::getActiveThreads() - 1][i] == + (layer_offsets[i]), + "layer {} wrong {} vs correct {}", i, + sid_thread_offsets_[galois::getActiveThreads() - 1][i], + layer_offsets[i]); + } + } + + // last element of prefix sum needs to equal the correct layer offset + galois::do_all( + galois::iterate(uint32_t{0}, + galois::runtime::getSystemNetworkInterface().Num), + [&](size_t host_num) { + // for each host, get prefix sum of each thread's mirrors + for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); + thread_id++) { + subgraph_mirror_offsets_[thread_id][host_num] += + subgraph_mirror_offsets_[thread_id - 1][host_num]; + } + }); + + // allocate the mirror space; last element of prefix sum is total size + for (unsigned host_num = 0; + host_num < galois::runtime::getSystemNetworkInterface().Num; + host_num++) { + if (galois::runtime::getSystemNetworkInterface().ID == host_num) { + continue; + } + subgraph_mirrors_[host_num].resize( + subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); + } + + galois::on_each([&](size_t thread_id, size_t num_threads) { + unsigned start_node; + unsigned end_node; + std::tie(start_node, end_node) = galois::block_range( + size_t{0}, gnn_graph.size(), thread_id, num_threads); + + galois::PODResizeableArray& current_thread_offset = + thread_id != 0 ? sid_thread_offsets_[thread_id - 1] + : thread_zero_work_; + galois::PODResizeableArray& my_mirror_offsets = + thread_id != 0 ? 
subgraph_mirror_offsets_[thread_id - 1] + : thread_zero_mirror_offsets_; + + for (size_t local_node_id = start_node; local_node_id < end_node; + local_node_id++) { + if (gnn_graph.IsActiveInSubgraph(local_node_id)) { + unsigned node_timestamp = + gnn_graph.SampleNodeTimestamp(local_node_id); + if (node_timestamp != std::numeric_limits::max()) { + uint32_t sid_to_use; + if (node_timestamp != 0) { + sid_to_use = layer_offsets[node_timestamp - 1] + + current_thread_offset[node_timestamp]++; + if (local_node_id < last_owned_node) { + // master node that is not in layer 0 (i.e. node_timestamp != 0) + non_layer_zero_masters.set(sid_to_use); + } + } else { + // node timestamp == 0; no layer offset needed because offset + // is 0 + sid_to_use = current_thread_offset[node_timestamp]++; + } + + // this is a mirror + if (local_node_id >= last_owned_node) { + // XXX(loc) mirror offsets + uint32_t node_gid = gnn_graph.GetGID(local_node_id); + size_t my_offset = + my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; + + if (my_offset > + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) + GALOIS_LOG_FATAL( + "{} {}", my_offset, + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); + + subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = + node_gid; + } + + subgraph_id_to_lid_[sid_to_use] = local_node_id; + lid_to_subgraph_id_[local_node_id] = sid_to_use; + } else { + GALOIS_LOG_WARN("shouldn't ever get here right?"); + } + } + } + }); + + TimerStop(&timer); + } //! reset sid thread offsets used for parallel SID mapping creation void ResetSIDThreadOffsets(size_t num_layers) { @@ -162,11 +432,173 @@ class GNNSubgraph { } //! Counts in and out degrees of all sampled nodes in the graph - void DegreeCounting(const GNNGraph& gnn_graph); + void DegreeCounting(const GNNGraph& gnn_graph) { + galois::StatTimer timer("DegreeCounting", kRegionName); + TimerStart(&timer); + + if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) { + local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02); + } + + galois::do_all( + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + uint32_t out_degrees = 0; + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + out_degrees++; + } + } + local_subgraph_out_degrees_[subgraph_id] = out_degrees; + + uint32_t in_degrees = 0; + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + in_degrees++; + } + } + local_subgraph_in_degrees_[subgraph_id] = in_degrees; + }, + galois::loopname("DegreeCountingDoAll"), galois::steal()); + + TimerStop(&timer); + } + //! 
Creates edges - void EdgeCreation(const GNNGraph& gnn_graph); + void EdgeCreation(const GNNGraph& gnn_graph) { + galois::StatTimer timer("EdgeConstruction", kRegionName); + TimerStart(&timer); + // galois::DGAccumulator empty_masters; + // galois::DGAccumulator empty_mirrors; + // empty_masters.reset(); + // empty_mirrors.reset(); + + // galois::DGAccumulator total_sn; + // total_sn.reset(); + // total_sn += num_subgraph_nodes_; + // size_t global_sub_size = total_sn.reduce(); + + // prefix sum over subgraph degrees from previous phase to get starting + // points + for (size_t i = 1; i < num_subgraph_nodes_; i++) { + // if (local_subgraph_out_degrees_[i] == 0 && + // local_subgraph_in_degrees_[i] == 0) { + // if (i < subgraph_master_boundary_) { + // empty_masters += 1; + // } else { + // if (gnn_graph.GetNonLayerZeroMasters().test(i)) { + // empty_masters += 1; + // } else { + // empty_mirrors += 1; + // } + // } + //} + local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1]; + local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1]; + } + + // uint32_t emaster = empty_masters.reduce(); + // uint32_t emirror = empty_mirrors.reduce(); + // if (gnn_graph.host_id() == 0) { + // galois::gInfo("Empty masters percent is ", emaster / + // (float)global_sub_size, + // " ", emaster, " ", global_sub_size); + // galois::gInfo("Empty mirrors percent is ", emirror / + // (float)global_sub_size, + // " ", emirror, " ", global_sub_size); + //} + + // allocate then set node endpoints + num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1]; + + galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName); + TimerStart(&alloc_time); + underlying_graph_.DeallocateOnly(); + underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_); + underlying_graph_.CSCAllocate(); + TimerStop(&alloc_time); + + galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ", + num_subgraph_nodes_, " ", num_subgraph_edges_); + + galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_), + [&](uint32_t subgraph_id) { + underlying_graph_.fixEndEdge( + subgraph_id, local_subgraph_out_degrees_[subgraph_id]); + underlying_graph_.FixEndInEdge( + subgraph_id, local_subgraph_in_degrees_[subgraph_id]); + }); + if (subedge_to_original_edge_.size() < num_subgraph_edges_) { + subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) { + in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02); + } + + // save edges + save reference to layer sample status + galois::do_all( + galois::iterate(begin(), end()), + [&](uint32_t subgraph_id) { + uint32_t node_id = subgraph_id_to_lid_[subgraph_id]; + assert(subgraph_id != std::numeric_limits::max()); + uint32_t out_location = 0; + uint32_t in_location = 0; + if (subgraph_id != 0) { + out_location = local_subgraph_out_degrees_[subgraph_id - 1]; + in_location = local_subgraph_in_degrees_[subgraph_id - 1]; + } + + for (auto out_edge_iter : gnn_graph.edges(node_id)) { + if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) { + assert( + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] != + std::numeric_limits::max()); + subedge_to_original_edge_[out_location] = *out_edge_iter; + + underlying_graph_.constructEdge( + out_location++, + lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]); + } + } + + for (auto in_edge_iter : gnn_graph.in_edges(node_id)) { + if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) { + 
in_subedge_to_original_edge_[in_location] = + *(gnn_graph.InEdgeToOutEdge(in_edge_iter)); + underlying_graph_.ConstructInEdge( + in_location++, + lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]); + } + } + assert(out_location == local_subgraph_out_degrees_[subgraph_id]); + assert(in_location == local_subgraph_in_degrees_[subgraph_id]); + }, + galois::loopname("EdgeCreationDoAll"), galois::steal()); + TimerStop(&timer); + } + //! Copies over relevant features of the nodes - void NodeFeatureCreation(GNNGraph& gnn_graph); + void NodeFeatureCreation(GNNGraph& gnn_graph) { + galois::StatTimer timer("NodeFeatureCreation", kRegionName); + TimerStart(&timer); + size_t feat_length = gnn_graph.node_feature_length(); + subgraph_node_features_.resize(feat_length * num_subgraph_nodes_); + + galois::do_all( + galois::iterate(begin(), end()), [&](size_t subgraph_node_id) { + size_t local_id = subgraph_id_to_lid_[subgraph_node_id]; + std::memcpy( + &(subgraph_node_features_[subgraph_node_id * feat_length]), + &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]), + feat_length * sizeof(GNNFeature)); + }); + TimerStop(&timer); + } static const constexpr char* kRegionName = "GNNSubgraph"; diff --git a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h index 50a07bdd4e..422965fbaf 100644 --- a/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h +++ b/libgnn/include/galois/graphs/GraphAggregationSyncStructures.h @@ -8,6 +8,7 @@ namespace galois { namespace graphs { +extern std::vector* sampled_nodes_; extern GNNFloat* gnn_matrix_to_sync_; extern size_t gnn_matrix_to_sync_column_length_; extern galois::DynamicBitSet bitset_graph_aggregate; @@ -20,16 +21,22 @@ extern struct CUDA_Context* cuda_ctx_for_sync; extern unsigned layer_number_to_sync; #endif +// NodeTy is always a node data type of a "graph" type. +// This type is used by GluonSubstrate to reset a value. +// ValTy is either a node data type of a graph or the ones +// that are stored in separate objects. +template struct SampleFlagSync { - using ValTy = char; + using NodeTy = NTy; + using ValTy = char; //! return a vector of floats to sync - static ValTy extract(uint32_t, char& i) { return i; } + static ValTy extract(uint32_t lid, NodeTy&) { return (*sampled_nodes_)[lid]; } - static bool reduce(uint32_t, char& i, ValTy y) { + static bool reduce(uint32_t lid, NodeTy&, ValTy y) { if (y) { - i = y; - assert(i == 1); + (*sampled_nodes_)[lid] = y; + assert((*sampled_nodes_)[lid] == 1); return true; } else { return false; @@ -37,10 +44,12 @@ struct SampleFlagSync { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} //! element wise set - static void setVal(uint32_t, char& i, ValTy y) { i = y; } + static void setVal(uint32_t lid, NodeTy&, ValTy y) { + (*sampled_nodes_)[lid] = y; + } // GPU options TODO for GPU static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { @@ -67,13 +76,15 @@ struct SampleFlagBitset { } }; +template struct GNNSumAggregate { - using ValTy = galois::gstl::Vector; + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char&) { + static ValTy extract(uint32_t node_id, NodeTy&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. 
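// The aggregation sync structures in this header all follow the same Gluon
// field-synchronizer contract over a node-major float matrix: extract() reads the
// row for a node, reduce() element-wise adds a received row into it, and setVal()
// overwrites it. A compact sketch of that row layout, with hypothetical free
// functions standing in for the struct members:
#include <cstddef>
#include <vector>

// Row-major matrix: row `node` starts at offset node * columns.
std::vector<float> ExtractRow(const float* matrix, size_t columns, size_t node) {
  return std::vector<float>(matrix + node * columns, matrix + (node + 1) * columns);
}

void ReduceRow(float* matrix, size_t columns, size_t node, const float* incoming) {
  for (size_t i = 0; i < columns; ++i) {
    matrix[node * columns + i] += incoming[i]; // reduction is element-wise add
  }
}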
// assert(device_personality == DevicePersonality::CPU); @@ -100,7 +111,7 @@ struct GNNSumAggregate { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char&, ValTy y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -111,7 +122,7 @@ struct GNNSumAggregate { return true; } - static bool reduce(uint32_t node_id, char&, const ValTy::value_type* y) { + static bool reduce(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { // XXX vectorized add @@ -122,7 +133,7 @@ struct GNNSumAggregate { } //! No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} // Reset is here in case anyone wants to bring it back // static void reset(uint32_t node_id, char&) { // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -132,7 +143,7 @@ struct GNNSumAggregate { //} //! element wise set - static void setVal(uint32_t node_id, char&, ValTy y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { @@ -141,7 +152,7 @@ struct GNNSumAggregate { } } - static void setVal(uint32_t node_id, char&, const ValTy::value_type* y) { + static void setVal(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { // loop and do addition for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = @@ -165,13 +176,15 @@ struct GNNSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template struct GNNSampleSumAggregate { - using ValTy = galois::gstl::Vector; + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_; } //! return a vector of floats to sync - static ValTy extract(uint32_t node_id, char&) { + static ValTy extract(uint32_t node_id, NodeTy&) { // It should be a CPU synchronizing substrate. // If the GPU flag is turned off, then personality does not exist. // assert(device_personality == DevicePersonality::CPU); @@ -212,7 +225,7 @@ struct GNNSampleSumAggregate { //! reduction is addition in this case; add received vector to //! own vector - static bool reduce(uint32_t node_id, char&, ValTy y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { @@ -231,7 +244,7 @@ struct GNNSampleSumAggregate { return true; } - static bool reduce(uint32_t node_id, char&, ValTy::value_type* y) { + static bool reduce(uint32_t node_id, NodeTy&, ValTy::value_type* y) { if ((*gnn_lid_to_sid_pointer_)[node_id] == std::numeric_limits::max()) { return false; @@ -252,10 +265,10 @@ struct GNNSampleSumAggregate { } //! 
No-op: readAny = overwritten anyways - static void reset(uint32_t, char&) {} + static void reset(uint32_t, NodeTy&) {} // version where you have a vector object - static void setVal(uint32_t node_id, char&, ValTy y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { assert(y.size() == gnn_matrix_to_sync_column_length_); uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; if (converted_sid >= num_active_layer_rows_ || @@ -273,7 +286,7 @@ struct GNNSampleSumAggregate { // version where you have a pointer only (more efficient because this // version is for reading directly from the recv buffer) - static void setVal(uint32_t node_id, char&, ValTy::value_type* y) { + static void setVal(uint32_t node_id, NodeTy&, ValTy::value_type* y) { uint32_t converted_sid = (*gnn_lid_to_sid_pointer_)[node_id]; if (converted_sid >= num_active_layer_rows_ || converted_sid == std::numeric_limits::max()) { @@ -303,6 +316,112 @@ struct GNNSampleSumAggregate { static bool extract_reset_batch(unsigned, uint8_t*) { return false; } }; +template +struct SHADGNNSumAggregate { + using ValTy = galois::gstl::Vector; + using NodeTy = NTy; + + static size_t FeatVecSize() { return gnn_matrix_to_sync_column_length_ / 2; } + + //! return a vector of floats to sync + static ValTy extract(uint32_t node_id, NodeTy&) { + // It should be a CPU synchronizing substrate. + // If the GPU flag is turned off, then personality does not exist. + // assert(device_personality == DevicePersonality::CPU); + + // It should extract the last half of features of the adjacent neighbors + // (So, source of feature aggregation). + ValTy extracted_vec; + extracted_vec.reserve(gnn_matrix_to_sync_column_length_ / 2); + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX memcpy + extracted_vec.emplace_back( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2]); + } + // move constructor should kick in here to avoid return copy + return extracted_vec; + } + + //! return a vector of floats to sync + static void ExtractDirect(uint32_t node_id, + typename ValTy::value_type* to_write) { + std::memcpy( + to_write, + (char*)&( + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + + gnn_matrix_to_sync_column_length_ / 2]), + (gnn_matrix_to_sync_column_length_ / 2) * + sizeof(typename ValTy::value_type)); + } + + //! reduction is addition in this case; add received vector to + //! own vector + static bool reduce(uint32_t node_id, char&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_ / 2); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] += y[i]; + } + return true; + } + + static bool reduce(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + // XXX vectorized add + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] += y[i]; + } + return true; + } + + //! 
No-op: readAny = overwritten anyways + static void reset(uint32_t, NodeTy&) {} + // Reset is here in case anyone wants to bring it back + // static void reset(uint32_t node_id, char&) { + // for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_; i++) { + // gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i] = + // 0; + // } + //} + + //! element wise set + static void setVal(uint32_t node_id, NodeTy&, ValTy y) { + assert(y.size() == gnn_matrix_to_sync_column_length_); + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] = y[i]; + } + } + + static void setVal(uint32_t node_id, NodeTy&, const ValTy::value_type* y) { + // loop and do addition + for (unsigned i = 0; i < gnn_matrix_to_sync_column_length_ / 2; i++) { + gnn_matrix_to_sync_[node_id * gnn_matrix_to_sync_column_length_ + i + + gnn_matrix_to_sync_column_length_ / 2] = y[i]; + } + } + + // GPU options TODO for GPU + static bool extract_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_batch(unsigned, uint8_t*) { return false; } + static bool reduce_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool reduce_mirror_batch(unsigned, uint8_t*, DataCommMode) { + return false; + } + static bool setVal_batch(unsigned, uint8_t*, DataCommMode) { return false; } + static bool extract_reset_batch(unsigned, uint8_t*, size_t*, DataCommMode*) { + return false; + } + static bool extract_reset_batch(unsigned, uint8_t*) { return false; } +}; + #ifdef GALOIS_ENABLE_GPU extern struct CUDA_Context* cuda_ctx; GALOIS_SYNC_STRUCTURE_GNN_LAYER(layer_input, cuda_ctx_for_sync, diff --git a/libgnn/include/galois/layers/DenseLayer.h b/libgnn/include/galois/layers/DenseLayer.h index e7dc46e9f3..c347ae8dbe 100644 --- a/libgnn/include/galois/layers/DenseLayer.h +++ b/libgnn/include/galois/layers/DenseLayer.h @@ -1,21 +1,43 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/Logging.h" +#include "galois/GNNMath.h" namespace galois { //! Just does a linear xform with no convolution over graph -class DenseLayer : public GNNLayer { +template +class DenseLayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! 
weight matrix - DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config); + const GNNLayerDimensions& layer_dimensions, + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, + layer_dimensions, config), + input_column_intermediates_(layer_dimensions.input_columns), + output_column_intermediates_(layer_dimensions.output_columns) { + // TODO Need to make sure that layer knows about forward/backward matrix + // sharing (e.g., overwriting previously used input to save space) + GALOIS_LOG_FATAL( + "This layer has not been kept up to date; do not use until " + "sure it's been updated"); + size_t num_input_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns; + in_temp_1_.resize(num_input_elements, 0); + size_t num_output_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements); + this->layer_type_ = galois::GNNLayerType::kDense; + this->p_in_temp_1_ = PointerWithSize(in_temp_1_); + GALOIS_LOG_VERBOSE("Dense initialized"); + } - DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + DenseLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : DenseLayer(layer_num, graph, backward_output_matrix, dimensions, @@ -23,11 +45,80 @@ class DenseLayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_in_temp_1_.size() == input_embeddings.size()); + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &this->p_in_temp_1_); + input_data = this->p_in_temp_1_.data(); + } + + // FW + UpdateEmbeddings(input_data, this->p_forward_output_matrix_.data()); + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + assert(this->layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + if (this->layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative(input_gradient->data(), + this->p_backward_output_matrix_.data()); + } + + galois::PointerWithSize input_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = 
this->p_in_temp_1_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + } + + // W' = F^T (FW)' + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + // sync weight gradients; note aggregation sync occurs in the function call + // already + this->WeightGradientSyncSum(); + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + return this->p_backward_output_matrix_; + } private: // 2 temporaries the size of the forward input; used for dropout and @@ -45,9 +136,54 @@ class DenseLayer : public GNNLayer { output_column_intermediates_; //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this + gpu_object_.UpdateEmbeddingsGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, node_embeddings, + base_gpu_object_.layer_weights(), output); + */ + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + assert(this->p_layer_weights_.size() == + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + /* TODO(lhc) implement this + gpu_object_.UpdateEmbeddingsDerivativeGPU( + layer_dimensions_.input_rows, layer_dimensions_.input_columns, + layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + */ + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } #ifdef GALOIS_ENABLE_GPU // TODO(hochan/loc) replace with dense gpu object diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 786a973230..9ac6b925ae 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -3,6 +3,8 @@ #include "galois/PerThreadRNG.h" #include "galois/GNNOptimizers.h" #include "galois/graphs/GNNGraph.h" +#include "galois/Logging.h" +#include "galois/layers/GradientSyncStructures.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GNNLayer.cuh" @@ -81,17 +83,113 @@ struct GNNLayerConfig { // Tried to avoid inheritance, but keeping track of heterogeneous layers // becomes a mess if there isn't a base class I can create the container on. //! 
Base class for layers in a graph neural network +template class GNNLayer { public: //! Creation of a layer needs the # of the layer, the graph to train on, and //! the input/output dimensions of the MxM that occurs in the layer; config //! as well - GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, const GNNLayerConfig& config); + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions), + config_(config) { + // TODO(loc) + // this is currently a backward-compatibility hack, need to have caller + // set output rows rather than created here + layer_dimensions_.output_rows = layer_dimensions_.input_rows; + + if (config_.allocate_weights) { + // dropout allocation; dropout is same as input + if (!config_.disable_dropout) { + dropout_mask_.resize(layer_dimensions_.input_rows * + layer_dimensions_.input_columns, + false); + } + // allocate memory based on layer dimensions + size_t num_weight_elements = + layer_dimensions_.input_columns * layer_dimensions_.output_columns; + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer weights ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); + layer_weights_.resize(num_weight_elements); + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", layer gradients ", num_weight_elements, " (", + FloatElementsToGB(num_weight_elements), " GB)"); + layer_weight_gradients_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitWeightMemory(num_weight_elements); + base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + } +#endif + + GlorotBengioInit(&layer_weights_); + } + + // TODO(loc) optimize this and layer creation in general + // this does not use output_rows and assumes the worst case where + // all nodes are generated + // for now it's kept as input_rows so as to not break things + size_t num_output_elements = + layer_dimensions_.input_rows * layer_dimensions_.output_columns; + + if (!config_.disable_output) { + galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, + ", forward output matrix ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + forward_output_matrix_.resize(num_output_elements, 0); + } + + if (layer_number_ != 0) { + GALOIS_LOG_VASSERT( + backward_output_matrix->size() == + layer_dimensions_.input_rows * layer_dimensions_.input_columns, + "backward output size {} should equal input size {}", + backward_output_matrix->size(), + layer_dimensions_.input_rows * layer_dimensions_.input_columns); + } else { + GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr, + "layer 0 should null ptr backward output"); + GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0, + "layer 0 should size 0 backward output"); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.InitInOutMemory(num_output_elements, + layer_dimensions_.input_rows * + layer_dimensions_.input_columns); + + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize( + base_gpu_object_.layer_weights(), layer_weights_.size()); + p_layer_weight_gradients_ = + 
PointerWithSize(base_gpu_object_.layer_weight_gradients(), + layer_weight_gradients_.size()); + p_forward_output_matrix_ = PointerWithSize( + base_gpu_object_.forward_output(), forward_output_matrix_.size()); + p_backward_output_matrix_ = PointerWithSize( + base_gpu_object_.backward_output(), backward_output_matrix->size()); + // TODO can clear the cpu side vectors/don't use .size() since optimally + // they aren't initialized + } else { +#endif + // initialize the PointerWithSize wrappers + p_layer_weights_ = PointerWithSize(layer_weights_); + p_layer_weight_gradients_ = + PointerWithSize(layer_weight_gradients_); + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); + p_backward_output_matrix_ = *backward_output_matrix; +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! Uses a default config - GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + GNNLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, @@ -109,7 +207,31 @@ class GNNLayer { ResizeOutputMatrix(output_row); } - void ResizeOutputMatrix(size_t new_output_row); + void ResizeOutputMatrix(size_t new_output_row) { + size_t num_output_elements = + new_output_row * layer_dimensions_.output_columns; + + if (!config_.disable_output && + (forward_output_matrix_.size() < num_output_elements)) { + galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_, + ", forward output matrix to ", num_output_elements, " (", + FloatElementsToGB(num_output_elements), " GB)"); + // resize with a bit of a buffer to prevent possible future resizes + size_t buffer_size = (num_output_elements * 0.02); + forward_output_matrix_.resize(num_output_elements + buffer_size, 0); + } + + // XXX(hochan) GPU end +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + // reinitialize the PointerWithSize wrappers + p_forward_output_matrix_ = + PointerWithSize(forward_output_matrix_); +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) +#endif + } void UpdateBackwardOutput(PointerWithSize* backward_output_matrix) { // XXX(hochan) gpu @@ -257,7 +379,7 @@ class GNNLayer { //! Pointer to the graph being trained by this layer. //! This is owned by the creator of this layer, so no need to free it when //! this layer is destroyed. - const galois::graphs::GNNGraph& graph_; + const galois::graphs::GNNGraph& graph_; //! Dimensions (input/output sizes) of this layer GNNLayerDimensions layer_dimensions_; //! Config object for certain parameters for layer @@ -318,38 +440,277 @@ class GNNLayer { //! used are the dimensions of this particular weight matrix //! TODO revisit paper and see what they really mean //! Code inspired DGL and TinyDNN - void GlorotBengioInit(std::vector* vector_to_init); + void GlorotBengioInit(std::vector* vector_to_init) { + float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns + + layer_dimensions_.input_columns); + std::default_random_engine rng(1 + layer_number_); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector_to_init->size(); i++) { + (*vector_to_init)[i] = dist(rng); + } +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! 
Init 2 things as one unit; used for SAGE void PairGlorotBengioInit(std::vector* vector1, - std::vector* vector2); + std::vector* vector2) { + // multiplied by 2 here because 2 pieces are 1 unit + float max = + std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) + + layer_dimensions_.input_columns); + assert(vector1->size() == (layer_dimensions_.input_columns * + layer_dimensions_.output_columns)); + assert(vector2->size() == (layer_dimensions_.input_columns * + layer_dimensions_.output_columns)); + std::default_random_engine rng(1 + layer_number_); + std::uniform_real_distribution dist(-max, max); + + for (size_t i = 0; i < vector1->size(); i++) { + (*vector1)[i] = dist(rng); + } + for (size_t i = 0; i < vector2->size(); i++) { + (*vector2)[i] = dist(rng); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! Randomly init a float vector using the class's random init RNG - void RandomInitVector(std::vector* vector_to_init); + void RandomInitVector(std::vector* vector_to_init) { + galois::do_all( + galois::iterate(static_cast(0), vector_to_init->size()), + [&](size_t i) { + // pull from the class's per thread RNG + (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); + }, + galois::loopname("RandomInitVector")); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + CopyLayerWeightsToGPU(); + } +#endif + } //! CPU variant of dropout - void DoDropoutCPU(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropoutCPU(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + // TODO This (and dropout in general) may not work in the sampling setting + size_t num_elements = + layer_dimensions_.input_rows * layer_dimensions_.input_columns; + + // determine which parts to drop + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); + }, + galois::loopname("LayerDropoutRNG")); + + // create new matrix with non-dropped input + some scaling + // TODO save scaling elsewhere? + GNNFloat scale = 1. / (1. - config_.dropout_rate); + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropout")); + } //! Choose a set of weights from this layer's weights to keep and save to //! the output matrix + apply some scaling to the kept weights based on //! dropout rate - void DoDropout(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropout(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ForwardDropout", "GNNLayer"); + TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, + config_.dropout_rate); + } else { +#endif + DoDropoutCPU(input_to_dropout, output_matrix); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + //! 
Apply the derivative of dropout to the backward phase output - void DoDropoutDerivative(); - void ReconstructDropoutMatrix(const PointerWithSize input_to_drop, - PointerWithSize* output_matrix); + void DoDropoutDerivative() { + galois::StatTimer timer("BackwardDropout", "GNNLayer"); + TimerStart(&timer); + assert(p_backward_output_matrix_.size() == dropout_mask_.size()); + GNNFloat scale = 1. / (1. - config_.dropout_rate); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), + scale); + } else { +#endif + // use dropout mask to figure out derivative + galois::do_all( + galois::iterate(static_cast(0), + p_backward_output_matrix_.size()), + [&](size_t i) { + p_backward_output_matrix_[i] = + p_backward_output_matrix_[i] * + static_cast(dropout_mask_[i]) * scale; + }, + galois::loopname("LayerDropoutDerivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + + void + ReconstructDropoutMatrix(const PointerWithSize input_to_dropout, + PointerWithSize* output_matrix) { + galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); + TimerStart(&timer); + // reuse the dropout mask from a previous dropout call + size_t num_elements = output_matrix->size(); + GNNFloat scale = 1. / (1. - config_.dropout_rate); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ReconstructDropoutMatrixGPU( + input_to_dropout, output_matrix, num_elements, scale); + } else { +#endif + galois::do_all( + galois::iterate(static_cast(0), num_elements), + [&](size_t i) { + (*output_matrix)[i] = input_to_dropout[i] * + static_cast(dropout_mask_[i]) * + scale; + }, + galois::loopname("ReconstructDropout")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } //! Does some activation function based on configuration on forward output //! matrix - void Activation(); + void Activation() { + galois::StatTimer timer("ForwardActivation", "GNNLayer"); + TimerStart(&timer); + + // TODO only does relu at the moment; should check user specified activation + // and act accordingly +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); + } else { +#endif + if (activation_memo_.size() != p_forward_output_matrix_.size()) { + activation_memo_.resize(p_forward_output_matrix_.size()); + } + activation_memo_.reset(); + assert(activation_memo_.size() == p_forward_output_matrix_.size()); + assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= + p_forward_output_matrix_.size()); + + galois::do_all(galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + if (p_forward_output_matrix_[i] > 0.0) { + // do nothing, keep value; set the memo though + activation_memo_.set(i); + } else { + p_forward_output_matrix_[i] = 0; + } + }); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } + void ActivationCPU(); //! 
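Editor's note: Activation() above applies ReLU in place and records which entries were positive in activation_memo_, so ActivationDerivative can later zero the gradients of the clamped entries. A stripped-down sketch of that forward/backward pairing (std::vector<bool> stands in for the dynamic bitset):

#include <vector>

// Forward ReLU: clamp negatives to zero, remember which entries stayed positive.
void ReluForward(std::vector<float>* x, std::vector<bool>* memo) {
  memo->assign(x->size(), false);
  for (size_t i = 0; i < x->size(); ++i) {
    if ((*x)[i] > 0.0f) {
      (*memo)[i] = true;   // value kept as-is
    } else {
      (*x)[i] = 0.0f;      // clamped
    }
  }
}

// Backward ReLU: gradients flow only where the forward output was positive.
void ReluBackward(const std::vector<bool>& memo, std::vector<float>* grad) {
  for (size_t i = 0; i < grad->size(); ++i) {
    if (!memo[i]) {
      (*grad)[i] = 0.0f;
    }
  }
}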
Calculate derivative of activation function based on config on the matrix - void ActivationDerivative(PointerWithSize* matrix); + void ActivationDerivative(PointerWithSize* gradient) { + galois::StatTimer timer("BackwardActivation", "GNNLayer"); + TimerStart(&timer); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.ActivationDerivativeGPU(gradient->data(), + gradient->size()); + } else { +#endif + assert(gradient->size() >= + layer_dimensions_.output_rows * layer_dimensions_.output_columns); + // TODO only does relu at the moment; should check user specified + // activation and act accordingly keep gradient if the original output was + // greater than 0 + galois::do_all( + galois::iterate(static_cast(0), + layer_dimensions_.output_rows * + layer_dimensions_.output_columns), + [&](size_t i) { + // it was <= 0 before; set back to 0 + if (!activation_memo_.test(i)) { + (*gradient)[i] = 0; + } + }, + galois::loopname("ReLU-Derivative")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&timer); + } //! Synchronize weight gradients with a summation - void WeightGradientSyncSum(); + void WeightGradientSyncSum() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + TimerStart(&clubbed_timer); + galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); + TimerStart(&t); + int weight_size = static_cast(p_layer_weight_gradients_.size()); + + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " + "int at the moment"); + } +#ifdef GALOIS_ENABLE_GPU + // TODO(lhc) make this clang option later + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); + MPI_Allreduce(MPI_IN_PLACE, layer_weight_gradients_.data(), weight_size, + MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); + } else { +#endif + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } +#endif + TimerStop(&t); + TimerStop(&clubbed_timer); + } #ifdef GALOIS_ENABLE_GPU //! 
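Editor's note: WeightGradientSyncSum above sums the weight-gradient matrix across hosts with an in-place MPI_Allreduce, so every host holds the same global gradient before the optimizer step. A minimal standalone equivalent, assuming MPI has already been initialized elsewhere:

#include <mpi.h>
#include <vector>

// Sum `grads` element-wise across all ranks; every rank receives the result.
void AllreduceGradients(std::vector<float>* grads) {
  MPI_Allreduce(MPI_IN_PLACE, grads->data(), static_cast<int>(grads->size()),
                MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD);
}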
Object that holds all GPU allocated pointers to memory related to layers @@ -363,18 +724,176 @@ class GNNLayer { void MaskInputNonMasters(PointerWithSize* input) { MaskInputNonMasters(input, std::numeric_limits::max()); } - void MaskInputNonMasters(PointerWithSize* input, size_t max_rows); + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + void MaskInputNonMasters(PointerWithSize* input, size_t max_rows, - const galois::DynamicBitSet&); + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.input_columns; + assert(start_node * row_index <= input->size()); + assert(end_node * row_index <= input->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + if (!bs.test(non_master)) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*input)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskInputNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! 
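Editor's note: MaskInputNonMasters above zeroes the feature rows of mirror (non-owned) vertices, i.e. rows in [end_owned, active_size) clamped to max_rows, presumably so that each vertex contributes to the weight gradient only on its owning host. A simplified sketch of the row masking itself (helper name is hypothetical):

#include <algorithm>
#include <cstddef>
#include <vector>

// Zero rows [first_mirror_row, num_rows) of a row-major matrix with
// `row_width` columns; mirrors are assumed to be stored after the masters.
void ZeroMirrorRows(std::vector<float>* matrix, std::size_t row_width,
                    std::size_t first_mirror_row, std::size_t num_rows) {
  for (std::size_t row = first_mirror_row; row < num_rows; ++row) {
    auto begin = matrix->begin() + row * row_width;
    std::fill(begin, begin + row_width, 0.0f);
  }
}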
Mask a gradient size'd matrix's rows that correspond to mirrors void MaskGradientNonMasters(PointerWithSize* input) { MaskGradientNonMasters(input, std::numeric_limits::max()); } void MaskGradientNonMasters(PointerWithSize* gradients, - size_t max_rows); + size_t max_rows) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradients->size()); + assert(end_node * row_index <= gradients->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradients, start_node, end_node, + row_index); + } else { +#endif + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradients)[non_master * row_index + i] = 0; + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + void MaskGradientNonMasters(PointerWithSize* gradients, - size_t max_rows, const galois::DynamicBitSet&); + size_t max_rows, + const galois::DynamicBitSet& bs) { + assert(*(graph_.begin_owned()) == 0); + size_t start_node = *(graph_.end_owned()); + size_t end_node = graph_.active_size(); + + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + + size_t row_index = layer_dimensions_.output_columns; + if (start_node > max_rows) { + start_node = max_rows; + } + if (end_node > max_rows) { + end_node = max_rows; + } + assert(start_node * row_index <= gradients->size()); + assert(end_node * row_index <= gradients->size()); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + base_gpu_object_.MaskNonMastersGPU(gradients, start_node, end_node, + row_index); + } else { +#endif + // galois::gInfo(start_node, " to ", end_node); + galois::do_all( + galois::iterate(start_node, end_node), + [&](size_t non_master) { + // if something is not a master, kill it + if (!bs.test(non_master)) { + // galois::gInfo("don't keep ", non_master); + // TODO(loc) use a std function for this for max efficiency + for (size_t i = 0; i < row_index; i++) { + (*gradients)[non_master * row_index + i] = 0; + } + } + }, + galois::loopname("MaskGradientNonMasters")); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! Does some math to get GB used by some # of floats double FloatElementsToGB(size_t num_of_floats) const { diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 988276965d..2c7a41ecab 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -1,5 +1,7 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/Logging.h" +#include "galois/GNNMath.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/GraphConvolutionalLayer.cuh" @@ -9,19 +11,113 @@ namespace galois { extern galois::DynamicBitSet graphs::bitset_graph_aggregate; -class GraphConvolutionalLayer : public GNNLayer { +template +class GraphConvolutionalLayer : public GNNLayer { public: //! 
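Editor's note: the allocation log messages report buffer sizes via FloatElementsToGB; with 4-byte floats that is presumably count * 4 / 2^30 (GiB). For example, a 1,000,000 x 256 float matrix is about 0.95 GiB. A one-line helper along those lines (name is illustrative):

#include <cstddef>

// GiB occupied by `num_floats` single-precision values (4 bytes each).
inline double FloatsToGB(std::size_t num_floats) {
  return static_cast<double>(num_floats) * sizeof(float) /
         (1024.0 * 1024.0 * 1024.0);
}
// e.g. FloatsToGB(1'000'000 * 256) ~= 0.95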
Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix GraphConvolutionalLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config); + const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + galois::gWarn( + "GCN layer not up to date with new subgraph/sampling changes; " + "do not use until updated to reflect changes (see GraphSAGE layer)"); + + size_t num_input_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns; + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN input temp var 1 ", + num_input_elements, " (", + this->FloatElementsToGB(num_input_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_input_elements); + } else { +#endif + in_temp_1_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN input temp var 2 ", + num_input_elements, " (", + this->FloatElementsToGB(num_input_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_input_elements); + } else { +#endif + in_temp_2_.resize(num_input_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + size_t num_output_elements = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + // xform matrix first to work with a smaller output size + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", GCN output temp var ", + num_output_elements, " (", + this->FloatElementsToGB(num_output_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_elements); + } else { +#endif + out_temp_.resize(num_output_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + this->layer_type_ = galois::GNNLayerType::kGraphConvolutional; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // init pointers with size + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_input_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_input_elements); + p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), + num_output_elements); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + 
p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + GALOIS_LOG_VERBOSE("Conv layer initialized"); + } GraphConvolutionalLayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : GraphConvolutionalLayer(layer_num, graph, backward_output_matrix, @@ -29,11 +125,200 @@ class GraphConvolutionalLayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + galois::StatTimer timer("ForwardPhase", kRegionName); + timer.start(); + GALOIS_LOG_VERBOSE("Calling forward phase"); + assert(input_embeddings.size() == (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); + } + + // flip aggregate/update if dimensions favor it (do less work) + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggregation and update + AggregateAll(this->layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + UpdateEmbeddings(agg_data, this->p_forward_output_matrix_.data()); + } else { + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data()); + // A(FW) + AggregateAll(this->layer_dimensions_.output_columns, p_out_temp_.data(), + this->p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() == + (this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns)); + timer.stop(); + + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", + kRegionName); + timer.start(); + + assert(this->layer_phase_ == GNNPhase::kTrain); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + // AFW = O + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; + // BEWARE OF DEPENDENCIES + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + 
if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + this->MaskInputNonMasters(&agg_data); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients.data()); + } else { +#endif + weight_gradient_timer.start(); + // temp 2 holds aggregated feature vectors from forward phase + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // gradient isn't masked here; only temp1, which has already been + // overwritten = fine + if (this->layer_number_ != 0) { + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() == + this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), + this->p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } + } else { + // TODO at this point, out_temp contains memoized FW + // can use it to get A' = O' (FW)^T + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + AggregateAll(this->layer_dimensions_.output_columns, + input_gradient->data(), p_out_temp_.data(), + &output_column_intermediates_, true); + + // done after above because input_data = p_backward_output_matrix in some + // cases; use first before overwriting here if layer # doesn't = 0, it + // means I can mess with the input data itself instad of masking the + // gradients I can mask the input + if (this->layer_number_ != 0) { + this->MaskInputNonMasters(&input_data); + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + this->MaskGradientNonMasters(&p_out_temp_); + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients.data()); + } else { +#endif + weight_gradient_timer.start(); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + weight_gradient_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + if (this->layer_number_ != 0) { + // can now overwrite p_backward without issue; since input gradient + // is untouched if layer number isn't 0 this will be correct + UpdateEmbeddingsDerivative(p_out_temp_.data(), + 
this->p_backward_output_matrix_.data()); + } + } + + // sync weight gradients; note aggregation sync occurs in the function call + // already + weight_gradient_sync_timer.start(); + this->WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + timer.stop(); + return this->p_backward_output_matrix_; + } private: static const constexpr char* kRegionName = "GCNLayer"; @@ -59,28 +344,194 @@ class GraphConvolutionalLayer : public GNNLayer { output_column_intermediates_; //! CPU aggregation - void AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void + AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>*) { + galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); + size_t num_nodes = this->graph_.size(); + size_t last_master = *(this->graph_.end_owned()); + assert(0 == *(this->graph_.begin_owned())); + + galois::do_all( + galois::iterate(static_cast(0), num_nodes), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->IsSampledLayer()) { + // XXX(loc) + GALOIS_LOG_WARN( + "Edge sampling not yet implemented for GCN; only SAGE"); + // check if node is part of sampled graph; ignore after 0'ing if + // not sampled + if (!this->graph_.IsInSampledGraph(src)) + return; + } + } + + GNNFloat source_norm = 0.0; + if (!this->config_.disable_normalization) { + source_norm = this->graph_.GetGCNNormFactor(src); + } + + // init to self + if (!this->config_.disable_self_aggregate) { + graphs::bitset_graph_aggregate.set(src); + // only aggregate self once on master + if (src < last_master) { + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = + node_embeddings[index_to_src_feature + i] * source_norm * + source_norm; + } + } + } + + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.edge_begin(src); + e != this->graph_.edge_end(src); e++) { + size_t dst = this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(src); + + if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->IsSampledLayer()) { + // ignore non-sampled nodes + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(dst)) + continue; + } + } + + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = + source_norm * this->graph_.GetGCNNormFactor(dst); + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + }, + galois::chunk_size<1>(), galois::steal(), + galois::loopname("ConvolutionalAggregateAll")); + // aggregate sync + aggregate_all_sync_timer.start(); + this->graph_.AggregateSync(aggregate_output, column_length); + aggregate_all_sync_timer.stop(); + } //! 
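Editor's note: AggregateAllCPU above sums, for each source vertex, its neighbors' features scaled by the symmetric GCN normalization (plus a self term when self-aggregation is enabled). A compact sequential CSR sketch of that aggregation, assuming GetGCNNormFactor(v) corresponds to 1/sqrt(degree(v)); graph layout and names here are illustrative only:

#include <cmath>
#include <cstddef>
#include <vector>

// Symmetric-normalized GCN aggregation over a CSR graph:
//   out[src] = sum over dst in N(src) of feat[dst] / sqrt(deg(src) * deg(dst))
void GcnAggregate(const std::vector<std::size_t>& row_ptr,  // CSR offsets
                  const std::vector<std::size_t>& col_idx,  // CSR destinations
                  const std::vector<float>& feat,           // num_nodes x dim
                  std::size_t dim, std::vector<float>* out) {
  std::size_t num_nodes = row_ptr.size() - 1;
  out->assign(num_nodes * dim, 0.0f);
  for (std::size_t src = 0; src < num_nodes; ++src) {
    float deg_src  = static_cast<float>(row_ptr[src + 1] - row_ptr[src]);
    float norm_src = deg_src > 0 ? 1.0f / std::sqrt(deg_src) : 0.0f;
    for (std::size_t e = row_ptr[src]; e < row_ptr[src + 1]; ++e) {
      std::size_t dst = col_idx[e];
      float deg_dst = static_cast<float>(row_ptr[dst + 1] - row_ptr[dst]);
      float norm = norm_src * (deg_dst > 0 ? 1.0f / std::sqrt(deg_dst) : 0.0f);
      for (std::size_t i = 0; i < dim; ++i) {
        (*out)[src * dim + i] += feat[dst * dim + i] * norm;
      }
    }
  }
}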
Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor - void - AggregateAll(size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); + } + void AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts, - bool is_backward); + bool is_backward) { + std::string agg_timer_name = "Aggregate"; + if (!is_backward) { + agg_timer_name += "Forward"; + } else { + agg_timer_name += "Backward"; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + timer.start(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + size_t last_master = *(this->graph_.end_owned()); + gpu_object_.AggregateAllGPU( + this->graph_.GetGPUGraph(), this->graph_.size(), column_length, + node_embeddings, aggregate_output, + !this->config_.disable_normalization, + this->config_.disable_self_aggregate, last_master); + this->graph_.AggregateSyncGPU(aggregate_output, column_length, + this->layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } //! Do embedding update via mxm with this layer's weights (forward) - void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output); + void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output) { + galois::StatTimer timer("ForwardXform", kRegionName); + timer.start(); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU(this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // CPU version is just a call into CBlas + galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } + //! 
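Editor's note: the forward and backward phases above flip between aggregate-then-update and update-then-aggregate based on input_columns <= output_columns: aggregation cost scales with the feature width being aggregated, so it pays to aggregate in the narrower representation. A rough, illustrative cost model for that decision (function name is hypothetical):

#include <cstddef>

// Approximate FLOP counts for a layer over |V| nodes and |E| edges with
// in_cols input features and out_cols output features:
//   aggregate-first: |E| * in_cols   (A X)    + |V| * in_cols * out_cols  ((AX) W)
//   update-first:    |V| * in_cols * out_cols (X W)  + |E| * out_cols     (A (XW))
// The matmul term is identical, so aggregate-first wins iff in_cols <= out_cols.
inline bool AggregateFirst(std::size_t num_edges, std::size_t num_nodes,
                           std::size_t in_cols, std::size_t out_cols) {
  double agg_first =
      double(num_edges) * in_cols + double(num_nodes) * in_cols * out_cols;
  double update_first =
      double(num_nodes) * in_cols * out_cols + double(num_edges) * out_cols;
  return agg_first <= update_first;
}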
Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output); + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + galois::StatTimer timer("BackwardXform", kRegionName); + timer.start(); + + assert(this->p_layer_weights_.size() == + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->layer_weights_.data(), output); +#ifdef GALOIS_ENABLE_GPU + } +#endif + timer.stop(); + } + #ifdef GALOIS_ENABLE_GPU GCNGPUAllocations gpu_object_; #endif diff --git a/libgnn/include/galois/layers/L2NormLayer.h b/libgnn/include/galois/layers/L2NormLayer.h index 0ed1a0d0df..e3ec67f726 100644 --- a/libgnn/include/galois/layers/L2NormLayer.h +++ b/libgnn/include/galois/layers/L2NormLayer.h @@ -8,39 +8,152 @@ namespace galois { //! Applies L2 norm to rows of the input -class L2NormLayer : public GNNLayer { +template +class L2NormLayer : public GNNLayer { public: - L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : L2NormLayer(layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false}) {} - L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + L2NormLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config) { - layer_type_ = galois::GNNLayerType::kL2Norm; + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config) { + this->layer_type_ = galois::GNNLayerType::kL2Norm; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); GALOIS_LOG_VERBOSE("L2 norm initialized"); } const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings); + ForwardPhase(const PointerWithSize input_embeddings) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + GALOIS_LOG_FATAL( + "L2 Layer has not been kept up to date for months; do not use"); + return ForwardPhaseCPU(input_embeddings); + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + PointerWithSize* input_gradient) { +#ifdef GALOIS_ENABLE_GPU + // TODO +#endif + return BackwardPhaseCPU(prev_layer_input, input_gradient); + } private: const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + this->forward_output_matrix_.assign(this->forward_output_matrix_.size(), + 0.0); + // for each row, get square root of squared sums then normalize + const size_t feature_length = 
this->layer_dimensions_.input_columns; + // TODO(loc) make sure this works in distributed setting as well + galois::do_all( + galois::iterate(this->graph_.begin_owned(), this->graph_.end_owned()), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(row)) + return; + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + float running_square_sum = 0.0; + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(input_embeddings[row_index], 2); + } + + // make sure running sum isn't too small + running_square_sum = + (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; + + // sqrt of sums, then divide row by it + float sqrt_squares = std::pow(running_square_sum, 0.5); + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->forward_output_matrix_[row_index] = + input_embeddings[row_index] / sqrt_squares; + } + } + }, + galois::loopname("L2ForwardNormalization")); + + return this->forward_output_matrix_; + } PointerWithSize BackwardPhaseCPU(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient); + PointerWithSize* input_gradient) { + galois::do_all( + galois::iterate(size_t{0}, this->p_backward_output_matrix_.size()), + [&](size_t i) { this->p_backward_output_matrix_[i] = 0; }); + const size_t feature_length = this->layer_dimensions_.input_columns; + + // derivative of some x_1 is sum of gradient w.r.t. x_1 for all elements of + // the row (since l2 norm affects entire row) + // The math itself can be derived using quotient/chain rule on each element + // of the normalized row + galois::do_all( + galois::iterate(this->graph_.begin_owned(), this->graph_.end_owned()), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(row)) + return; + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + // note: if you work this out on paper it turns out that terms that + // seem extra in the way this is calculated below simply get + // canceled out, so this ends up working out This implementation is + // taken from the IPDPS GraphSAINT implementation: I (loc) have + // confirmed the math checks out + float running_square_sum = 0.0; + float mult_with_input = 0.0; + + // get square sums + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + running_square_sum += std::pow(prev_layer_input[row_index], 2); + // gradient multiplied with corresponding input; subtraction + // because derivative math ends up working out that way + mult_with_input -= + prev_layer_input[row_index] * (*input_gradient)[row_index]; + } + running_square_sum = + (running_square_sum < 1.0e-12) ? 
10e-12 : running_square_sum; + assert(running_square_sum != 0.0); + + // denominator for all gradients is just the square sum to the + // -3/2'd power since this is -, all we have to do is multiply it + // later rather than divide + float denominator = std::pow(running_square_sum, -1.5); + assert(denominator != 0.0); + + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->p_backward_output_matrix_[row_index] = + denominator * + (prev_layer_input[row_index] * mult_with_input + + (*input_gradient)[row_index] * running_square_sum); + } + } + }, + galois::loopname("L2Backward")); + + return this->p_backward_output_matrix_; + } //! No op void OptimizeLayer(BaseOptimizer*, size_t) { return; }; diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 581115a00e..19d5a75815 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -1,6 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" #include "galois/layers/GradientSyncStructures.h" +#include "galois/GNNMath.h" +#include "galois/Logging.h" #ifdef GALOIS_ENABLE_GPU #include "galois/layers/SAGELayer.cuh" @@ -22,23 +24,177 @@ struct SAGELayerConfig { //! ends up performing better for some graphs) //! - Concatination of the self: rather than aggregating self //! feature it is concatinated (i.e. dimensions are doubled) -class SAGELayer : public GNNLayer { +template +class SAGELayer : public GNNLayer { public: //! Initializes the variables of the base class and also allocates additional //! memory for temporary matrices. Also initializes sync substrate for the //! weight matrix - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config, - const SAGELayerConfig& sage_config); + const SAGELayerConfig& sage_config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config), + sage_config_(sage_config), + input_column_intermediates_(dimensions.input_columns), + output_column_intermediates_(dimensions.output_columns) { + if (!sage_config_.disable_concat) { + // there are now 2 weight matrices used: one for self, one for aggregation + // abstractly it's one matrix: W = W1 | W2 + size_t num_weight_elements = this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns; + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE second layer weights ", + num_weight_elements, " (", + this->FloatElementsToGB(num_weight_elements), " GB)"); + // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU + layer_weights_2_.resize(num_weight_elements); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeight2(num_weight_elements); + } +#endif + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE second layer gradients ", + num_weight_elements, " (", + this->FloatElementsToGB(num_weight_elements), " GB)"); + layer_weight_gradients_2_.resize(num_weight_elements, 0); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateWeightGradient2(num_weight_elements); + } +#endif + + // reinit both weight matrices as one unit + this->PairGlorotBengioInit(&this->layer_weights_, &layer_weights_2_); +#ifdef GALOIS_ENABLE_GPU + 
if (device_personality == DevicePersonality::GPU_CUDA) { + // copy weight2 to GPU + gpu_object_.CopyToWeights2(layer_weights_2_); + p_layer_weights_2_ = PointerWithSize( + gpu_object_.layer_weights_2(), num_weight_elements); + p_layer_weight_gradients_2_ = PointerWithSize( + gpu_object_.layer_weight_gradients_2(), num_weight_elements); + } else { +#endif + // update the pointers to them as well as realloc will require it + p_layer_weights_2_ = PointerWithSize(layer_weights_2_); + p_layer_weight_gradients_2_ = + PointerWithSize(layer_weight_gradients_2_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + std::vector weight_size = {num_weight_elements}; + // initialize the optimizer + second_weight_optimizer_ = + std::make_unique(weight_size, 1); + } + + // TODO(loc) dropout uses input rows; this won't work if dropout is enabled + size_t num_in_temp_elements = this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns; + + // if (this->layer_number_ == 0) { + // // set this to true for layer 0; it avoids aggregation completely + // // in the last layer for the backward phase + // config_.disable_aggregate_after_update = true; + // // TODO this *will* hurt test evaluation because test eval has no + // // backward phase, so the end-to-end benefits do not exist there + // // Solution to this is to allocate all intermediate structures for both + // // cases + make sure resize handles both cases + // } + + // if in temp is smaller than out temp, or if dropout exists + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE input temp var 1 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE input temp var 2 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + size_t num_out_temp = this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + galois::gInfo(this->graph_.host_prefix(), "Creating layer ", + this->layer_number_, ", SAGE output temp var ", + num_out_temp, " (", this->FloatElementsToGB(num_out_temp), + " GB)"); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_out_temp); + } else { +#endif + out_temp_.resize(num_out_temp, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + this->layer_type_ = 
galois::GNNLayerType::kSAGE; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // init pointers with size + p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), + num_in_temp_elements); + p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), + num_in_temp_elements); + p_out_temp_ = PointerWithSize(gpu_object_.out_temp(), + num_output_elements); + } else { +#endif + p_in_temp_1_ = PointerWithSize(in_temp_1_); + p_in_temp_2_ = PointerWithSize(in_temp_2_); + p_out_temp_ = PointerWithSize(out_temp_); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + GALOIS_LOG_VERBOSE("SAGE layer initialized"); + } - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, config, SAGELayerConfig()) {} - SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + SAGELayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : SAGELayer(layer_num, graph, backward_output_matrix, dimensions, @@ -69,11 +225,350 @@ class SAGELayer : public GNNLayer { // Parent functions const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { + // galois::gDebug( + // "Layer ", this->layer_number_, " dims: ", + // layer_dimensions_.input_rows, " ", layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " + // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " + // ", layer_dimensions_.input_rows * layer_dimensions_.input_columns); + galois::StatTimer timer("ForwardPhase", kRegionName); + this->TimerStart(&timer); + + assert(input_embeddings.size() >= (this->layer_dimensions_.input_rows * + this->layer_dimensions_.input_columns)); + assert(this->p_forward_output_matrix_.size() >= + (this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns)); + + // pointer to input to operate on + const GNNFloat* input_data = input_embeddings.data(); + GNNFloat* agg_data; + // first, dropout + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + this->DoDropout(input_embeddings, &p_in_temp_1_); + input_data = p_in_temp_1_.data(); + agg_data = p_in_temp_2_.data(); + } else { + agg_data = p_in_temp_1_.data(); + } + + // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part + // which is done regardless + + // flip aggregate/update if dimensions favor it (do less work) + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + if (!this->config_.disable_dropout && + (this->layer_phase_ == GNNPhase::kTrain)) { + assert(p_in_temp_2_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns); + } else { + assert(p_in_temp_1_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.input_columns); + } + + // aggregation and update + AggregateAll(this->layer_dimensions_.input_columns, input_data, agg_data, + &input_column_intermediates_); + assert(this->p_forward_output_matrix_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + UpdateEmbeddings(agg_data, this->p_forward_output_matrix_.data(), true); + } else { + 
assert(p_out_temp_.size() >= this->layer_dimensions_.input_rows * + this->layer_dimensions_.output_columns); + + // update to aggregate + // FW + UpdateEmbeddings(input_data, p_out_temp_.data(), false); + + // A(FW) + assert(this->p_forward_output_matrix_.size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + AggregateAll(this->layer_dimensions_.output_columns, p_out_temp_.data(), + this->p_forward_output_matrix_.data(), + &output_column_intermediates_); + } + + if (!sage_config_.disable_concat) { + // FW1 is unaffected by the agg/update flip, so can to it + // separately + SelfFeatureUpdateEmbeddings(input_data, + this->p_forward_output_matrix_.data()); + } + + if (!this->config_.disable_activation) { + GALOIS_LOG_VERBOSE("Doing activation"); + this->Activation(); + } + + assert(this->p_forward_output_matrix_.size() >= + (this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns)); + + this->TimerStop(&timer); + + return this->p_forward_output_matrix_; + } PointerWithSize BackwardPhase(PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) final; + PointerWithSize* input_gradient) final { + galois::StatTimer timer("BackwardPhase", kRegionName); + galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", + kRegionName); + galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", + kRegionName); + this->TimerStart(&timer); + + assert(this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch); + + // derivative of activation + if (!this->config_.disable_activation) { + this->ActivationDerivative(input_gradient); + } + + // if dropout was used, use the dropout matrix for the input + galois::PointerWithSize input_data; + galois::PointerWithSize agg_data; + if (!this->config_.disable_dropout) { + // dropout result is currently stored in temp 1 + // needs to be used before it gets overwritten + input_data = p_in_temp_1_; + agg_data = p_in_temp_2_; + } else { + // no dropout = use vanilla input + input_data = prev_layer_input; + agg_data = p_in_temp_1_; + } + + // aggregate this here before gradient starts to get overwritten + // this is xform ffirst + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + // aggregate occurs regardless of layer being equal to 0 because it is + // required in this case for the weight gradient calculation + // this is (FW)' + // TODO: this is absolutely terrible performance wise as well; keep + // in mind + AggregateAll(this->layer_dimensions_.output_columns, + input_gradient->data(), p_out_temp_.data(), + &output_column_intermediates_, true); + } + + if (!sage_config_.disable_concat) { + if (this->layer_number_ != 0) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } + } else { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows); + } + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { 
+ gpu_object_.UpdateWeight2DerivativeGPU( + this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + } else { +#endif + // input data (prev layer input or temp1) or gradient need mask + // can mask gradient if layer == 0 + // otherwise must mask other + + galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); + this->TimerStart(&concat_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, input_data.data(), + input_gradient->data(), p_layer_weight_gradients_2_.data()); + this->TimerStop(&concat_grad_timer); + +#ifdef GALOIS_ENABLE_GPU + } +#endif + } + + weight_gradient_sync_timer2.start(); + this->WeightGradientSyncSum2(); + weight_gradient_sync_timer2.stop(); + + // derivative of aggregation/update + // TODO clean up logic here to reduce nesting + if (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + // aggdata can == p_intemp1; in other words, need to use before overwrite + // mask it, then use it + // XXX masking may not be required in sampling case where rows change + if (this->layer_number_ != 0 || sage_config_.disable_concat) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows); + } + } + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + // XXX output rows + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + } else { +#endif + // agg data holds aggregated feature vectors from forward phase + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + this->TimerStart(&normal_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, agg_data.data(), + input_gradient->data(), this->p_layer_weight_gradients_.data()); + this->TimerStop(&normal_grad_timer); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // 0 means input gradient shouldn't get masked + if (this->layer_number_ != 0) { + // NOTE: this is super nice because it avoids aggregation completely + // in the layer 0 setting + // ---unmasked--- + // transposed sgemm for derivative; in_temp is output + assert(input_gradient->size() >= + this->layer_dimensions_.output_rows * + this->layer_dimensions_.output_columns); + // pintemp1 contains (AF)' + // overwrites the dropout matrix that was in ptemp1 (needed for second + // weight matrix) + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), + true); + + // pback contains F' + // derivative of aggregate is the same due to symmetric graph + AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), + this->p_backward_output_matrix_.data(), + &input_column_intermediates_, true); + } + } else { + // xform first + + // --unmasked-- + + // disable concat is part of condition because otherwise this mask + // should have gotten done elsewhere 
+ if (this->layer_number_ != 0 && sage_config_.disable_concat) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } + } + + // layer number 0 means output needs to be masked because input cannot + // be masked + if (this->layer_number_ == 0) { + // if 0 then no input to mask: mask the gradient + // this is fine because gradient won't be used to get feature gradients + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows); + } + } + + // W' = F^T (FW)' + // TODO put this in a function +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.GetWeightGradientsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + } else { +#endif + // input col x input row * input row x output col + galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); + this->TimerStart(&normal_grad_timer); + galois::CBlasSGEMM( + CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, input_data.data(), + p_out_temp_.data(), this->p_layer_weight_gradients_.data()); + this->TimerStop(&normal_grad_timer); +#ifdef GALOIS_ENABLE_GPU + } +#endif + + // to get a correct result out temp mask cannot be masked; + // outtemp will only be masked if layer number is 0, so this + // is safe in all other cases + if (this->layer_number_ != 0) { + // derivative for update + // backout = F' + UpdateEmbeddingsDerivative( + p_out_temp_.data(), this->p_backward_output_matrix_.data(), false); + } + } + + weight_gradient_sync_timer.start(); + this->WeightGradientSyncSum(); + weight_gradient_sync_timer.stop(); + + // full gradient needed here; should occur after all updates + if (this->layer_number_ != 0) { + // deal with feature gradients for the self feature here + // this function will sum directly into the backward matrix + // input gradient never gets masked if layer number != 0 + SelfFeatureUpdateEmbeddingsDerivative( + input_gradient->data(), this->p_backward_output_matrix_.data()); + } + + if (!this->config_.disable_dropout && this->layer_number_ != 0) { + this->DoDropoutDerivative(); + } + + this->TimerStop(&timer); + return this->p_backward_output_matrix_; + } #ifdef GALOIS_ENABLE_GPU //! 
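Editor's note: with concatenation enabled, the SAGE forward output above is effectively O = (A X) W + X W2, where W2 is the second ("self feature") weight matrix; SelfFeatureUpdateEmbeddings accumulates its product on top of the aggregated term, and the backward phase therefore produces a separate gradient for W2. A dense, single-threaded sketch of the forward combination (naive loops stand in for the BLAS calls; names are illustrative):

#include <cstddef>
#include <vector>

// out (rows x out_cols) = agg (rows x in_cols) * w_agg + self (rows x in_cols) * w_self
// where w_agg/w_self are in_cols x out_cols and `agg` holds aggregated neighbor features.
void SageCombine(const std::vector<float>& agg, const std::vector<float>& self,
                 const std::vector<float>& w_agg, const std::vector<float>& w_self,
                 std::size_t rows, std::size_t in_cols, std::size_t out_cols,
                 std::vector<float>* out) {
  out->assign(rows * out_cols, 0.0f);
  for (std::size_t r = 0; r < rows; ++r) {
    for (std::size_t k = 0; k < in_cols; ++k) {
      float a = agg[r * in_cols + k];
      float s = self[r * in_cols + k];
      for (std::size_t c = 0; c < out_cols; ++c) {
        (*out)[r * out_cols + c] +=
            a * w_agg[k * out_cols + c] + s * w_self[k * out_cols + c];
      }
    }
  }
}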
Copies over self weight gradients to CPU from GPU @@ -93,52 +588,457 @@ class SAGELayer : public GNNLayer { void AggregateAllCPU( size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts, - bool is_backward); + galois::substrate::PerThreadStorage>*, + bool is_backward) { + // aggregation causes a row count change + size_t num_rows_to_handle; + if (!is_backward) { + num_rows_to_handle = this->layer_dimensions_.output_rows; + } else { + num_rows_to_handle = this->layer_dimensions_.input_rows; + } + + galois::do_all( + galois::iterate(*(this->graph_.begin()), num_rows_to_handle), + [&](size_t src) { + size_t index_to_src_feature = src * column_length; + // zero out src feature first + for (size_t i = 0; i < column_length; i++) { + aggregate_output[index_to_src_feature + i] = 0; + } + + GNNFloat source_norm = 0.0; + if (!this->config_.disable_normalization) { + source_norm = + this->graph_.GetDegreeNorm(src, this->graph_user_layer_number_); + } + + if (!is_backward) { + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.edge_begin(src); + e != this->graph_.edge_end(src); e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { + // XXX + // galois::gDebug("In here"); + if (this->IsSampledLayer()) { + if (!this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_)) { + continue; + } + } + } + size_t dst = this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = source_norm; + assert(norm_scale != 0); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + } else { + // loop through all destinations to grab the feature to aggregate + for (auto e = this->graph_.in_edge_begin(src); + e != this->graph_.in_edge_end(src); e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { + // XXX + if (this->IsSampledLayer()) { + if (!this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_)) { + continue; + } + } + } + size_t dst = this->graph_.GetInEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); + + // input row x output row in backward means that i shouldn't be + // touching nodes past output rows; the above sample check + // should deal with this where this matters + assert(dst < this->layer_dimensions_.output_rows); + + size_t index_to_dst_feature = dst * column_length; + + if (!this->config_.disable_normalization) { + GNNFloat norm_scale = this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_); + + assert(norm_scale != 0); + + galois::VectorMulAdd( + column_length, &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], norm_scale, + &aggregate_output[index_to_src_feature]); + } else { + // add dst feature to aggregate output + galois::VectorAdd(column_length, + &aggregate_output[index_to_src_feature], + &node_embeddings[index_to_dst_feature], + &aggregate_output[index_to_src_feature]); + } + } + } + }, + 
galois::chunk_size<1>(), galois::steal(), + galois::loopname("SAGEAggregateAll")); + } //! Performs aggregation for all nodes of the graph given the length of the //! vector to aggregate, the features themselves, an output array, and per //! thread storage for the intermediate scaling via norm factor - void - AggregateAll(size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>* pts); + void AggregateAll( + size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>* pts) { + AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); + } + void AggregateAll(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, galois::substrate::PerThreadStorage>* pts, - bool is_backward); + bool is_backward) { + std::string agg_timer_name = "AggregateCompute"; + std::string agg_sync_timer_name = "AggregateSync"; + size_t num_rows_to_handle; + if (!is_backward) { + agg_timer_name += "Forward"; + agg_sync_timer_name += "Forward"; + num_rows_to_handle = this->layer_dimensions_.output_rows; + } else { + agg_timer_name += "Backward"; + agg_sync_timer_name += "Backward"; + num_rows_to_handle = this->layer_dimensions_.input_rows; + } + galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); + galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), + kRegionName); + this->TimerStart(&timer); + +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + if (!this->IsSampledLayer()) { + gpu_object_.AggregateAllGPU( + this->graph_.GetGPUGraph(), this->graph_.size(), column_length, + node_embeddings, aggregate_output, + !this->config_.disable_normalization, is_backward); + } else { + // TODO(hochan) + GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); + } + this->graph_.AggregateSyncGPU(aggregate_output, column_length, + this->layer_number_); + } else { +#endif + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); + this->TimerStop(&timer); + + // aggregate sync + aggregate_all_sync_timer.start(); + this->graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_rows_to_handle); + aggregate_all_sync_timer.stop(); +#ifdef GALOIS_ENABLE_GPU + } +#endif + } //! 
Do embedding update via mxm with this layer's weights (forward) void UpdateEmbeddings(const GNNFloat* node_embeddings, GNNFloat* output, - bool after); + bool after) { + galois::StatTimer timer("ForwardXForm", kRegionName); + this->TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + // TODO self change + // XXX(hochan) output rows + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsGPU(this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // galois::gDebug("Layer ", this->graph_user_layer_number_, " ", + // layer_dimensions_.output_rows, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); + // CPU version is just a call into CBlas + if (after) { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.output_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + this->p_layer_weights_.data(), output); + } else { + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + this->p_layer_weights_.data(), output); + } +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output); + GNNFloat* output) { + galois::StatTimer timer("SelfForwardXForm", kRegionName); + this->TimerStart(&timer); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, output); + } else { +#endif + // note use of layer weights 2 differentiates this from above + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, this->layer_dimensions_.output_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, node_embeddings, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! 
Calculate graident via mxm with last layer's gradients (backward) void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output, - bool after); + bool after) { + galois::StatTimer timer("BackwardXForm", kRegionName); + this->TimerStart(&timer); + + assert(this->p_layer_weights_.size() >= + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.UpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, gradients, + base_gpu_object_.layer_weights(), output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y note input rows is used here due to + // transpose of aggregation + if (after) { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->p_layer_weights_.data(), output); + } else { + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + this->p_layer_weights_.data(), output); + } +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } + //! Same as above but uses the second set of weights (self feature weights) void SelfFeatureUpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output); + GNNFloat* output) { + galois::StatTimer timer("SelfBackwardXForm", kRegionName); + this->TimerStart(&timer); + + assert(this->p_layer_weights_.size() >= + this->layer_dimensions_.input_columns * + this->layer_dimensions_.output_columns); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( + this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, output); + } else { +#endif + // difference is Trans for B matrix (data) to get z by y (weights is y by + // z normally); result is x by y true at end -> accumulate + galois::CBlasSGEMM(CblasNoTrans, CblasTrans, + this->layer_dimensions_.output_rows, + this->layer_dimensions_.output_columns, + this->layer_dimensions_.input_columns, gradients, + layer_weights_2_.data(), output, true); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&timer); + } //! override parent function: optimizes the second set of weights as well - void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number); + void OptimizeLayer(BaseOptimizer* optimizer, size_t trainable_layer_number) { + galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); + total_gradient_timer.start(); + optimizer->GradientDescent(this->p_layer_weight_gradients_, + this->p_layer_weights_, trainable_layer_number); + if (!sage_config_.disable_concat) { + second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, + p_layer_weights_2_, 0); + } + total_gradient_timer.stop(); + } //! 
Sync second set of weight gradients - void WeightGradientSyncSum2(); + void WeightGradientSyncSum2() { + galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); + this->TimerStart(&clubbed_timer); + galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); + this->TimerStart(&t); + int weight_size = static_cast(p_layer_weight_gradients_2_.size()); + +#ifdef GALOIS_ENABLE_GPU + bool gpu_direct_enabled = false; + if (device_personality == DevicePersonality::GPU_CUDA && + !gpu_direct_enabled) { + gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); + MPI_Allreduce(MPI_IN_PLACE, + static_cast(layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); + gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); + } else { +#endif + // TODO(loc) remove this limitation later; can just do a loop over the + // weight matrix + if (p_layer_weight_gradients_2_.size() > + size_t{std::numeric_limits::max()}) { + GALOIS_LOG_FATAL( + "Weight sync code does not handle size larger than max " + "int at the moment"); + } + MPI_Allreduce(MPI_IN_PLACE, + static_cast(p_layer_weight_gradients_2_.data()), + weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); +#ifdef GALOIS_ENABLE_GPU + } +#endif + this->TimerStop(&t); + this->TimerStop(&clubbed_timer); + } void ResizeRows(size_t new_row_count) { - GNNLayer::ResizeRows(new_row_count); + GNNLayer::ResizeRows(new_row_count); ResizeIntermediates(new_row_count, new_row_count); } void ResizeInputOutputRows(size_t input_row, size_t output_row) { - GNNLayer::ResizeInputOutputRows(input_row, output_row); + GNNLayer::ResizeInputOutputRows(input_row, output_row); ResizeIntermediates(input_row, output_row); } - void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows); + void ResizeIntermediates(size_t new_input_rows, size_t new_output_rows) { + size_t num_in_temp_elements = + new_output_rows * this->layer_dimensions_.input_columns; + // galois::gDebug(this->graph_.host_prefix(), "Layer num ", + // this->layer_number_, " ", + // in_temp_1_.size(), " and ", num_in_temp_elements, " ", + // layer_dimensions_.input_columns, " ", + // layer_dimensions_.output_columns); + + // if in temp is smaller than out temp, or if dropout exists + if (!this->config_.disable_dropout || + this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns) { + if (in_temp_1_.size() < num_in_temp_elements) { + galois::gInfo(this->graph_.host_prefix(), "Resize layer ", + this->layer_number_, ", SAGE input temp var 1 ", + num_in_temp_elements, " (", + this->FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + // XXX(hochan) + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_1_ = PointerWithSize(in_temp_1_); + } + } + + // only on in dropout case + if in temp is smaller than out temp + if (!this->config_.disable_dropout && + (this->config_.disable_aggregate_after_update || + this->layer_dimensions_.input_columns <= + this->layer_dimensions_.output_columns)) { + if (in_temp_2_.size() < num_in_temp_elements) { + galois::gInfo(this->graph_.host_prefix(), "Resize layer ", + this->layer_number_, ", SAGE input temp var 2 ", + num_in_temp_elements, " (", + 
this->FloatElementsToGB(num_in_temp_elements), " GB)"); + size_t buffer_size = num_in_temp_elements * 0.02; +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); + } else { +#endif + in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + // XXX(hochan) GPU + p_in_temp_2_ = PointerWithSize(in_temp_2_); + } + } + + size_t num_output_temp_elements = + new_input_rows * this->layer_dimensions_.output_columns; + // only needed if out temp would be smaller than intemp + if (!this->config_.disable_aggregate_after_update && + this->layer_dimensions_.input_columns > + this->layer_dimensions_.output_columns) { + if (out_temp_.size() < num_output_temp_elements) { + galois::gInfo( + this->graph_.host_prefix(), "Resize layer ", this->layer_number_, + ", SAGE output temp var ", num_output_temp_elements, " (", + this->FloatElementsToGB(num_output_temp_elements), " GB)"); + size_t buffer_size = (num_output_temp_elements * 0.02); +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); + } else { +#endif + out_temp_.resize(num_output_temp_elements + buffer_size, 0); +#ifdef GALOIS_ENABLE_GPU + } +#endif + p_out_temp_ = PointerWithSize(out_temp_); + } + } + } //! SAGE config params SAGELayerConfig sage_config_; diff --git a/libgnn/include/galois/layers/SigmoidLayer.h b/libgnn/include/galois/layers/SigmoidLayer.h index 209929bf30..26d9271d37 100644 --- a/libgnn/include/galois/layers/SigmoidLayer.h +++ b/libgnn/include/galois/layers/SigmoidLayer.h @@ -1,5 +1,8 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + +#include // TODO(loc) GPU support @@ -8,17 +11,18 @@ namespace galois { //! Sigmoid layer: applies sigmoid function element wise to each element of the //! input. //! Meant for use with *multi-class* labels. -class SigmoidLayer : public GNNLayer { +template +class SigmoidLayer : public GNNLayer { public: - SigmoidLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + SigmoidLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false}), + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false}), input_loss_(dimensions.input_rows), norm_gradient_vectors_(dimensions.input_columns) { - output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; + this->output_layer_type_ = galois::GNNOutputLayerType::kSigmoid; // input/output columns must be equivalent GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes @@ -27,18 +31,117 @@ class SigmoidLayer : public GNNLayer { //! Normalizes all elements by applying sigmoid to all of them const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + printf("%p\n", input_embeddings.data()); + return p_layer_weights_; +#else + return ForwardPhaseCPU(input_embeddings); +#endif + } //! Get gradients to fix distribution such that it leans more towards //! multiclass ground truth. 
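+  //! Derivation note (sketch, assuming the usual multi-label binary
+  //! cross-entropy loss used by the forward pass): with p = sigmoid(x) and
+  //! L = -sum_k [y_k * log(p_k) + (1 - y_k) * log(1 - p_k)], the gradient
+  //! reduces to dL/dx_k = p_k - y_k, which is why BackwardPhaseCPU below
+  //! only subtracts the ground-truth vector from the forward output.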
PointerWithSize BackwardPhase(PointerWithSize, - PointerWithSize*) final; + PointerWithSize*) final { +#ifdef GALOIS_ENABLE_GPU + // TODO(loc) when GPU needs it + return p_layer_weights_; +#else + return BackwardPhaseCPU(); +#endif + } private: const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); - PointerWithSize BackwardPhaseCPU(); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::gWarn( + "Sigmoid layer has not been kept up to date; do not use unless sure" + " it works with new changes"); + + input_loss_.assign(input_loss_.size(), 0.0); + this->forward_output_matrix_.assign(this->forward_output_matrix_.size(), + 0.0); + const size_t feature_length = this->layer_dimensions_.input_columns; + this->node_count_.reset(); + this->float_accumulator_.reset(); + + galois::do_all( + galois::iterate(this->graph_.begin(), this->graph_.end()), + [&](const unsigned local_node) { + if (this->graph_.IsValidForPhase(local_node, this->layer_phase_)) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(local_node)) + return; + } + + this->node_count_ += 1; + + size_t node_offset = feature_length * local_node; + // sigmoid the values for this node + for (unsigned index = 0; index < feature_length; index++) { + // splitting in half is done for numerical stability of log + if (input_embeddings[node_offset + index] >= 0) { + this->forward_output_matrix_[node_offset + index] = + 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); + } else { + this->forward_output_matrix_[node_offset + index] = + expf(input_embeddings[node_offset + index]) / + (1.0 + expf(input_embeddings[node_offset + index])); + } + } + + input_loss_[local_node] = GNNCrossEntropy( + feature_length, this->graph_.GetMultiClassLabel(local_node), + &this->forward_output_matrix_[node_offset]); + // TODO(loc) normalize the loss + this->float_accumulator_ += input_loss_[local_node]; + } + }, + galois::steal(), galois::loopname("SigmoidForward")); + + galois::gPrint( + "Average loss is ", + this->float_accumulator_.reduce() / this->node_count_.reduce(), "\n"); + return this->forward_output_matrix_; + } + + PointerWithSize BackwardPhaseCPU() { + const size_t feature_length = this->layer_dimensions_.input_columns; + galois::do_all( + galois::iterate(size_t{0}, this->p_backward_output_matrix_.size()), + [&](size_t i) { this->p_backward_output_matrix_[i] = 0; }); + + galois::do_all( + galois::iterate(this->graph_.begin(), this->graph_.end()), + [&](const unsigned local_node) { + if (this->graph_.IsValidForPhase(local_node, this->layer_phase_)) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraph(local_node)) + return; + } + + // derivative cross entropy into norm grad + const GNNLabel* ground_truth = + this->graph_.GetMultiClassLabel(local_node); + size_t node_offset = feature_length * local_node; + // sigmoid-cross-entropy derivative: turns out all it is is simple + // subtraction + for (unsigned index = 0; index < feature_length; index++) { + this->p_backward_output_matrix_[node_offset + index] = + this->forward_output_matrix_[node_offset + index] - + ground_truth[index]; + } + } + }, + galois::steal(), galois::loopname("SigmoidBackward")); + + return this->p_backward_output_matrix_; + } //! 
Loss for each row of the input std::vector input_loss_; diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index 3878b29685..b55e37f05d 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -1,5 +1,7 @@ #pragma once #include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + #ifdef GALOIS_ENABLE_GPU #include "galois/layers/SoftmaxLayer.cuh" #endif @@ -9,13 +11,14 @@ namespace galois { //! Softmax layer: takes each row of the input matrix and creates a probability //! distribution based on the magnitude of elements in each row. //! Currently this only works with **single class* labels and is coded as such. -class SoftmaxLayer : public GNNLayer { +template +class SoftmaxLayer : public GNNLayer { public: - SoftmaxLayer(size_t layer_num, const galois::graphs::GNNGraph& graph, - + SoftmaxLayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) - : GNNLayer( + : GNNLayer( layer_num, graph, backward_output_matrix, dimensions, GNNLayerConfig{.allocate_weights = false, .disable_output = true}), #ifdef GALOIS_ENABLE_GPU @@ -27,7 +30,7 @@ class SoftmaxLayer : public GNNLayer { softmax_temp_vectors_(dimensions.input_columns) { - output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; + this->output_layer_type_ = galois::GNNOutputLayerType::kSoftmax; // input/output columns must be equivalent in a softmax GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); // output needs to match number of possible classes @@ -35,21 +38,146 @@ class SoftmaxLayer : public GNNLayer { } const PointerWithSize - ForwardPhaseCPU(const PointerWithSize input_embeddings); + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::StatTimer Timer("SoftmaxForward", "SoftmaxLayer"); + this->TimerStart(&Timer); + + // note: p_backward == input_embeddings + input_loss_.assign(input_loss_.size(), 0.0); + const size_t feature_length = this->layer_dimensions_.input_columns; +#ifndef NDEBUG + galois::DGAccumulator loss_accum; + galois::DGAccumulator handled; + loss_accum.reset(); + handled.reset(); +#endif + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned i) { + if (this->IsSampledLayer()) { + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && + !this->graph_.IsInSampledGraphSubgraph(i)) { + // XXX + VectorZero(feature_length, + &this->p_backward_output_matrix_[i * feature_length]); + return; + } + } + + // do softmax + GNNSoftmax(feature_length, &input_embeddings[feature_length * i], + &this->p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID + std::vector* ground_truth_vec = + ground_truth_vectors_.getLocal(); + assert(ground_truth_vec->size() == feature_length); + ground_truth_vec->assign(ground_truth_vec->size(), 0.0); + // single class label is an index; set the correct one + (*ground_truth_vec)[static_cast( + this->graph_.GetSingleClassLabel(i))] = 1.0; + + // calculate loss for this LID (note not all i will be filled) + input_loss_[i] = GNNCrossEntropy( + feature_length, ground_truth_vec->data(), + &this->p_backward_output_matrix_[feature_length * i]); +#ifndef NDEBUG + loss_accum += input_loss_[i]; + handled += 1; +#endif + }, + // TODO chunk size? 
+ // steal on as some threads may have nothing to work on + // galois::steal(), galois::loopname("SoftmaxForward")); + galois::steal()); +#ifndef NDEBUG + GNNFloat reduced_loss = loss_accum.reduce(); + size_t t = handled.reduce(); + galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, + "\n"); +#endif + + this->TimerStop(&Timer); + return this->p_backward_output_matrix_; + } + //! Creates probability distribution of each row of input const PointerWithSize - ForwardPhase(const PointerWithSize input_embeddings) final; + ForwardPhase(const PointerWithSize input_embeddings) final { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.ForwardPhaseGPU(this->layer_phase_, this->graph_.size(), + this->layer_dimensions_.input_columns, + input_embeddings.data(), + this->p_backward_output_matrix_.data()); + return this->p_backward_output_matrix_; + } +#endif + return ForwardPhaseCPU(input_embeddings); + } + + PointerWithSize BackwardPhaseCPU() { + galois::StatTimer Timer("SoftmaxBackward", "SoftmaxLayer"); + this->TimerStart(&Timer); + + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned node) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraphSubgraph(node)) + return; + } + + size_t correct = this->graph_.GetSingleClassLabel(node); + // See here for explanation for why this works + // https://gombru.github.io/2018/05/23/cross_entropy_loss/ + // Derivation of full combined derivative isn't there, but some + // emperical inspection tells me this is likely correct + // TODO(loc) work it out myself + for (size_t idx = 0; idx < feature_length; idx++) { + if (idx == correct) { + // positive class + this->p_backward_output_matrix_[node * feature_length + idx] = + this->p_backward_output_matrix_[node * feature_length + idx] - + 1; + } else { + // negative class + this->p_backward_output_matrix_[node * feature_length + idx] = + this->p_backward_output_matrix_[node * feature_length + idx]; + } + } + }, + galois::steal(), galois::loopname("SoftmaxBackward")); + + this->TimerStop(&Timer); + + return this->p_backward_output_matrix_; + } - PointerWithSize BackwardPhaseCPU(); //! Get gradients to fix distribution such that it leans more towards single //! class ground truth. 
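+  //! Derivation note (sketch): BackwardPhaseCPU above fuses softmax with
+  //! single-class cross-entropy. With p = softmax(x) and correct class c,
+  //! L = -log(p_c), so dL/dx_k = p_k - 1 for k == c and p_k otherwise,
+  //! matching the "subtract 1 at the correct index" loop above.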
PointerWithSize - BackwardPhase(PointerWithSize in_out, - PointerWithSize* input_gradient) final; + BackwardPhase(PointerWithSize, + PointerWithSize*) final { +#ifdef GALOIS_ENABLE_GPU + if (device_personality == DevicePersonality::GPU_CUDA) { + gpu_object_.BackwardPhaseGPU(this->layer_phase_, this->graph_.size(), + this->layer_dimensions_.input_columns, + this->p_backward_output_matrix_.data(), + this->p_backward_output_matrix_.data()); + return this->p_backward_output_matrix_; + } +#endif + return BackwardPhaseCPU(); + } void ResizeRows(size_t new_row_count) { - layer_dimensions_.input_rows = new_row_count; - layer_dimensions_.output_rows = new_row_count; + this->layer_dimensions_.input_rows = new_row_count; + this->layer_dimensions_.output_rows = new_row_count; // no output resize if (input_loss_.size() < new_row_count) { input_loss_.resize(new_row_count * 1.02); @@ -58,8 +186,8 @@ class SoftmaxLayer : public GNNLayer { void ResizeInputOutputRows(size_t in, size_t out) { assert(in == out); - layer_dimensions_.input_rows = in; - layer_dimensions_.output_rows = out; + this->layer_dimensions_.input_rows = in; + this->layer_dimensions_.output_rows = out; // no output resize if (input_loss_.size() < in) { input_loss_.resize(in * 1.02); diff --git a/libgnn/src/DistributedMinibatchTracker.cpp b/libgnn/src/DistributedMinibatchTracker.cpp index dddbc33519..4f25252b0a 100644 --- a/libgnn/src/DistributedMinibatchTracker.cpp +++ b/libgnn/src/DistributedMinibatchTracker.cpp @@ -32,7 +32,7 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { if (host == 0) { start = 0; end = std::min(num_per_unit * sampled_num_on_hosts_[host], - (uint32_t)total_minibatch_size_); + (uint32_t)total_minibatch_size_); } else if (host == (num_hosts_ - 1)) { start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], (uint32_t)total_minibatch_size_); @@ -41,7 +41,7 @@ size_t galois::DistributedMinibatchTracker::GetNumberForNextMinibatch() { start = std::min(num_per_unit * sampled_num_on_hosts_[host - 1], (uint32_t)total_minibatch_size_); end = std::min(num_per_unit * sampled_num_on_hosts_[host], - (uint32_t)total_minibatch_size_); + (uint32_t)total_minibatch_size_); } uint32_t proposed_to_take = end - start; diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index 582fba95f6..c25f3ae7ec 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -57,8 +57,8 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; __m512 scale_vec_main = _mm512_set_ps( - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { _mm512_storeu_ps( &output[i], diff --git a/libgnn/src/GraphNeuralNetwork.cpp b/libgnn/src/GraphNeuralNetwork.cpp deleted file mode 100644 index 201da985d5..0000000000 --- a/libgnn/src/GraphNeuralNetwork.cpp +++ /dev/null @@ -1,818 +0,0 @@ -#include "galois/GNNMath.h" -#include "galois/GraphNeuralNetwork.h" -#include "galois/layers/DenseLayer.h" -#include "galois/layers/GraphConvolutionalLayer.h" -#include "galois/layers/L2NormLayer.h" -#include "galois/layers/SAGELayer.h" -#include "galois/layers/SigmoidLayer.h" -#include 
"galois/layers/SoftmaxLayer.h" - -galois::GraphNeuralNetwork::GraphNeuralNetwork( - std::unique_ptr graph, - std::unique_ptr optimizer, - galois::GraphNeuralNetworkConfig&& config) - : graph_(std::move(graph)), optimizer_(std::move(optimizer)), - config_(std::move(config)) { - if (config_.do_sampling_ && config_.use_train_subgraph_) { - GALOIS_LOG_FATAL("Do not set train subgraph and sampling at same time " - "(sampling uses training subgraph already)"); - } - // max number of rows that can be passed as inputs; allocate space for it as - // this will be the # of rows for each layer - size_t max_rows = graph_->size(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->ResizeGPULayerVector(config_.num_intermediate_layers()); - } -#endif - // used for chaining layers together; begins as nullptr - PointerWithSize prev_output_layer(nullptr, 0); - num_graph_user_layers_ = 0; - - // create the intermediate layers - for (size_t i = 0; i < config_.num_intermediate_layers(); i++) { - GNNLayerType layer_type = config_.intermediate_layer_type(i); - size_t prev_layer_columns; - - if (i != 0) { - // grab previous layer's size - prev_layer_columns = config_.intermediate_layer_size(i - 1); - } else { - // first layer means the input columns are # features in graph - prev_layer_columns = graph_->node_feature_length(); - } - - // max dims - GNNLayerDimensions layer_dims = {.input_rows = max_rows, - .input_columns = prev_layer_columns, - .output_columns = - config_.intermediate_layer_size(i), - .output_rows = max_rows}; - - // test minibatch size: if it's not enabled, then currently the full - // graph is used (should really only subgraph the test nodes, though; - // that's a TODO) - if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && - config_.test_minibatch_size()) { - galois::gInfo("Not allocating rows"); - // set to 0 here to make it allocate nothing - layer_dims.input_rows = 0; - layer_dims.output_rows = 0; - } - - switch (layer_type) { - case GNNLayerType::kGraphConvolutional: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); - break; - case GNNLayerType::kSAGE: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - gnn_layers_.back()->SetGraphUserLayerNumber(num_graph_user_layers_++); -#ifdef GALOIS_ENABLE_GPU - // TODO(loc/hochan) sage layer gpu -#endif - break; - case GNNLayerType::kL2Norm: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - break; - case GNNLayerType::kDense: - gnn_layers_.push_back(std::move(std::make_unique( - i, *graph_, &prev_output_layer, layer_dims, - config_.default_layer_config()))); - break; - default: - GALOIS_LOG_FATAL("Invalid layer type during network construction"); - } - - // update output layer for next layer - prev_output_layer = gnn_layers_.back()->GetForwardOutput(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - graph_->InitLayerVectorMetaObjects( - i, galois::runtime::getSystemNetworkInterface().Num, - layer_dims.input_columns, layer_dims.output_columns); - } -#endif - } - - // loop backward and find last GCN/SAGE (main) layer to disable activation - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - 
back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - galois::gDebug("Disabling activation on layer ", - (*back_iter)->layer_number(), "\n"); - (*back_iter)->DisableActivation(); - break; - } - } - - if (config_.do_sampling() || config_.use_train_subgraph_ || - config.train_minibatch_size() || config.test_minibatch_size()) { - // output layer not included; it will never involve sampling - graph_->InitializeSamplingData(num_graph_user_layers_, - config_.use_train_subgraph_); - } - - num_hosts_ = galois::runtime::getSystemNetworkInterface().Num; - if (config_.train_minibatch_size()) { - graph_->SetupTrainBatcher(config_.train_minibatch_size()); - // size_t local_num = - // if (num_hosts_ > 1) { - // dist_minibatch_tracker_ = std::make_unique( - // galois::runtime::getSystemNetworkInterface().ID, num_hosts_, - // local_num, config_.train_minibatch_size()); - //} - } - - if (config_.test_minibatch_size()) { - graph_->SetupTestBatcher(config_.test_minibatch_size()); - } - - // create the output layer - GNNLayerDimensions output_dims = { - .input_rows = max_rows, - // get last intermediate layer column size - .input_columns = config_.intermediate_layer_size( - config_.num_intermediate_layers() - 1), - .output_columns = config_.output_layer_size(), - .output_rows = max_rows}; - - if ((config_.train_minibatch_size() || config_.use_train_subgraph_) && - config_.test_minibatch_size()) { - output_dims.input_rows = 0; - output_dims.output_rows = 0; - } - - switch (config_.output_layer_type()) { - case (GNNOutputLayerType::kSoftmax): - gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, &prev_output_layer, - output_dims))); - break; - case (GNNOutputLayerType::kSigmoid): - gnn_layers_.push_back(std::move(std::make_unique( - config_.num_intermediate_layers(), *graph_, &prev_output_layer, - output_dims))); - break; - default: - GALOIS_LOG_FATAL("Invalid layer type during network construction"); - } - - // sanity checking multi-class + output layer - if (!graph_->is_single_class_label() && - (config_.output_layer_type() != GNNOutputLayerType::kSigmoid)) { - GALOIS_LOG_WARN( - "Using a non-sigmoid output layer with a multi-class label!"); - // if debug mode just kill program - assert(false); - } - - // flip sampling on layers - if (config_.use_train_subgraph_ || config_.do_sampling() || - config_.train_minibatch_size()) { - for (std::unique_ptr& ptr : gnn_layers_) { - ptr->EnableSampling(); - } - } -} - -float galois::GraphNeuralNetwork::MinibatchedTesting() { - galois::gDebug("Minibatched Testing"); - graph_->DisableSubgraph(); - graph_->ResetTestMinibatcher(); - SetLayerPhases(galois::GNNPhase::kBatch); - - bool choose_all_status = graph_->SubgraphChooseAllStatus(); - - uint32_t correct = 0; - uint32_t total = 0; - while (true) { - work_left_.reset(); - // size_t seed_node_count = graph_->PrepareNextTestMinibatch(); - graph_->PrepareNextTestMinibatch(); - // last layer input size/output rows becomes seed node size - // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - // seed_node_count); - size_t num_sampled_layers = 0; - - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - // you can minibatch with sampling or minibatch and grab all - // 
relevant neighbors - // size_t current_sample_size; - graph_->SampleAllEdges((*back_iter)->graph_user_layer_number(), false, - num_sampled_layers + 1); - // resize this layer, change seed node count - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, seed_node_count); - // seed_node_count = current_sample_size; - - num_sampled_layers++; - // XXX resizes above only work for SAGE layers; will break if other - // layers are tested - } - } - - // resize layer matrices - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - graph_->EnableSubgraphChooseAll(); - CorrectBackwardLinks(); - - const PointerWithSize batch_pred = DoInference(); - std::pair correct_total = - graph_->GetBatchAccuracy(batch_pred); - - correct += correct_total.first; - total += correct_total.second; - - work_left_ += graph_->MoreTestMinibatches(); - char global_work_left = work_left_.reduce(); - if (!global_work_left) { - break; - } - } - - galois::gInfo("Minibatching Correct / Total ", correct, " ", total); - - if (choose_all_status) { - graph_->EnableSubgraphChooseAll(); - } else { - graph_->DisableSubgraphChooseAll(); - } - - return (1.0 * correct) / (1.0 * total); -} - -float galois::GraphNeuralNetwork::Train(size_t num_epochs) { - EnableTimers(); - const size_t this_host = graph_->host_id(); - float train_accuracy{0.f}; - std::vector subgraph_layer_sizes; - // this subgraph only needs to be created once - if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { - galois::StatTimer total_subgraph_construction_timer("TotalSubGraphConstruction", kRegionName); - galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); - galois::StatTimer edge_sampling_timer("SampleAllEdges", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - total_subgraph_construction_timer.start(); - - setup_neighborhood_sample_timer.start(); - // Setup the subgraph to only be the training graph - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - setup_neighborhood_sample_timer.stop(); - - subgraph_layer_sizes.emplace_back(local_seed_node_count); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - edge_sampling_timer.start(); - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - for (auto back_iter = gnn_layers_.rbegin(); back_iter != gnn_layers_.rend(); - back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for train subgraph for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - // resizing - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - subgraph_layer_sizes.emplace_back(local_seed_node_count); - num_sampled_layers++; - } - } - edge_sampling_timer.stop(); - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - total_subgraph_construction_timer.stop(); - } - - galois::StatTimer epoch_timer("TrainingTime", kRegionName); - galois::StatTimer 
validation_timer("ValidationTime", kRegionName); - galois::StatTimer epoch_test_timer("TestTime", kRegionName); - - for (size_t epoch = 0; epoch < num_epochs; epoch++) { - epoch_timer.start(); - // swap to train subgraph - if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { - graph_->EnableSubgraph(); - // TODO(loc) this doesn't actually function as expected anymore - // with the numerous changes to the system; this commenting - // out is more of a hack for the train subgraph option (which - // probably shouldn't be used anyways) - - // size_t l_count = 0; - // gnn_layers_.back()->ResizeRows(subgraph_layer_sizes[0]); - // for (auto back_iter = gnn_layers_.rbegin(); - // back_iter != gnn_layers_.rend(); back_iter++) { - // GNNLayerType layer_type = (*back_iter)->layer_type(); - // if (layer_type == GNNLayerType::kGraphConvolutional || - // layer_type == GNNLayerType::kSAGE) { - // (*back_iter) - // ->ResizeInputOutputRows(subgraph_layer_sizes[l_count + 1], - // subgraph_layer_sizes[l_count]); - // l_count++; - // } - //} - CorrectBackwardLinks(); - } - - // beginning of epoch sampling (no minibatches) - if (config_.do_sampling() && !config_.train_minibatch_size()) { - galois::StatTimer mb_timer("EpochSubgraphCreation", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - galois::StatTimer setup_neighborhood_sample_timer("SetupNeighborhoodSample", kRegionName); - galois::StatTimer edge_sampling_timer("SampleEdges", kRegionName); - mb_timer.start(); - - setup_neighborhood_sample_timer.start(); - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - setup_neighborhood_sample_timer.stop(); - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - - edge_sampling_timer.start(); - // work backwards on GCN/SAGE layers - // loop backward and find last GCN/SAGE (main) layer to disable activation - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleEdges( - (*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_, num_sampled_layers + 1); - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - edge_sampling_timer.stop(); - // resize layer matrices - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - mb_timer.stop(); - } - - if (!config_.train_minibatch_size()) { - // no minibatching, full batch - const PointerWithSize predictions = DoInference(); - // have to get accuracy here because gradient prop destroys the - // predictions matrix - train_accuracy = GetGlobalAccuracy(predictions); - GradientPropagation(); - } else { - graph_->ResetTrainMinibatcher(); - // if (num_hosts_ > 1) { - // dist_minibatch_tracker_->ResetEpoch(); - //} - - SetLayerPhases(galois::GNNPhase::kBatch); - - size_t batch_num = 
0; - - // create mini batch graphs and loop until minibatches on all hosts done - while (true) { - galois::StatTimer prep_timer("PrepNextMinibatch", kRegionName); - galois::StatTimer sample_time("MinibatchSampling", kRegionName); - galois::StatTimer mb_timer("MinibatchSubgraphCreation", kRegionName); - galois::StatTimer subgraph_construction_timer("SubGraphConstruction", kRegionName); - mb_timer.start(); - - galois::Timer batch_timer; - batch_timer.start(); - work_left_.reset(); - galois::gInfo("Epoch ", epoch, " batch ", batch_num++); - // break when all hosts are done with minibatches - prep_timer.start(); - size_t seed_node_count; - // if (num_hosts_ > 1) { - // size_t num_for_next_batch = - // dist_minibatch_tracker_->GetNumberForNextMinibatch(); - // galois::gInfo(graph_->host_prefix(), "Sampling ", - // num_for_next_batch, - // " for this minibatch"); - // seed_node_count = - // graph_->PrepareNextTrainMinibatch(num_for_next_batch); - //} else { - //} - seed_node_count = graph_->PrepareNextTrainMinibatch(); - - galois::gDebug(graph_->host_prefix(), - "Number of local seed nodes is for batch is ", - seed_node_count); - prep_timer.stop(); - - // last layer input size/output rows becomes seed node size - // gnn_layers_.back()->ResizeInputOutputRows(seed_node_count, - // seed_node_count); - - sample_time.start(); - // +1 later in call because 0 is already taken - size_t num_sampled_layers = 0; - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - // you can minibatch with sampling or minibatch and grab all - // relevant neighbors - size_t current_sample_size; - - if (config_.do_sampling()) { - current_sample_size = graph_->SampleEdges( - (*back_iter)->graph_user_layer_number(), - config_.fan_out_vector_[num_sampled_layers], - config_.inductive_subgraph_, num_sampled_layers + 1); - } else { - current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - } - - galois::gDebug(graph_->host_prefix(), - "Number of local nodes for layer ", - (*back_iter)->graph_user_layer_number(), " is ", - current_sample_size); - - // resize this layer, change seed node count - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, seed_node_count); - seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - sample_time.stop(); - - // resize layer matrices - subgraph_construction_timer.start(); - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - subgraph_construction_timer.stop(); - CorrectBackwardLinks(); - - // XXX resizes above only work for SAGE layers; will break if other - // layers are tested - - mb_timer.stop(); - - const PointerWithSize batch_pred = DoInference(); - train_accuracy = GetGlobalAccuracy(batch_pred); - GradientPropagation(); - - work_left_ += graph_->MoreTrainMinibatches(); - char global_work_left = work_left_.reduce(); - batch_timer.stop(); - epoch_timer.stop(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, - " time ", batch_timer.get(), "\n"); - - bool test_eval = - config_.minibatch_test_interval_ - ? 
(batch_num - 1) % config_.minibatch_test_interval_ == 0 - : false; - - if (test_eval) { - DisableTimers(); - float test_acc; - if (!config_.test_minibatch_size()) { - // TODO something about this path breaks accuracy - GALOIS_LOG_FATAL("this path breaks accuracy for the rest of the " - "run for some reason"); - bool f = graph_->SubgraphChooseAllStatus(); - graph_->DisableSubgraph(); - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // TODO nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - graph_->EnableSubgraphChooseAll(); - const PointerWithSize test_pred = DoInference(); - test_acc = GetGlobalAccuracy(test_pred); - graph_->SetSubgraphChooseAll(f); - } else { - test_acc = MinibatchedTesting(); - } - - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Test accuracy is ", test_acc, "\n"); - const std::string test_name_acc = - "TestEpoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num - 1) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, test_name_acc, - test_acc); - } - - // report the training time elapsed at this point in time - galois::runtime::reportStat_Single( - kRegionName, - "ElapsedTrainTimeEpoch" + std::to_string(epoch) + "Batch" + - std::to_string(batch_num - 1), - epoch_timer.get()); - // revert to training phase for next epoch - SetLayerPhases(galois::GNNPhase::kTrain); - EnableTimers(); - } - - epoch_timer.start(); - - if (!global_work_left) { - // if (num_hosts_ > 1) { - // GALOIS_LOG_ASSERT(dist_minibatch_tracker_->OutOfWork()); - //} - break; - } - } - } - epoch_timer.stop(); - - if (this_host == 0) { - const std::string t_name_acc = - "TrainEpoch" + std::to_string(epoch) + "Accuracy"; - galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", - train_accuracy, "\n"); - galois::runtime::reportStat_Single(kRegionName, t_name_acc, - train_accuracy); - } - - bool do_validate = config_.validation_interval_ - ? epoch % config_.validation_interval_ == 0 - : false; - bool do_test = - config_.test_interval_ ? 
epoch % config_.test_interval_ == 0 : false; - - bool subgraph_choose_all_status = graph_->SubgraphChooseAllStatus(); - - if (do_validate || do_test) { - DisableTimers(); - // disable subgraph - graph_->DisableSubgraph(); - graph_->EnableSubgraphChooseAll(); - } - - if (do_validate) { - // XXX induced subgraph here - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - - CorrectBackwardLinks(); - validation_timer.start(); - SetLayerPhases(galois::GNNPhase::kValidate); - const PointerWithSize val_pred = DoInference(); - validation_timer.stop(); - - float val_acc = GetGlobalAccuracy(val_pred); - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Validation accuracy is ", val_acc, - "\n"); - const std::string v_name_acc = - "ValEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, v_name_acc, val_acc); - } - } - - if (do_test) { - epoch_test_timer.start(); - float test_acc; - - if (!config_.test_minibatch_size()) { - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // nuclear resize - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize test_pred = DoInference(); - epoch_test_timer.stop(); - test_acc = GetGlobalAccuracy(test_pred); - } else { - test_acc = MinibatchedTesting(); - epoch_test_timer.stop(); - } - - if (this_host == 0) { - galois::gPrint("Epoch ", epoch, ": Test accuracy is ", test_acc, "\n"); - const std::string test_name_acc = - "TestEpoch" + std::to_string(epoch) + "Accuracy"; - galois::runtime::reportStat_Single(kRegionName, test_name_acc, - test_acc); - } - } - - if (do_validate || do_test) { - // report the training time elapsed at this point in time - galois::runtime::reportStat_Single( - kRegionName, "ElapsedTrainTimeEpoch" + std::to_string(epoch), - epoch_timer.get()); - // revert to training phase for next epoch - SetLayerPhases(galois::GNNPhase::kTrain); - graph_->SetSubgraphChooseAll(subgraph_choose_all_status); - - // TODO too much code dupe - // Resconstruct the train subgraph since it was replaced by test subgraph - if (config_.use_train_subgraph_ && !config_.train_minibatch_size() && - config_.test_minibatch_size() && do_test) { - // Setup the subgraph to only be the training graph - size_t local_seed_node_count = graph_->SetupNeighborhoodSample(); - galois::gDebug(graph_->host_prefix(), "Number of local seed nodes is ", - local_seed_node_count); - size_t num_sampled_layers = 0; - // gnn_layers_.back()->ResizeRows(local_seed_node_count); - for (auto back_iter = gnn_layers_.rbegin(); - back_iter != gnn_layers_.rend(); back_iter++) { - GNNLayerType layer_type = (*back_iter)->layer_type(); - if (layer_type == GNNLayerType::kGraphConvolutional || - layer_type == GNNLayerType::kSAGE) { - size_t current_sample_size = graph_->SampleAllEdges( - (*back_iter)->graph_user_layer_number(), - config_.inductive_subgraph_, num_sampled_layers + 1); - // resizing - //(*back_iter) - // ->ResizeInputOutputRows(current_sample_size, - // local_seed_node_count); - local_seed_node_count = current_sample_size; - num_sampled_layers++; - } - } - CorrectRowCounts(graph_->ConstructSampledSubgraph(num_sampled_layers)); - CorrectBackwardLinks(); - } - - EnableTimers(); - } - } - - uint64_t average_epoch_time = epoch_timer.get() / num_epochs; - galois::runtime::reportStat_Tavg(kRegionName, "AverageEpochTime", - average_epoch_time); - 
//DisableTimers(); - // disable subgraph - graph_->DisableSubgraph(); - graph_->EnableSubgraphChooseAll(); - - // check test accuracy - galois::StatTimer test_timer("FinalTestRun", kRegionName); - float global_accuracy; - - test_timer.start(); - - if (!config_.test_minibatch_size()) { - for (auto layer = gnn_layers_.begin(); layer != gnn_layers_.end(); - layer++) { - // TODO nuclear resize; this is **ridiculously** inefficient - // because full graph will be used even if not included in test - // k-hop neighborhood for eval - (*layer)->ResizeRows(graph_->size()); - } - CorrectBackwardLinks(); - SetLayerPhases(galois::GNNPhase::kTest); - const PointerWithSize predictions = DoInference(); - global_accuracy = GetGlobalAccuracy(predictions); - } else { - global_accuracy = MinibatchedTesting(); - } - - test_timer.stop(); - - if (this_host == 0) { - galois::gPrint("Final test accuracy is ", global_accuracy, "\n"); - galois::runtime::reportStat_Single(kRegionName, "FinalTestAccuracy", - global_accuracy); - } - - return global_accuracy; -} - -const galois::PointerWithSize -galois::GraphNeuralNetwork::DoInference() { - galois::StatTimer timer("DoInference", "GraphNeuralNetwork"); - if (timers_on_) { - timer.start(); - } - - // start with graph features and pass it through all layers of the network - galois::PointerWithSize layer_input = - graph_->GetLocalFeatures(); - - for (std::unique_ptr& ptr : gnn_layers_) { - layer_input = ptr->ForwardPhase(layer_input); - } - - if (timers_on_) { - timer.stop(); - } - - return layer_input; -} - -float galois::GraphNeuralNetwork::GetGlobalAccuracy( - PointerWithSize predictions) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (cpu_pred_.size() != predictions.size()) { - cpu_pred_.resize(predictions.size()); - } - - // TODO get rid of CPU copy here if possible - AdamOptimizer* adam = static_cast(optimizer_.get()); - adam->CopyToVector(cpu_pred_, predictions); - return graph_->GetGlobalAccuracy(cpu_pred_, phase_, config_.do_sampling()); - } else { -#endif - return graph_->GetGlobalAccuracy(predictions, phase_, - config_.do_sampling()); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GraphNeuralNetwork::GradientPropagation() { - galois::StatTimer timer("GradientPropagation", "GraphNeuralNetwork"); - if (timers_on_) { - timer.start(); - } - - // from output layer get initial gradients - std::vector dummy; - std::unique_ptr& output_layer = gnn_layers_.back(); - galois::PointerWithSize current_gradients = - output_layer->BackwardPhase(dummy, nullptr); - // loops through intermediate layers in a backward fashion - // -1 to ignore output layer which was handled above - for (size_t i = 0; i < gnn_layers_.size() - 1; i++) { - // note this assumes you have at least 2 layers (including output) - size_t layer_index = gnn_layers_.size() - 2 - i; - - // get the input to the layer before this one - galois::PointerWithSize prev_layer_input; - if (layer_index != 0) { - prev_layer_input = gnn_layers_[layer_index - 1]->GetForwardOutput(); - } else { - prev_layer_input = graph_->GetLocalFeatures(); - } - - // backward prop and get a new set of gradients - current_gradients = gnn_layers_[layer_index]->BackwardPhase( - prev_layer_input, ¤t_gradients); - // if not output do optimization/gradient descent - // at this point in the layer the gradients exist; use the gradients to - // update the weights of the layer - gnn_layers_[layer_index]->OptimizeLayer(optimizer_.get(), layer_index); - } - - if (timers_on_) { - timer.stop(); - } -} 
- -void galois::GraphNeuralNetwork::CorrectBackwardLinks() { - // layer chain pointer - PointerWithSize prev_output_layer(nullptr, 0); - for (size_t layer_num = 0; layer_num < gnn_layers_.size(); layer_num++) { - // first layer is nullptr so can be ignored - if (layer_num != 0) { - gnn_layers_[layer_num]->UpdateBackwardOutput(&prev_output_layer); - } - prev_output_layer = gnn_layers_[layer_num]->GetForwardOutput(); - } -} diff --git a/libgnn/src/graphs/GNNGraph.cpp b/libgnn/src/graphs/GNNGraph.cpp index b0ed03d34c..7fe3fed8f4 100644 --- a/libgnn/src/graphs/GNNGraph.cpp +++ b/libgnn/src/graphs/GNNGraph.cpp @@ -1,45 +1,12 @@ // XXX include net interface if necessary -#include "galois/Logging.h" -#include "galois/graphs/ReadGraph.h" #include "galois/graphs/GNNGraph.h" -#include "galois/GNNMath.h" -#include "galois/graphs/DegreeSyncStructures.h" -#include -namespace { -//! Partitions a particular dataset given some partitioning scheme -std::unique_ptr -LoadPartition(const std::string& input_directory, - const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme, - bool useShad) { - // XXX input path - std::string input_file = input_directory + dataset_name + ".csgr"; - GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); - - // load partition - switch (partition_scheme) { - case galois::graphs::GNNPartitionScheme::kOEC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - case galois::graphs::GNNPartitionScheme::kCVC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - case galois::graphs::GNNPartitionScheme::kOCVC: - return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, "", "", false, 1); - default: - GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); - return nullptr; - } -} - -} // end namespace +namespace galois { +namespace graphs { +std::vector* sampled_nodes_ = nullptr; // Sync structure variables; global to get around sync structure // limitations at the moment -namespace galois { -namespace graphs { GNNFloat* gnn_matrix_to_sync_ = nullptr; size_t gnn_matrix_to_sync_column_length_ = 0; size_t subgraph_size_ = 0; @@ -47,6 +14,8 @@ size_t subgraph_size_ = 0; galois::DynamicBitSet bitset_graph_aggregate; galois::LargeArray* gnn_lid_to_sid_pointer_ = nullptr; size_t num_active_layer_rows_ = 0; +//! 
It specifies offset for feature aggregation +size_t feature_aggregation_offset_ = 0; uint32_t* gnn_degree_vec_1_; uint32_t* gnn_degree_vec_2_; @@ -61,1500 +30,6 @@ struct CUDA_Context* cuda_ctx_for_sync; struct CUDA_Context* cuda_ctx; unsigned layer_number_to_sync; #endif -} // namespace graphs -} // namespace galois - -galois::graphs::GNNGraph::GNNGraph(const std::string& dataset_name, - GNNPartitionScheme partition_scheme, - bool has_single_class_label, - bool useShad) - : GNNGraph(galois::default_gnn_dataset_path, dataset_name, partition_scheme, - has_single_class_label, useShad) {} - -galois::graphs::GNNGraph::GNNGraph(const std::string& input_directory, - const std::string& dataset_name, - GNNPartitionScheme partition_scheme, - bool has_single_class_label, - bool useShad) - : input_directory_(input_directory) { - GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, - dataset_name); - // save host id - host_id_ = galois::runtime::getSystemNetworkInterface().ID; - host_prefix_ = - std::string("[") + - std::to_string(galois::runtime::getSystemNetworkInterface().ID) + - std::string("] "); - // load partition - partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme, useShad); - // reverse edges - partitioned_graph_->ConstructIncomingEdges(); - // mark a node if it is sampled - mark_sampled_nodes_.resize(partitioned_graph_->size()); - - galois::gInfo(host_prefix_, "Number of local proxies is ", - partitioned_graph_->size()); - galois::gInfo(host_prefix_, "Number of local edges is ", - partitioned_graph_->sizeEdges()); - - // read additional graph data - if (dataset_name != "ogbn-papers100M-remap") { - ReadLocalLabels(dataset_name, has_single_class_label); - } else { - galois::gInfo("Remapped ogbn 100M"); - ReadLocalLabelsBin(dataset_name); - } - ReadLocalFeatures(dataset_name); - ReadLocalMasks(dataset_name); - - // init gluon from the partitioned graph - sync_substrate_ = - std::make_unique>( - *partitioned_graph_, host_id_, - galois::runtime::getSystemNetworkInterface().Num, false, - partitioned_graph_->cartesianGrid()); - bitset_graph_aggregate.resize(partitioned_graph_->size()); - - // init norm factors (involves a sync call) - InitNormFactor(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // allocate/copy data structures over to GPU - GALOIS_LOG_VERBOSE("[{}] Initializing GPU memory", host_id_); - InitGPUMemory(); - - // initialize CUDA context - cuda_ctx_ = get_CUDA_context(host_id_); - if (!init_CUDA_context(cuda_ctx_, ::gpudevice)) { - GALOIS_DIE("Failed to initialize CUDA context"); - } - PartitionedGraphInfo g_info; - GetPartitionedGraphInfo(g_info); - load_graph_CUDA_GNN(cuda_ctx_, g_info, - galois::runtime::getSystemNetworkInterface().Num); - } -#endif -} - -bool galois::graphs::GNNGraph::IsValidForPhaseCompleteRange( - const unsigned lid, const galois::GNNPhase current_phase) const { - // only use ranges if they're complete - // convert to gid first - size_t gid = partitioned_graph_->getGID(lid); - - // select range to use based on phase - const GNNRange* range_to_use; - switch (current_phase) { - case GNNPhase::kTrain: - range_to_use = &global_training_mask_range_; - break; - case GNNPhase::kValidate: - range_to_use = &global_validation_mask_range_; - break; - case GNNPhase::kTest: - range_to_use = &global_testing_mask_range_; - break; - case GNNPhase::kOther: - GALOIS_LOG_FATAL("no range for other"); - break; - default: - GALOIS_LOG_FATAL("Invalid phase used"); - range_to_use = 
nullptr; - } - - // if within range, it is valid - // there is an assumption here that ranges are contiguous; may not - // necessarily be the case in all inputs in which case using the mask is - // required (but less cache efficient) - if (range_to_use->begin <= gid && gid < range_to_use->end) { - return true; - } else { - return false; - } -} - -bool galois::graphs::GNNGraph::IsValidForPhaseMasked( - const unsigned lid, const galois::GNNPhase current_phase) const { - // select mask to use based on phase - const GNNMask* mask_to_use; - switch (current_phase) { - case GNNPhase::kTrain: - mask_to_use = &local_training_mask_; - break; - case GNNPhase::kValidate: - mask_to_use = &local_validation_mask_; - break; - case GNNPhase::kTest: - mask_to_use = &local_testing_mask_; - break; - case GNNPhase::kOther: - if (valid_other_ == 0) { - return false; - } - mask_to_use = &other_mask_; - break; - case GNNPhase::kBatch: - mask_to_use = &local_minibatch_mask_; - break; - default: - GALOIS_LOG_FATAL("Invalid phase used"); - mask_to_use = nullptr; - } - - return (*mask_to_use)[lid]; -} - -void galois::graphs::GNNGraph::AggregateSync( - GNNFloat* matrix_to_sync, const size_t matrix_column_size, bool is_backward, - uint32_t active_row_boundary) const { - gnn_matrix_to_sync_ = matrix_to_sync; - gnn_matrix_to_sync_column_length_ = matrix_column_size; - subgraph_size_ = active_size(); - num_active_layer_rows_ = active_row_boundary; - if (!use_subgraph_ && !use_subgraph_view_) { - // set globals for the sync substrate - if (!is_backward) { - if (use_timer_) { - sync_substrate_->sync("GraphAggregateSync"); - } else { - sync_substrate_->sync("Ignore"); - } - } else { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - clubbed_timer.start(); - sync_substrate_->sync( - "BackwardGraphAggregateSync"); - clubbed_timer.stop(); - } - } else { - // setup the SID to LID map for the sync substrate to use (SID != LID) - gnn_lid_to_sid_pointer_ = subgraph_->GetLIDToSIDPointer(); - - if (!is_backward) { - if (use_timer_) { - sync_substrate_->sync("GraphAggregateSync"); - } else { - sync_substrate_->sync("Ignore"); - } - } else { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - clubbed_timer.start(); - sync_substrate_->sync( - "BackwardGraphAggregateSync"); - clubbed_timer.stop(); - } - } -} - -#ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::AggregateSyncGPU( - GNNFloat* matrix_to_sync, const size_t matrix_column_size, - const unsigned layer_number) const { - size_t layer_input_mtx_column_size = - getLayerInputMatrixColumnSize(cuda_ctx_, layer_number); - size_t layer_output_mtx_column_size = - getLayerOutputMatrixColumnSize(cuda_ctx_, layer_number); - // set globals for the sync substrate - gnn_matrix_to_sync_ = matrix_to_sync; - gnn_matrix_to_sync_column_length_ = matrix_column_size; - cuda_ctx_for_sync = cuda_ctx_; - layer_number_to_sync = layer_number; - // TODO bitset setting - // call sync - cudaSetLayerInputOutput(cuda_ctx_, matrix_to_sync, matrix_column_size, size(), - layer_number); - - // XXX no timer if use_timer is off - if (gnn_matrix_to_sync_column_length_ == layer_input_mtx_column_size) { - if (use_timer_) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); - } else { - sync_substrate_->sync( - "Ignore", gnn_matrix_to_sync_column_length_); - } - } else if (gnn_matrix_to_sync_column_length_ == - layer_output_mtx_column_size) { - if (use_timer_) { - sync_substrate_->sync( - "GraphAggregateSync", gnn_matrix_to_sync_column_length_); - } 
else { - sync_substrate_->sync( - "Ignore", gnn_matrix_to_sync_column_length_); - } - } else { - GALOIS_LOG_FATAL("Column size of the synchronized matrix does not" - " match to the column size of the CUDA context"); - } -} -#endif -void galois::graphs::GNNGraph::ReadLocalLabelsBin( - const std::string& dataset_name) { - GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - - std::ifstream file_stream; - file_stream.open(input_directory_ + dataset_name + "-labels-dims.txt", - std::ios::in); - size_t num_nodes; - file_stream >> num_nodes >> num_label_classes_ >> std::ws; - assert(num_nodes == partitioned_graph_->globalSize()); - if (host_id_ == 0) { - galois::gInfo("Number of label classes is ", num_label_classes_); - } - file_stream.close(); - - std::string filename = input_directory_ + dataset_name + "-labels.bin"; - std::ifstream file_stream_bin; - file_stream_bin.open(filename, std::ios::binary | std::ios::in); - - std::vector all_labels(num_nodes); - // read all labels into a vector - file_stream_bin.read((char*)all_labels.data(), sizeof(GNNLabel) * num_nodes); - - using_single_class_labels_ = true; - local_ground_truth_labels_.resize(partitioned_graph_->size()); - - galois::GAccumulator found_local_vertices; - found_local_vertices.reset(); - - // save only local ones; can do in parallel as well - // assumes -1 already dealt with - galois::do_all(galois::iterate(size_t{0}, partitioned_graph_->size()), - [&](size_t lid) { - local_ground_truth_labels_[lid] = all_labels[GetGID(lid)]; - found_local_vertices += 1; - }); - - size_t fli = found_local_vertices.reduce(); - galois::gInfo(host_prefix_, "Read ", fli, " labels (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), - " GB)"); - GALOIS_LOG_ASSERT(fli == partitioned_graph_->size()); -} - -void galois::graphs::GNNGraph::ReadLocalLabels(const std::string& dataset_name, - bool has_single_class_label) { - GALOIS_LOG_VERBOSE("[{}] Reading labels from disk...", host_id_); - std::string filename; - if (has_single_class_label) { - filename = input_directory_ + dataset_name + "-labels.txt"; - } else { - filename = input_directory_ + dataset_name + "-mlabels.txt"; - } - - // read file header, save num label classes while at it - std::ifstream file_stream; - file_stream.open(filename, std::ios::in); - size_t num_nodes; - file_stream >> num_nodes >> num_label_classes_ >> std::ws; - assert(num_nodes == partitioned_graph_->globalSize()); - if (host_id_ == 0) { - galois::gInfo("Number of label classes is ", num_label_classes_); - } - - // allocate memory for labels - if (has_single_class_label) { - // single-class (one-hot) label for each vertex: N x 1 - using_single_class_labels_ = true; - local_ground_truth_labels_.resize(partitioned_graph_->size()); - } else { - // multi-class label for each vertex: N x num classes - using_single_class_labels_ = false; - local_ground_truth_labels_.resize(partitioned_graph_->size() * - num_label_classes_); - } - - size_t cur_gid = 0; - size_t found_local_vertices = 0; - // each line contains a set of 0s and 1s - std::string read_line; - - // loop through all labels of the graph - while (std::getline(file_stream, read_line)) { - // only process label if this node is local - if (partitioned_graph_->isLocal(cur_gid)) { - uint32_t cur_lid = partitioned_graph_->getLID(cur_gid); - // read line as bitset of 0s and 1s - std::istringstream label_stream(read_line); - int cur_bit; - // bitset size is # of label classes - for (size_t cur_class = 0; cur_class < num_label_classes_; ++cur_class) { - // 
read a bit - label_stream >> cur_bit; - - if (has_single_class_label) { - // no label - if (cur_bit == -1) { - local_ground_truth_labels_[cur_lid] = num_label_classes_; - break; - } - - // in single class, only 1 bit is set in bitset; that represents the - // class to take - if (cur_bit != 0) { - // set class and break (assumption is that's the only bit that is - // set) - local_ground_truth_labels_[cur_lid] = cur_class; - break; - } - } else { - // else the entire bitset needs to be copied over to the label array - // TODO this can possibly be saved all at once rather than bit by bit? - local_ground_truth_labels_[cur_lid * num_label_classes_ + cur_class] = - cur_bit; - } - } - found_local_vertices++; - } - // always increment cur_gid - cur_gid++; - } - - file_stream.close(); - - galois::gInfo(host_prefix_, "Read ", found_local_vertices, " labels (", - local_ground_truth_labels_.size() * double{4} / (1 << 30), - " GB)"); - GALOIS_LOG_ASSERT(found_local_vertices == partitioned_graph_->size()); -} - -void galois::graphs::GNNGraph::ReadLocalFeatures( - const std::string& dataset_name) { - GALOIS_LOG_VERBOSE("[{}] Reading features from disk...", host_id_); - - // read in dimensions of features, specifically node feature length - size_t num_global_vertices; - - std::string file_dims = input_directory_ + dataset_name + "-dims.txt"; - std::ifstream ifs; - ifs.open(file_dims, std::ios::in); - ifs >> num_global_vertices >> node_feature_length_; - ifs.close(); - - GALOIS_LOG_ASSERT(num_global_vertices == partitioned_graph_->globalSize()); - GALOIS_LOG_VERBOSE("[{}] N x D: {} x {}", host_id_, num_global_vertices, - node_feature_length_); - - // memory for all features of all nodes in graph - // TODO read features without loading entire feature file into memory; this - // is quite inefficient - std::unique_ptr full_feature_set = - std::make_unique(num_global_vertices * node_feature_length_); - - // read in all features - std::ifstream file_stream; - std::string feature_file = input_directory_ + dataset_name + "-feats.bin"; - file_stream.open(feature_file, std::ios::binary | std::ios::in); - file_stream.read((char*)full_feature_set.get(), sizeof(GNNFloat) * - num_global_vertices * - node_feature_length_); - file_stream.close(); - - // allocate memory for local features - local_node_features_.resize(partitioned_graph_->size() * - node_feature_length_); - - // copy over features for local nodes only - galois::GAccumulator num_kept_vertices; - num_kept_vertices.reset(); - galois::do_all( - galois::iterate(size_t{0}, num_global_vertices), [&](size_t gid) { - if (partitioned_graph_->isLocal(gid)) { - // copy over feature vector - std::copy(full_feature_set.get() + gid * node_feature_length_, - full_feature_set.get() + (gid + 1) * node_feature_length_, - &local_node_features_[partitioned_graph_->getLID(gid) * - node_feature_length_]); - num_kept_vertices += 1; - } - }); - full_feature_set.reset(); - - galois::gInfo(host_prefix_, "Read ", local_node_features_.size(), - " features (", - local_node_features_.size() * double{4} / (1 << 30), " GB)"); - GALOIS_LOG_ASSERT(num_kept_vertices.reduce() == partitioned_graph_->size()); -} - -//! Helper function to read masks from file into the appropriate structures -//! 
given a name, mask type, and arrays to save into -size_t galois::graphs::GNNGraph::ReadLocalMasksFromFile( - const std::string& dataset_name, const std::string& mask_type, - GNNRange* mask_range, std::vector* masks) { - size_t range_begin; - size_t range_end; - - // read mask range - std::string mask_filename = - input_directory_ + dataset_name + "-" + mask_type + "_mask.txt"; - bool train_is_on = false; - if (mask_type == "train") { - train_is_on = true; - } - - std::ifstream mask_stream; - mask_stream.open(mask_filename, std::ios::in); - mask_stream >> range_begin >> range_end >> std::ws; - GALOIS_LOG_ASSERT(range_begin <= range_end); - - // set the range object - mask_range->begin = range_begin; - mask_range->end = range_end; - mask_range->size = range_end - range_begin; - - size_t cur_line_num = 0; - // valid nodes on this host - size_t local_sample_count = 0; - // this tracks TOTAL # of valid nodes in this group (not necessarily valid - // ones on this host) - size_t valid_count = 0; - std::string line; - // each line is a number signifying if mask is set for the vertex - while (std::getline(mask_stream, line)) { - std::istringstream mask_stream(line); - // only examine vertices/lines in range - if (cur_line_num >= range_begin && cur_line_num < range_end) { - unsigned mask = 0; - mask_stream >> mask; - if (mask == 1) { - valid_count++; - if (partitioned_graph_->isLocal(cur_line_num)) { - (*masks)[partitioned_graph_->getLID(cur_line_num)] = 1; - local_sample_count++; - } - if (train_is_on) { - global_training_mask_[cur_line_num] = 1; - } - } - } - cur_line_num++; - } - mask_stream.close(); - - if (train_is_on) { - global_training_count_ = valid_count; - } - - if (valid_count != mask_range->size) { - // overlapping masks: need to actually check the masks rather than use - // ranges - if (!incomplete_masks_) { - galois::gInfo( - "Masks are not contained in range: must actually check mask"); - } - incomplete_masks_ = true; - } - - return valid_count; -} - -size_t galois::graphs::GNNGraph::FindOtherMask() { - galois::GAccumulator other_accum; - other_accum.reset(); - other_mask_.resize(partitioned_graph_->size()); - - galois::do_all( - galois::iterate(size_t{0}, partitioned_graph_->size()), - [&](size_t local_id) { - if (!IsValidForPhase(local_id, GNNPhase::kTrain) && - !IsValidForPhase(local_id, GNNPhase::kValidate) && - !IsValidForPhase(local_id, GNNPhase::kTest)) { - other_mask_[local_id] = 1; - other_accum += 1; - } - }, - galois::loopname("FindOtherMask")); - return other_accum.reduce(); -} - -void galois::graphs::GNNGraph::ReadLocalMasks(const std::string& dataset_name) { - // allocate the memory for the local masks - global_training_mask_.resize(partitioned_graph_->globalSize()); - local_training_mask_.resize(partitioned_graph_->size()); - local_validation_mask_.resize(partitioned_graph_->size()); - local_testing_mask_.resize(partitioned_graph_->size()); - - if (dataset_name == "reddit") { - global_training_count_ = 153431; - - // TODO reddit is hardcode handled at the moment; better way to not do - // this? 
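The mask reader above expects one text file per split: a header line giving the split's global-ID range, followed by one 0/1 line per global vertex. Below is a simplified single-host sketch of consuming that format; the file name, graph size, and direct global-ID indexing are illustrative assumptions (the real code additionally maps each global ID to a local ID on the owning host and tracks the global training mask separately).

#include <fstream>
#include <iostream>
#include <sstream>
#include <string>
#include <vector>

size_t ReadMask(const std::string& path, size_t num_global,
                std::vector<char>* mask /* indexed by global id here */) {
  std::ifstream in(path);
  size_t begin = 0, end = 0;
  in >> begin >> end >> std::ws;      // header: global-ID range of this split
  mask->assign(num_global, 0);
  std::string line;
  size_t gid = 0, valid = 0;
  while (std::getline(in, line)) {
    if (gid >= begin && gid < end) {  // only ids inside the range may be set
      std::istringstream ss(line);
      unsigned bit = 0;
      ss >> bit;
      if (bit == 1) {
        (*mask)[gid] = 1;             // real code converts gid -> local id first
        ++valid;
      }
    }
    ++gid;
  }
  return valid;                       // total valid nodes in this split
}

int main() {
  std::vector<char> train_mask;
  // hypothetical dataset: file name and vertex count are placeholders
  size_t n = ReadMask("cora-train_mask.txt", 2708, &train_mask);
  std::cout << "valid train nodes: " << n << "\n";
}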
- global_training_mask_range_ = {.begin = 0, .end = 153431, .size = 153431}; - global_validation_mask_range_ = { - .begin = 153431, .end = 153431 + 23831, .size = 23831}; - global_testing_mask_range_ = { - .begin = 177262, .end = 177262 + 55703, .size = 55703}; - - // training - for (size_t i = global_training_mask_range_.begin; - i < global_training_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_training_mask_[partitioned_graph_->getLID(i)] = 1; - } - global_training_mask_[i] = 1; - } - - // validation - for (size_t i = global_validation_mask_range_.begin; - i < global_validation_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_validation_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - - // testing - for (size_t i = global_testing_mask_range_.begin; - i < global_testing_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_testing_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - } else if (dataset_name == "ogbn-papers100M-remap") { - global_training_count_ = 1207178; - - global_training_mask_range_ = {.begin = 0, .end = 1207178, .size = 1207178}; - global_validation_mask_range_ = { - .begin = 1207178, .end = 1207178 + 125264, .size = 125264}; - global_testing_mask_range_ = { - .begin = 1332442, .end = 1332442 + 214337, .size = 214337}; - // training - for (size_t i = global_training_mask_range_.begin; - i < global_training_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_training_mask_[partitioned_graph_->getLID(i)] = 1; - } - global_training_mask_[i] = 1; - } - // validation - for (size_t i = global_validation_mask_range_.begin; - i < global_validation_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_validation_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - // testing - for (size_t i = global_testing_mask_range_.begin; - i < global_testing_mask_range_.end; i++) { - if (partitioned_graph_->isLocal(i)) { - local_testing_mask_[partitioned_graph_->getLID(i)] = 1; - } - } - valid_other_ = FindOtherMask(); - GALOIS_LOG_ASSERT(valid_other_ <= 109513177); - } else { - size_t valid_train = ReadLocalMasksFromFile(dataset_name, "train", - &global_training_mask_range_, - &local_training_mask_); - size_t valid_val = ReadLocalMasksFromFile(dataset_name, "val", - &global_validation_mask_range_, - &local_validation_mask_); - size_t valid_test = ReadLocalMasksFromFile(dataset_name, "test", - &global_testing_mask_range_, - &local_testing_mask_); - valid_other_ = FindOtherMask(); - // the "other" set of nodes that don't fall into any classification - if (galois::runtime::getSystemNetworkInterface().ID == 0) { - galois::gInfo("Valid # training nodes is ", valid_train); - galois::gInfo("Valid # validation nodes is ", valid_val); - galois::gInfo("Valid # test nodes is ", valid_test); - galois::gInfo("Valid # other nodes is ", valid_other_); - } - } -} - -void galois::graphs::GNNGraph::InitNormFactor() { - GALOIS_LOG_VERBOSE("[{}] Initializing norm factors", host_id_); - global_degrees_.resize(partitioned_graph_->size(), 0.0); - global_train_degrees_.resize(partitioned_graph_->size(), 0.0); - CalculateFullNormFactor(); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_memory_.InitNormFactor(partitioned_graph_->size()); - } -#endif -} - -void galois::graphs::GNNGraph::CalculateFullNormFactor() { - // TODO(loc) reset all degrees if this is called multiple times? 
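CalculateFullNormFactor, continuing below, only needs per-node out-degree counts: one over all destinations and one restricted to train/"other" destinations, with the aggregation norm later taken as 1/degree. A toy serial version over a hypothetical CSR is sketched here; it omits the galois::do_all parallelism and the Gluon degree synchronization that the real code performs afterwards.

#include <cstdio>
#include <vector>

int main() {
  // tiny CSR: node i's outgoing edges are dsts[row[i] .. row[i+1])
  std::vector<size_t> row = {0, 2, 3, 4};
  std::vector<unsigned> dsts = {1, 2, 2, 0};
  std::vector<bool> is_train = {true, false, true};  // per-node train mask

  std::vector<unsigned> degree(3, 0), train_degree(3, 0);
  for (unsigned src = 0; src < 3; src++) {
    for (size_t e = row[src]; e < row[src + 1]; e++) {
      degree[src]++;
      if (is_train[dsts[e]]) {
        train_degree[src]++;  // only count train destinations
      }
    }
    // the norm factor later used during aggregation is simply 1 / degree
    std::printf("node %u: degree %u, train degree %u\n", src, degree[src],
                train_degree[src]);
  }
  return 0;
}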
- // get the norm factor contribution for each node based on the GLOBAL graph - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size()), - [&](size_t src) { - for (auto edge_iter = partitioned_graph_->edge_begin(src); - edge_iter != partitioned_graph_->edge_end(src); edge_iter++) { - // count degrees for all + train/other - size_t dest = GetEdgeDest(edge_iter); - if (IsValidForPhase(dest, GNNPhase::kTrain) || - IsValidForPhase(dest, GNNPhase::kOther)) { - global_train_degrees_[src] += 1; - } - global_degrees_[src] += 1; - } - }, - galois::loopname("CalculateLocalDegrees")); - // degree sync - gnn_degree_vec_1_ = global_train_degrees_.data(); - gnn_degree_vec_2_ = global_degrees_.data(); - sync_substrate_->sync( - "InitialDegreeSync"); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracy( - PointerWithSize predictions, GNNPhase phase) { - // No GPU version yet, but this is where it would be - return GetGlobalAccuracy(predictions, phase, false); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracy( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - // No GPU version yet, but this is where it would be - return GetGlobalAccuracyCPU(predictions, phase, sampling); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPU( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - galois::StatTimer global_accuracy_timer("GetGlobalAccuracy"); - galois::StatTimer global_accuracy_for_singleclass_timer("GetGlobalAccuracyForSingleClass"); - galois::StatTimer global_accuracy_for_multiclass_timer("GetGlobalAccuracyForMultiClass"); - global_accuracy_timer.start(); - float accuracy{0}; - if (is_single_class_label()) { - global_accuracy_for_singleclass_timer.start(); - accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); - global_accuracy_for_singleclass_timer.stop(); - } else { - global_accuracy_for_multiclass_timer.start(); - accuracy = GetGlobalAccuracyCPUMulti(predictions, phase, sampling); - global_accuracy_for_multiclass_timer.stop(); - } - global_accuracy_timer.stop(); - return accuracy; -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPUSingle( - PointerWithSize predictions, GNNPhase phase, bool) { - // check owned nodes' accuracy - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - // will only loop over sampled nodes if sampling is on - galois::iterate(begin_owned(), end_owned()), - // this is possibly the subgraph id - [&](const unsigned node_id) { - if (IsValidForPhase(node_id, phase)) { - total_checked_ += 1; - // get prediction by getting max - // note the use of node_id here: lid only used to check original - // labels - size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[node_id * num_label_classes_])); - // check against ground truth and track accordingly - // TODO static cast used here is dangerous - if (predicted_label == - static_cast(GetSingleClassLabel(node_id))) { - num_correct_ += 1; - } - } - }, - // steal on as some threads may have nothing to work on - galois::steal()); - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, - global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); -} -std::pair galois::graphs::GNNGraph::GetBatchAccuracy( - PointerWithSize predictions) { - // check owned nodes' accuracy - num_correct_.reset(); - total_checked_.reset(); - - galois::do_all( - // will only loop over sampled 
nodes if sampling is on - galois::iterate(begin_owned(), end_owned()), - // this is possibly the subgraph id - [&](const unsigned node_id) { - if (IsValidForPhase(node_id, GNNPhase::kBatch)) { - total_checked_ += 1; - size_t predicted_label = galois::MaxIndex( - num_label_classes_, &(predictions[node_id * num_label_classes_])); - if (predicted_label == - static_cast(GetSingleClassLabel(node_id))) { - num_correct_ += 1; - } - } - }, - // steal on as some threads may have nothing to work on - galois::steal(), galois::loopname("GlobalAccuracy")); - - size_t global_correct = num_correct_.reduce(); - size_t global_checked = total_checked_.reduce(); - - return std::make_pair(global_correct, global_checked); -} - -float galois::graphs::GNNGraph::GetGlobalAccuracyCPUMulti( - PointerWithSize predictions, GNNPhase phase, bool sampling) { - - const GNNLabel* full_ground_truth = GetMultiClassLabel(0); - assert(predictions.size() == (num_label_classes_ * size())); - - size_t global_true_positive = 0; - size_t global_true_negative = 0; - size_t global_false_positive = 0; - size_t global_false_negative = 0; - size_t global_f1_score = 0; - - // per class check - for (size_t label_class = 0; label_class < num_label_classes_; - label_class++) { - local_true_positive_.reset(); - local_true_negative_.reset(); - local_false_positive_.reset(); - local_false_negative_.reset(); - - // loop through all *owned* nodes (do not want to overcount) - galois::do_all( - galois::iterate(begin_owned(), end_owned()), - [&](const unsigned lid) { - if (IsValidForPhase(lid, phase)) { - if (sampling) { - if (phase == GNNPhase::kTrain && !IsInSampledGraph(lid)) { - return; - } - } - - size_t label_index = lid * num_label_classes_ + label_class; - - GNNLabel true_label = full_ground_truth[label_index]; - GNNLabel prediction_is_positive = - (predictions[label_index] > 0.5) ? 1 : 0; - - if (true_label && prediction_is_positive) { - local_true_positive_ += 1; - } else if (true_label && !prediction_is_positive) { - local_false_negative_ += 1; - } else if (!true_label && prediction_is_positive) { - local_false_positive_ += 1; - } else if (!true_label && !prediction_is_positive) { - local_true_negative_ += 1; - } else { - // all cases should be covered with clauses above, so it should - // NEVER get here; adding it here just for sanity purposes - GALOIS_LOG_FATAL( - "Logic error with true label and prediction label"); - } - } - total_checked_ += 1; - }, - galois::steal(), galois::loopname("GlobalMultiAccuracy")); - - // reduce from accumulators across all hosts for this particular class - size_t class_true_positives = local_true_positive_.reduce(); - size_t class_false_positives = local_false_positive_.reduce(); - size_t class_true_negatives = local_true_negative_.reduce(); - size_t class_false_negatives = local_false_negative_.reduce(); - - // add to global counts - global_true_positive += class_true_positives; - global_false_positive += class_false_positives; - global_true_negative += class_true_negatives; - global_false_negative += class_false_negatives; - - // calculate precision, recall, and f1 score for this class - // ternery op used to avoid division by 0 - double class_precision = - (class_true_positives + class_true_negatives) > 0 - ? static_cast(class_true_positives) / - (class_true_positives + class_false_positives) - : 0.0; - double class_recall = - (class_true_positives + class_false_negatives) > 0 - ? 
static_cast(class_true_positives) / - (class_true_positives + class_false_negatives) - : 0.0; - double class_f1_score = (class_precision + class_recall) > 0 - ? (2.0 * (class_precision * class_recall)) / - (class_precision + class_recall) - : 0.0; - - global_f1_score += class_f1_score; - } // end label class loop - - // GALOIS_LOG_WARN("{} {} {} {}", global_true_positive, global_true_negative, - // global_false_positive, global_false_negative); - - // double global_f1_macro_score = global_f1_score / num_label_classes_; - - // micro = considers all classes for precision/recall - double global_micro_precision = - (global_true_positive + global_true_negative) > 0 - ? static_cast(global_true_positive) / - (global_true_positive + global_false_positive) - : 0.0; - double global_micro_recall = - (global_true_positive + global_false_negative) > 0 - ? static_cast(global_true_positive) / - (global_true_positive + global_false_negative) - : 0.0; - - double global_f1_micro_score = - (global_micro_precision + global_micro_recall) > 0 - ? (2.0 * (global_micro_precision * global_micro_recall)) / - (global_micro_precision + global_micro_recall) - : 0.0; - - return global_f1_micro_score; -} - -//////////////////////////////////////////////////////////////////////////////// - -void galois::graphs::GNNGraph::InitializeSamplingData(size_t num_layers, - bool choose_all) { - subgraph_ = std::make_unique(partitioned_graph_->size()); - sample_node_timestamps_.create(partitioned_graph_->size(), - std::numeric_limits::max()); - edge_sample_status_.resize(num_layers); - for (size_t i = 0; i < num_layers; i++) { - edge_sample_status_[i].resize(partitioned_graph_->sizeEdges()); - } - sampled_edges_.resize(partitioned_graph_->sizeEdges()); - // this is to hold the degree of a sampled graph considering all hosts; yes, - // memory wise this is slightly problematic possibly, but each layer is its - // own subgraph - if (!choose_all) { - sampled_out_degrees_.resize(num_layers); - for (galois::LargeArray& array : sampled_out_degrees_) { - array.create(partitioned_graph_->size()); - } - } else { - subgraph_choose_all_ = true; - } - definitely_sampled_nodes_.resize(partitioned_graph_->size()); - master_offset_accum_.resize(num_layers + 1); - mirror_offset_accum_.resize(num_layers + 1); - sample_master_offsets_.resize(num_layers + 1, 0); - sample_mirror_offsets_.resize(num_layers + 1, 0); -} - -size_t galois::graphs::GNNGraph::SetupNeighborhoodSample(GNNPhase seed_phase) { - DisableSubgraph(); - - if (!bitset_sample_flag_.size()) { - bitset_sample_flag_.resize(size()); - } - bitset_sample_flag_.ParallelReset(); - definitely_sampled_nodes_.ParallelReset(); - - galois::do_all( - galois::iterate(begin_owned(), end_owned()), - [&](const NodeIterator& x) { - if (IsValidForPhase(*x, seed_phase)) { - SetSampledNode(*x); - bitset_sample_flag_.set(*x); - definitely_sampled_nodes_.set(*x); - } else { - UnsetSampledNode(*x); - } - }, - galois::loopname("InitialSeedSetting")); - // unsets nodes set in previous iterations; for some reason they get - // synchronized along with everything else even though bitset sample flag - // should prevent it (that, or it's because they don't get sync'd that they - // remain the same) - galois::do_all(galois::iterate(end_owned(), end()), - [&](const NodeIterator& x) { UnsetSampledNode(*x); }); - - // clear node timestamps - galois::StatTimer fill_time("ClearFillTime"); - fill_time.start(); - galois::ParallelSTL::fill(sample_node_timestamps_.begin(), - sample_node_timestamps_.end(), - 
std::numeric_limits::max()); - galois::ParallelSTL::fill(sample_master_offsets_.begin(), - sample_master_offsets_.end(), 0); - galois::ParallelSTL::fill(sample_mirror_offsets_.begin(), - sample_mirror_offsets_.end(), 0); - fill_time.stop(); - - for (unsigned i = 0; i < master_offset_accum_.size(); i++) { - master_offset_accum_[i].reset(); - mirror_offset_accum_[i].reset(); - } - - // clear all sampled edges - galois::StatTimer ctime("ClearSampleEdges"); - ctime.start(); - for (galois::DynamicBitSet& edge_layer : edge_sample_status_) { - edge_layer.ParallelReset(); - } - ctime.stop(); - // galois::do_all( - // galois::iterate(edge_sample_status_.begin(), - // edge_sample_status_.end()), - // [&](galois::DynamicBitSet& edge_layer) { edge_layer.reset(); }, - // galois::loopname("ClearSampleEdges")); - - sampled_edges_.ParallelReset(); - - // reset all degrees - if (!subgraph_choose_all_) { - galois::StatTimer cad_timer("ClearAllDegrees"); - cad_timer.start(); - for (galois::LargeArray& array : sampled_out_degrees_) { - galois::ParallelSTL::fill(array.begin(), array.end(), 0); - } - cad_timer.stop(); - } - - if (!bitset_sampled_degrees_.size()) { - bitset_sampled_degrees_.resize(partitioned_graph_->size()); - } - bitset_sampled_degrees_.reset(); - - // Seed nodes sync - if (use_timer_) { - sync_substrate_ - ->sync( - "SeedNodeSample"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::GAccumulator local_seed_count; - local_seed_count.reset(); - galois::GAccumulator master_offset; - master_offset.reset(); - galois::GAccumulator mirror_offset; - mirror_offset.reset(); - // count # of seed nodes - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - if (*x < *end_owned()) { - master_offset += 1; - } else { - // mirror - mirror_offset += 1; - } - - // galois::gInfo(host_prefix_, "Seed node is ", GetGID(*x)); - local_seed_count += 1; - // 0 = seed node - sample_node_timestamps_[*x] = 0; - } - }, - galois::loopname("SeedNodeOffsetCounting")); - - sample_master_offsets_[0] = master_offset.reduce(); - sample_mirror_offsets_[0] = mirror_offset.reduce(); - - return local_seed_count.reduce(); -} - -size_t galois::graphs::GNNGraph::SampleAllEdges(size_t agg_layer_num, - bool inductive_subgraph, - size_t timestamp) { - DisableSubgraph(); - - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& src_iter) { - // only operate on if sampled - if (IsInSampledGraph(src_iter)) { - // marks ALL edges of nodes that connect to train/other nodes - for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - // total += 1; - if (inductive_subgraph) { - if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) && - !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - continue; - } - } - - MakeEdgeSampled(edge_iter, agg_layer_num); - uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); - if (!IsInSampledGraph(dest)) { - bitset_sample_flag_.set(dest); - } - definitely_sampled_nodes_.set(*src_iter); - definitely_sampled_nodes_.set(dest); - } - } - }, - galois::steal(), galois::loopname("ChooseAllEdges")); - - // update nodes, then communicate update to all hosts so that they can - // continue the exploration - galois::do_all( - galois::iterate(size_t{0}, bitset_sample_flag_.size()), - [&](uint32_t new_node_id) { - if (bitset_sample_flag_.test(new_node_id)) { - SetSampledNode(new_node_id); - } - }, - galois::loopname("NeighborhoodSampleSet")); - - if 
(use_timer_) { - sync_substrate_ - ->sync( - "SampleFlag"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::GAccumulator local_sample_count; - local_sample_count.reset(); - // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - local_sample_count += 1; - if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (x < end_owned()) { - // owned nodes that are activated on other hosts shoudl always - // be activated because it's responsible for keeping others in - // sync during comms; ignoring it = bad - // TODO(gluon) make it so you don't have to deal with this - // and just use host as a reducer point - definitely_sampled_nodes_.set(*x); - } - sample_node_timestamps_[*x] = timestamp; - } - } - }); - - EnableSubgraphChooseAll(); - return local_sample_count.reduce(); -} - -size_t galois::graphs::GNNGraph::SampleEdges(size_t sample_layer_num, - size_t num_to_sample, - bool inductive_subgraph, - size_t timestamp) { - use_subgraph_ = false; - use_subgraph_view_ = false; - - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& src_iter) { - // only operate on if sampled - if (IsInSampledGraph(src_iter)) { - // chance of not uniformly choosing an edge of this node num_to_sample - // times (degree norm is 1 / degree) - double probability_of_reject; - if (!inductive_subgraph) { - probability_of_reject = - std::pow(1 - GetGlobalDegreeNorm(*src_iter), num_to_sample); - } else { - probability_of_reject = std::pow( - 1 - GetGlobalTrainDegreeNorm(*src_iter), num_to_sample); - } - - // loop through edges, turn "on" edge with some probability - for (auto edge_iter : partitioned_graph_->edges(*src_iter)) { - if (sample_rng_.DoBernoulli(probability_of_reject)) { - if (inductive_subgraph) { - // only take if node is training node or a node not classified - // into train/test/val - if (!IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kTrain) && - !IsValidForPhase(partitioned_graph_->getEdgeDst(edge_iter), - GNNPhase::kOther)) { - continue; - } - } - - uint32_t edge_dst = partitioned_graph_->getEdgeDst(edge_iter); - // if here, it means edge accepted; set sampled on, mark - // as part of next set - MakeEdgeSampled(edge_iter, sample_layer_num); - if (!IsInSampledGraph(edge_dst)) { - bitset_sample_flag_.set(edge_dst); - } - bitset_sampled_degrees_.set(*src_iter); - definitely_sampled_nodes_.set(*src_iter); - definitely_sampled_nodes_.set(edge_dst); - // degree increment - sampled_out_degrees_[sample_layer_num][*src_iter]++; - } - } - } - }, - galois::steal(), galois::loopname("NeighborhoodSample")); - - // update nodes, then communicate update to all hosts so that they can - // continue the exploration - galois::do_all( - galois::iterate(size_t{0}, bitset_sample_flag_.size()), - [&](uint32_t new_node_id) { - if (bitset_sample_flag_.test(new_node_id)) { - SetSampledNode(new_node_id); - } - }, - galois::loopname("NeighborhoodSampleSet")); - - // why not read source? 
even if it doesn't need to sample anything, it needs - // to know that it's active so that subgraph construction can proceed - // correctly - if (use_timer_) { - sync_substrate_ - ->sync( - "SampleFlag"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - // count sampled node size - galois::GAccumulator local_sample_count; - local_sample_count.reset(); - // count # of seed nodes - galois::do_all(galois::iterate(begin(), end()), [&](const NodeIterator& x) { - if (IsInSampledGraph(x)) { - local_sample_count += 1; - if (sample_node_timestamps_[*x] == std::numeric_limits::max()) { - if (x < end_owned()) { - // owned nodes that are activated on other hosts shoudl always - // be activated because it's responsible for keeping others in - // sync during comms; ignoring it = bad - // TODO(gluon) make it so you don't have to deal with this - // and just use host as a reducer point - definitely_sampled_nodes_.set(*x); - } - sample_node_timestamps_[*x] = timestamp; - } - } - }); - - DisableSubgraphChooseAll(); - return local_sample_count.reduce(); -} - -//! Construct the subgraph from sampled edges and corresponding nodes -std::vector -galois::graphs::GNNGraph::ConstructSampledSubgraph(size_t num_sampled_layers, - bool use_view) { - // false first so that the build process can use functions to access the - // real graph - DisableSubgraph(); - - gnn_sampled_out_degrees_ = &sampled_out_degrees_; - - // first, sync the degres of the sampled edges across all hosts - // read any because destinations need it to for reverse phase - if (use_timer_) { - sync_substrate_ - ->sync( - "SubgraphDegree"); - } else { - sync_substrate_ - ->sync( - "Ignore"); - } - - galois::StatTimer offsets_n_rows_time("OffsetRowSubgraphTime"); - offsets_n_rows_time.start(); - galois::do_all( - galois::iterate(begin(), end()), - [&](const NodeIterator& x) { - if (IsActiveInSubgraph(*x)) { - if (sample_node_timestamps_[*x] != - std::numeric_limits::max()) { - if (*x < *end_owned()) { - // master - master_offset_accum_[sample_node_timestamps_[*x]] += 1; - } else { - // mirror - mirror_offset_accum_[sample_node_timestamps_[*x]] += 1; - } - } else { - GALOIS_LOG_FATAL( - "should have been timestamped at some point if active"); - } - } - }, - galois::loopname("MasterMirrorOffset")); - - std::vector new_rows(master_offset_accum_.size()); - for (unsigned i = 0; i < master_offset_accum_.size(); i++) { - sample_master_offsets_[i] = master_offset_accum_[i].reduce(); - sample_mirror_offsets_[i] = mirror_offset_accum_[i].reduce(); - new_rows[i] = sample_master_offsets_[i] + sample_mirror_offsets_[i]; - if (i > 0) { - new_rows[i] += new_rows[i - 1]; - } - } - - offsets_n_rows_time.stop(); - - if (!use_view) { - subgraph_->BuildSubgraph(*this, num_sampled_layers); - } else { - // a view only has lid<->sid mappings - subgraph_->BuildSubgraphView(*this, num_sampled_layers); - } - - sync_substrate_->SetupSubgraphMirrors(subgraph_->GetSubgraphMirrors(), - use_timer_); - - // after this, this graph is a subgraph - if (!use_view) { - use_subgraph_ = true; - } else { - use_subgraph_view_ = true; - } - - return new_rows; -} - -size_t galois::graphs::GNNGraph::PrepareNextTrainMinibatch() { - train_batcher_->GetNextMinibatch(&local_minibatch_mask_); -#ifndef NDEBUG - size_t count = 0; - // galois::gPrint("Minibatch : "); - for (unsigned i = 0; i < local_minibatch_mask_.size(); i++) { - if (local_minibatch_mask_[i]) { - // galois::gPrint(partitioned_graph_->getGID(i), ","); - count++; - } - } - // galois::gPrint("\n"); - 
galois::gInfo(host_prefix(), "Batched nodes ", count); -#endif - return SetupNeighborhoodSample(GNNPhase::kBatch); -} - -size_t galois::graphs::GNNGraph::PrepareNextTestMinibatch() { - test_batcher_->GetNextMinibatch(&local_minibatch_mask_); - return SetupNeighborhoodSample(GNNPhase::kBatch); -} - -//////////////////////////////////////////////////////////////////////////////// - -#ifdef GALOIS_ENABLE_GPU -void galois::graphs::GNNGraph::InitGPUMemory() { - // create int casted CSR - uint64_t* e_index_ptr = partitioned_graph_->row_start_ptr(); - uint32_t* e_dest_ptr = partitioned_graph_->edge_dst_ptr(); - - // + 1 because first element is 0 in BLAS CSRs - std::vector e_index(partitioned_graph_->size() + 1); - std::vector e_dest(partitioned_graph_->sizeEdges()); - - // set in parallel - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->size() + 1), - [&](size_t index) { - if (index != 0) { - if (e_index_ptr[index - 1] > - static_cast(std::numeric_limits::max())) { - GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", - e_index_ptr[index - 1]); - } - e_index[index] = static_cast(e_index_ptr[index - 1]); - } else { - e_index[index] = 0; - } - }, - galois::loopname("GPUEdgeIndexConstruction")); - galois::do_all( - galois::iterate(static_cast(0), partitioned_graph_->sizeEdges()), - [&](size_t edge) { - if (e_dest_ptr[edge] > - static_cast(std::numeric_limits::max())) { - GALOIS_LOG_FATAL("{} is too big a number for int arrays on GPUs", - e_dest_ptr[edge]); - } - - e_dest[edge] = static_cast(e_dest_ptr[edge]); - }, - galois::loopname("GPUEdgeDestConstruction")); - - gpu_memory_.SetGraphTopology(e_index, e_dest); - e_index.clear(); - e_dest.clear(); - - gpu_memory_.SetFeatures(local_node_features_, node_feature_length_); - gpu_memory_.SetLabels(local_ground_truth_labels_); - gpu_memory_.SetMasks(local_training_mask_, local_validation_mask_, - local_testing_mask_); - gpu_memory_.AllocAggregateBitset(partitioned_graph_->size()); - gpu_memory_.SetGlobalTrainDegrees(global_train_degrees_); - gpu_memory_.SetGlobalDegrees(global_degrees_); -} - -void galois::graphs::GNNGraph::InitLayerVectorMetaObjects( - size_t layer_number, unsigned num_hosts, size_t infl_in_size, - size_t infl_out_size) { - init_CUDA_layer_vector_meta_obj(cuda_ctx_, layer_number, num_hosts, size(), - infl_in_size, infl_out_size); -} - -void galois::graphs::GNNGraph::ResizeGPULayerVector(size_t num_layers) { - resize_CUDA_layer_vector(cuda_ctx_, num_layers); -} -#endif -void galois::graphs::GNNGraph::ContiguousRemap(const std::string& new_name) { - node_remapping_.resize(partitioned_graph_->size()); - - uint32_t new_node_id = 0; - - // serial loops because new ID needs to be kept consistent - // first, train nodes - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kTrain)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Train nodes are from 0 to ", new_node_id); - - // second, val nodes - uint32_t val_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kValidate)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Val nodes are from ", val_start, " to ", new_node_id, "(", - new_node_id - val_start, ")"); - - // third, test nodes - uint32_t test_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kTest)) { 
- node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Test nodes are from ", test_start, " to ", new_node_id, "(", - new_node_id - test_start, ")"); - - // last, everything else - uint32_t other_start = new_node_id; - for (size_t cur_node = 0; cur_node < partitioned_graph_->size(); cur_node++) { - if (IsValidForPhase(cur_node, GNNPhase::kOther)) { - node_remapping_[new_node_id++] = cur_node; - } - } - galois::gInfo("Other nodes are from ", other_start, " to ", new_node_id, "(", - new_node_id - other_start, ")"); - GALOIS_LOG_ASSERT(new_node_id == partitioned_graph_->size()); - - // remap features to match new node mapping, save to disk - // std::vector remapped_features(local_node_features_.size()); - //// do all works because can copy in parallel - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // std::memcpy( - // &(remapped_features[remap_node_id * node_feature_length_]), - // &((local_node_features_.data())[node_remapping_[remap_node_id] * - // node_feature_length_]), node_feature_length_ * sizeof(GNNFeature)); - // } - //); - //// sanity check - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // for (size_t i = 0; i < node_feature_length_; i++) { - // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * - // node_feature_length_ + i] == - // local_node_features_[node_remapping_[remap_node_id] - // * node_feature_length_ + i]); - // } - // } - //); - //// save to disk - // std::ofstream write_file_stream; - // std::string feature_file = input_directory_ + new_name + "-feats.bin"; - // galois::gPrint(feature_file, "\n"); - // write_file_stream.open(feature_file, std::ios::binary | std::ios::out); - // write_file_stream.write((char*)remapped_features.data(), sizeof(GNNFeature) - // * - // partitioned_graph_->size() - // * node_feature_length_); - // write_file_stream.close(); - - // std::ifstream file_stream; - // file_stream.open(feature_file, std::ios::binary | std::ios::in); - // file_stream.read((char*)remapped_features.data(), sizeof(GNNFloat) * - // partitioned_graph_->size() - // * node_feature_length_); - // file_stream.close(); - //// sanity check again - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // for (size_t i = 0; i < node_feature_length_; i++) { - // GALOIS_LOG_ASSERT(remapped_features[remap_node_id * - // node_feature_length_ + i] == - // local_node_features_[node_remapping_[remap_node_id] - // * node_feature_length_ + i]); - // } - // } - //); - // remapped_features.clear(); - - // std::vector remapped_labels(local_ground_truth_labels_.size()); - //// save new labels order to disk (binary file) - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // remapped_labels[remap_node_id] = - // local_ground_truth_labels_[node_remapping_[remap_node_id]]; - // } - //); - - // std::string label_filename = input_directory_ + new_name + "-labels.bin"; - // std::ofstream label_write_stream; - // label_write_stream.open(label_filename, std::ios::binary | std::ios::out); - // label_write_stream.write((char*)remapped_labels.data(), sizeof(GNNLabel) * - // partitioned_graph_->size()); - // label_write_stream.close(); - - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // remapped_labels[remap_node_id] = - // 
local_ground_truth_labels_[remap_node_id]; - // } - //); - // ReadLocalLabelsBin(new_name); - // galois::do_all( - // galois::iterate(size_t{0}, partitioned_graph_->size()), - // [&] (size_t remap_node_id) { - // GALOIS_LOG_ASSERT(local_ground_truth_labels_[remap_node_id] == - // remapped_labels[node_remapping_[remap_node_id]]); - // } - //); - // save the mapping to a binary file for use by graph convert to deal with - // the gr - std::string label_filename = input_directory_ + new_name + "-mapping.bin"; - std::ofstream label_write_stream; - label_write_stream.open(label_filename, std::ios::binary | std::ios::out); - label_write_stream.write((char*)node_remapping_.data(), - sizeof(uint32_t) * node_remapping_.size()); - label_write_stream.close(); -} +}; // namespace graphs +}; // namespace galois diff --git a/libgnn/src/graphs/GNNSubgraph.cpp b/libgnn/src/graphs/GNNSubgraph.cpp index f2148b2706..3bea1063c8 100644 --- a/libgnn/src/graphs/GNNSubgraph.cpp +++ b/libgnn/src/graphs/GNNSubgraph.cpp @@ -1,441 +1 @@ #include "galois/graphs/GNNGraph.h" -#include - -size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraph( - GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("BuildSubgraph", kRegionName); - TimerStart(&timer); - for (auto& vec : subgraph_mirrors_) { - vec.clear(); - } - CreateSubgraphMapping(gnn_graph, num_sampled_layers); - if (num_subgraph_nodes_ == 0) { - return 0; - } - DegreeCounting(gnn_graph); - EdgeCreation(gnn_graph); - NodeFeatureCreation(gnn_graph); - // loop over each node, grab out/in edges, construct them in LC_CSR_CSC - // no edge data, just topology - TimerStop(&timer); - return num_subgraph_nodes_; -} - -size_t galois::graphs::GNNGraph::GNNSubgraph::BuildSubgraphView( - GNNGraph& gnn_graph, size_t num_sampled_layers) { - galois::StatTimer timer("BuildSubgraphView", kRegionName); - TimerStart(&timer); - CreateSubgraphMapping(gnn_graph, num_sampled_layers); - NodeFeatureCreation(gnn_graph); - TimerStop(&timer); - return num_subgraph_nodes_; -} - -// TODO signature cleanup -void galois::graphs::GNNGraph::GNNSubgraph::CreateSubgraphMapping( - GNNGraph& gnn_graph, size_t) { - galois::StatTimer timer("SIDMapping", kRegionName); - TimerStart(&timer); - - assert(gnn_graph.size() == lid_to_subgraph_id_.size()); - // clear all mappings - galois::ParallelSTL::fill(lid_to_subgraph_id_.begin(), - lid_to_subgraph_id_.end(), - std::numeric_limits::max()); - - galois::GAccumulator subgraph_count; - subgraph_count.reset(); - galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), - [&](uint32_t node_id) { - if (gnn_graph.IsActiveInSubgraph(node_id)) { - subgraph_count += 1; - } - }); - num_subgraph_nodes_ = subgraph_count.reduce(); - // if no subgraph, get out - if (num_subgraph_nodes_ == 0) { - subgraph_master_boundary_ = 0; - TimerStop(&timer); - return; - } - - // checking sanity - // galois::do_all(galois::iterate(gnn_graph.begin(), gnn_graph.end()), - // [&](uint32_t node_id) { - // if (gnn_graph.IsInSampledGraph(node_id) && - // !gnn_graph.IsActiveInSubgraph(node_id)) { - // // check if any edges are active - // for (auto a = gnn_graph.edge_begin(node_id); a != - // gnn_graph.edge_end(node_id);a++) { - // if (gnn_graph.IsEdgeSampledAny(a)) { - // galois::gWarn("ERROR node ", node_id); - // } - // } - // for (auto a = gnn_graph.in_edge_begin(node_id); a != - // gnn_graph.in_edge_end(node_id);a++) { - // if (gnn_graph.IsInEdgeSampledAny(a)) { - // galois::gWarn("ERROR in node ", node_id); - // } - // } - // } - // }); - - if 
(subgraph_id_to_lid_.size() < num_subgraph_nodes_) { - // allocate a bit more than necessary to avoid a big realloc - // if node value changes slightly later - subgraph_id_to_lid_.resize(num_subgraph_nodes_ * 1.02); - } - - // bitset to mark if a master is outside the "master only" boundary - // and not contiguous; needed to mask out non-masters - galois::DynamicBitSet& non_layer_zero_masters = - gnn_graph.GetNonLayerZeroMasters(); - // init the bitset as necessary - if (non_layer_zero_masters.size() < num_subgraph_nodes_) { - non_layer_zero_masters.resize(num_subgraph_nodes_); - } else { - non_layer_zero_masters.ParallelReset(); - } - - std::vector& master_offsets = gnn_graph.GetMasterOffsets(); - std::vector& mirror_offsets = gnn_graph.GetMirrorOffsets(); - - ResetSIDThreadOffsets(master_offsets.size()); - - // compute offsets for each layer - galois::PODResizeableArray layer_offsets; - layer_offsets.resize(master_offsets.size() - 1); - for (unsigned i = 0; i < layer_offsets.size(); i++) { - layer_offsets[i] = master_offsets[i] + mirror_offsets[i]; - if (i > 0) { - // prefix summing - layer_offsets[i] += layer_offsets[i - 1]; - } - } - - // all nodes before this SID are master nodes in layer 0; - // NOTE: there are master nodes past this boundary that will - // not be covered by a begin_owned loop, which may cause problems down - // the line; this is handled by the bitset above - subgraph_master_boundary_ = master_offsets[0]; - - size_t last_owned_node = *(gnn_graph.end_owned()); - // compute amount of work each thread needs to do - galois::on_each([&](size_t thread_id, size_t num_threads) { - unsigned start_node; - unsigned end_node; - // this thread always has a set number of nodes to run; this is it - std::tie(start_node, end_node) = galois::block_range( - size_t{0}, gnn_graph.size(), thread_id, num_threads); - // these arrays track how much work will need to be done by this - // thread - galois::PODResizeableArray& my_offsets = - sid_thread_offsets_[thread_id]; - galois::PODResizeableArray& my_mirror_offsets = - subgraph_mirror_offsets_[thread_id]; - - for (size_t local_node_id = start_node; local_node_id < end_node; - local_node_id++) { - // only bother if node was active - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - // TODO(loc) this check shouldn't even be necessary; active in subgraph - // implies added at somepoint - if (node_timestamp != std::numeric_limits::max()) { - // tracks how many nodes for each timestamp this node will - // work with by incrementing this - my_offsets[node_timestamp]++; - - if (local_node_id >= last_owned_node) { - // this is a mirror node; get the host that the master is located - // on and increment this thread's mirror node count for that host - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; - } - } else { - GALOIS_LOG_WARN("shouldn't ever get here right?"); - } - } - } - }); - - // prefix sum the threads - galois::do_all(galois::iterate(size_t{0}, master_offsets.size()), - [&](size_t layer_num) { - for (size_t thread_id = 1; - thread_id < galois::getActiveThreads(); thread_id++) { - sid_thread_offsets_[thread_id][layer_num] += - sid_thread_offsets_[thread_id - 1][layer_num]; - } - }); - - for (unsigned i = 0; i < master_offsets.size() - 1; i++) { - if (i > 0) { - GALOIS_LOG_VASSERT( - sid_thread_offsets_[galois::getActiveThreads() - 1][i] + - layer_offsets[i - 1] == - (layer_offsets[i]), - "layer {} wrong {} 
vs correct {}", i, - sid_thread_offsets_[galois::getActiveThreads() - 1][i], - layer_offsets[i]); - } else { - GALOIS_LOG_VASSERT( - sid_thread_offsets_[galois::getActiveThreads() - 1][i] == - (layer_offsets[i]), - "layer {} wrong {} vs correct {}", i, - sid_thread_offsets_[galois::getActiveThreads() - 1][i], - layer_offsets[i]); - } - } - - // last element of prefix sum needs to equal the correct layer offset - galois::do_all( - galois::iterate(uint32_t{0}, - galois::runtime::getSystemNetworkInterface().Num), - [&](size_t host_num) { - // for each host, get prefix sum of each thread's mirrors - for (size_t thread_id = 1; thread_id < galois::getActiveThreads(); - thread_id++) { - subgraph_mirror_offsets_[thread_id][host_num] += - subgraph_mirror_offsets_[thread_id - 1][host_num]; - } - }); - - // allocate the mirror space; last element of prefix sum is total size - for (unsigned host_num = 0; - host_num < galois::runtime::getSystemNetworkInterface().Num; - host_num++) { - if (galois::runtime::getSystemNetworkInterface().ID == host_num) { - continue; - } - subgraph_mirrors_[host_num].resize( - subgraph_mirror_offsets_[galois::getActiveThreads() - 1][host_num]); - } - - galois::on_each([&](size_t thread_id, size_t num_threads) { - unsigned start_node; - unsigned end_node; - std::tie(start_node, end_node) = galois::block_range( - size_t{0}, gnn_graph.size(), thread_id, num_threads); - - galois::PODResizeableArray& current_thread_offset = - thread_id != 0 ? sid_thread_offsets_[thread_id - 1] : thread_zero_work_; - galois::PODResizeableArray& my_mirror_offsets = - thread_id != 0 ? subgraph_mirror_offsets_[thread_id - 1] - : thread_zero_mirror_offsets_; - - for (size_t local_node_id = start_node; local_node_id < end_node; - local_node_id++) { - if (gnn_graph.IsActiveInSubgraph(local_node_id)) { - unsigned node_timestamp = gnn_graph.SampleNodeTimestamp(local_node_id); - if (node_timestamp != std::numeric_limits::max()) { - uint32_t sid_to_use; - if (node_timestamp != 0) { - sid_to_use = layer_offsets[node_timestamp - 1] + - current_thread_offset[node_timestamp]++; - if (local_node_id < last_owned_node) { - // master node that is not in layer 0 (i.e. node_timestamp != 0) - non_layer_zero_masters.set(sid_to_use); - } - } else { - // node timestamp == 0; no layer offset needed because offset - // is 0 - sid_to_use = current_thread_offset[node_timestamp]++; - } - - // this is a mirror - if (local_node_id >= last_owned_node) { - // XXX(loc) mirror offsets - uint32_t node_gid = gnn_graph.GetGID(local_node_id); - size_t my_offset = - my_mirror_offsets[gnn_graph.GetHostID(node_gid)]++; - - if (my_offset > - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()) - GALOIS_LOG_FATAL( - "{} {}", my_offset, - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)].size()); - - subgraph_mirrors_[gnn_graph.GetHostID(node_gid)][my_offset] = - node_gid; - } - - subgraph_id_to_lid_[sid_to_use] = local_node_id; - lid_to_subgraph_id_[local_node_id] = sid_to_use; - } else { - GALOIS_LOG_WARN("shouldn't ever get here right?"); - } - } - } - }); - - TimerStop(&timer); -} - -// TODO optimize further? 
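The DegreeCounting and EdgeCreation steps below follow the standard two-pass CSR construction: count the sampled out/in degree of every subgraph node, prefix-sum those counts into end offsets, then scatter each sampled edge at its source node's running cursor. The following is a self-contained toy version using plain vectors rather than the LC_CSR_CSC graph, with made-up edges.

#include <cstdio>
#include <utility>
#include <vector>

int main() {
  // edges of a tiny sampled subgraph as (src, dst) pairs over 3 nodes
  std::vector<std::pair<unsigned, unsigned>> edges = {{0, 1}, {0, 2}, {2, 0}};
  const unsigned n = 3;

  std::vector<unsigned> degree(n, 0);
  for (auto& e : edges) degree[e.first]++;   // pass 1: count degrees

  std::vector<unsigned> end_offset(degree);  // pass 2: inclusive prefix sum
  for (unsigned i = 1; i < n; i++) end_offset[i] += end_offset[i - 1];

  std::vector<unsigned> out_dsts(edges.size());
  std::vector<unsigned> cursor(n, 0);
  for (auto& e : edges) {                    // pass 3: scatter edges
    unsigned base = (e.first == 0) ? 0 : end_offset[e.first - 1];
    out_dsts[base + cursor[e.first]++] = e.second;
  }

  for (unsigned v = 0; v < n; v++) {
    unsigned begin = (v == 0) ? 0 : end_offset[v - 1];
    std::printf("node %u edges:", v);
    for (unsigned e = begin; e < end_offset[v]; e++)
      std::printf(" ->%u", out_dsts[e]);
    std::printf("\n");
  }
  return 0;
}

The real code does the same thing twice (once for out-edges, once for in-edges via FixEndInEdge/ConstructInEdge) and additionally records, per placed edge, the index of the original graph edge it came from.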
-void galois::graphs::GNNGraph::GNNSubgraph::DegreeCounting(
-    const GNNGraph& gnn_graph) {
-  galois::StatTimer timer("DegreeCounting", kRegionName);
-  TimerStart(&timer);
-
-  if (local_subgraph_out_degrees_.size() < num_subgraph_nodes_) {
-    local_subgraph_out_degrees_.resize(num_subgraph_nodes_ * 1.02);
-  }
-
-  if (local_subgraph_in_degrees_.size() < num_subgraph_nodes_) {
-    local_subgraph_in_degrees_.resize(num_subgraph_nodes_ * 1.02);
-  }
-
-  galois::do_all(
-      galois::iterate(begin(), end()),
-      [&](uint32_t subgraph_id) {
-        uint32_t node_id = subgraph_id_to_lid_[subgraph_id];
-        uint32_t out_degrees = 0;
-        for (auto out_edge_iter : gnn_graph.edges(node_id)) {
-          if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) {
-            out_degrees++;
-          }
-        }
-        local_subgraph_out_degrees_[subgraph_id] = out_degrees;
-
-        uint32_t in_degrees = 0;
-        for (auto in_edge_iter : gnn_graph.in_edges(node_id)) {
-          if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) {
-            in_degrees++;
-          }
-        }
-        local_subgraph_in_degrees_[subgraph_id] = in_degrees;
-      },
-      galois::loopname("DegreeCountingDoAll"), galois::steal());
-
-  TimerStop(&timer);
-}
-
-// TODO optimize further?
-void galois::graphs::GNNGraph::GNNSubgraph::EdgeCreation(
-    const GNNGraph& gnn_graph) {
-  galois::StatTimer timer("EdgeConstruction", kRegionName);
-  TimerStart(&timer);
-  // galois::DGAccumulator empty_masters;
-  // galois::DGAccumulator empty_mirrors;
-  // empty_masters.reset();
-  // empty_mirrors.reset();
-
-  // galois::DGAccumulator total_sn;
-  // total_sn.reset();
-  // total_sn += num_subgraph_nodes_;
-  // size_t global_sub_size = total_sn.reduce();
-
-  // prefix sum over subgraph degrees from previous phase to get starting points
-  for (size_t i = 1; i < num_subgraph_nodes_; i++) {
-    // if (local_subgraph_out_degrees_[i] == 0 &&
-    //     local_subgraph_in_degrees_[i] == 0) {
-    //   if (i < subgraph_master_boundary_) {
-    //     empty_masters += 1;
-    //   } else {
-    //     if (gnn_graph.GetNonLayerZeroMasters().test(i)) {
-    //       empty_masters += 1;
-    //     } else {
-    //       empty_mirrors += 1;
-    //     }
-    //   }
-    //}
-    local_subgraph_out_degrees_[i] += local_subgraph_out_degrees_[i - 1];
-    local_subgraph_in_degrees_[i] += local_subgraph_in_degrees_[i - 1];
-  }
-
-  // uint32_t emaster = empty_masters.reduce();
-  // uint32_t emirror = empty_mirrors.reduce();
-  // if (gnn_graph.host_id() == 0) {
-  //   galois::gInfo("Empty masters percent is ", emaster /
-  //   (float)global_sub_size,
-  //                 " ", emaster, " ", global_sub_size);
-  //   galois::gInfo("Empty mirrors percent is ", emirror /
-  //   (float)global_sub_size,
-  //                 " ", emirror, " ", global_sub_size);
-  //}
-
-  // allocate then set node endpoints
-  num_subgraph_edges_ = local_subgraph_out_degrees_[num_subgraph_nodes_ - 1];
-
-  galois::StatTimer alloc_time("EdgeCreationAlloc", kRegionName);
-  TimerStart(&alloc_time);
-  underlying_graph_.DeallocateOnly();
-  underlying_graph_.allocateFrom(num_subgraph_nodes_, num_subgraph_edges_);
-  underlying_graph_.CSCAllocate();
-  TimerStop(&alloc_time);
-
-  galois::gInfo(gnn_graph.host_prefix(), "Subgraph nodes and edges are ",
-                num_subgraph_nodes_, " ", num_subgraph_edges_);
-
-  galois::do_all(galois::iterate(uint32_t{0}, num_subgraph_nodes_),
-                 [&](uint32_t subgraph_id) {
-                   underlying_graph_.fixEndEdge(
-                       subgraph_id, local_subgraph_out_degrees_[subgraph_id]);
-                   underlying_graph_.FixEndInEdge(
-                       subgraph_id, local_subgraph_in_degrees_[subgraph_id]);
-                 });
-  if (subedge_to_original_edge_.size() < num_subgraph_edges_) {
-    subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02);
-  }
-  if (in_subedge_to_original_edge_.size() < num_subgraph_edges_) {
-    in_subedge_to_original_edge_.resize(num_subgraph_edges_ * 1.02);
-  }
-
-  // save edges + save reference to layer sample status
-  galois::do_all(
-      galois::iterate(begin(), end()),
-      [&](uint32_t subgraph_id) {
-        uint32_t node_id = subgraph_id_to_lid_[subgraph_id];
-        assert(subgraph_id != std::numeric_limits::max());
-        uint32_t out_location = 0;
-        uint32_t in_location = 0;
-        if (subgraph_id != 0) {
-          out_location = local_subgraph_out_degrees_[subgraph_id - 1];
-          in_location = local_subgraph_in_degrees_[subgraph_id - 1];
-        }
-
-        for (auto out_edge_iter : gnn_graph.edges(node_id)) {
-          if (gnn_graph.IsEdgeSampledAny(out_edge_iter)) {
-            assert(lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)] !=
-                   std::numeric_limits::max());
-            subedge_to_original_edge_[out_location] = *out_edge_iter;
-
-            underlying_graph_.constructEdge(
-                out_location++,
-                lid_to_subgraph_id_[gnn_graph.GetEdgeDest(out_edge_iter)]);
-          }
-        }
-
-        for (auto in_edge_iter : gnn_graph.in_edges(node_id)) {
-          if (gnn_graph.IsInEdgeSampledAny(in_edge_iter)) {
-            in_subedge_to_original_edge_[in_location] =
-                *(gnn_graph.InEdgeToOutEdge(in_edge_iter));
-            underlying_graph_.ConstructInEdge(
-                in_location++,
-                lid_to_subgraph_id_[gnn_graph.GetInEdgeDest(in_edge_iter)]);
-          }
-        }
-        assert(out_location == local_subgraph_out_degrees_[subgraph_id]);
-        assert(in_location == local_subgraph_in_degrees_[subgraph_id]);
-      },
-      galois::loopname("EdgeCreationDoAll"), galois::steal());
-  TimerStop(&timer);
-}
-
-void galois::graphs::GNNGraph::GNNSubgraph::NodeFeatureCreation(
-    GNNGraph& gnn_graph) {
-  galois::StatTimer timer("NodeFeatureCreation", kRegionName);
-  TimerStart(&timer);
-  size_t feat_length = gnn_graph.node_feature_length();
-  subgraph_node_features_.resize(feat_length * num_subgraph_nodes_);
-
-  galois::do_all(galois::iterate(begin(), end()), [&](size_t subgraph_node_id) {
-    size_t local_id = subgraph_id_to_lid_[subgraph_node_id];
-    std::memcpy(
-        &(subgraph_node_features_[subgraph_node_id * feat_length]),
-        &((gnn_graph.GetLocalFeatures().data())[local_id * feat_length]),
-        feat_length * sizeof(GNNFeature));
-  });
-  TimerStop(&timer);
-}
diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp
index eed3143a01..8b13789179 100644
--- a/libgnn/src/layers/DenseLayer.cpp
+++ b/libgnn/src/layers/DenseLayer.cpp
@@ -1,145 +1 @@
-#include "galois/Logging.h"
-#include "galois/GNNMath.h"
-#include "galois/layers/DenseLayer.h"
-galois::DenseLayer::DenseLayer(
-    size_t layer_num, const galois::graphs::GNNGraph& graph,
-    PointerWithSize* backward_output_matrix,
-    const GNNLayerDimensions& dimensions, const GNNLayerConfig& config)
-    : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config),
-      input_column_intermediates_(dimensions.input_columns),
-      output_column_intermediates_(dimensions.output_columns) {
-  // TODO Need to make sure that layer knows about forward/backward matrix
-  // sharing (e.g., overwriting previously used input to save space)
-  GALOIS_LOG_FATAL("This layer has not been kept up to date; do not use until "
-                   "sure it's been updated");
-  size_t num_input_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.input_columns;
-  in_temp_1_.resize(num_input_elements, 0);
-  size_t num_output_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.output_columns;
-  GALOIS_LOG_VERBOSE("Output elements {}", num_output_elements);
-  layer_type_ = galois::GNNLayerType::kDense;
-  p_in_temp_1_ = PointerWithSize(in_temp_1_);
-  GALOIS_LOG_VERBOSE("Dense initialized");
-}
-
-const galois::PointerWithSize
-galois::DenseLayer::ForwardPhase(
-    const galois::PointerWithSize input_embeddings) {
-  GALOIS_LOG_VERBOSE("Calling forward phase");
-  assert(input_embeddings.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.input_columns));
-  assert(p_in_temp_1_.size() == input_embeddings.size());
-  assert(p_forward_output_matrix_.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.output_columns));
-  // pointer to input to operate on
-  const GNNFloat* input_data = input_embeddings.data();
-  // first, dropout
-  if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) {
-    DoDropout(input_embeddings, &p_in_temp_1_);
-    input_data = p_in_temp_1_.data();
-  }
-
-  // FW
-  UpdateEmbeddings(input_data, p_forward_output_matrix_.data());
-
-  if (!config_.disable_activation) {
-    GALOIS_LOG_VERBOSE("Doing activation");
-    Activation();
-  }
-
-  assert(p_forward_output_matrix_.size() ==
-         (layer_dimensions_.input_rows * layer_dimensions_.output_columns));
-  return p_forward_output_matrix_;
-}
-
-galois::PointerWithSize galois::DenseLayer::BackwardPhase(
-    galois::PointerWithSize prev_layer_input,
-    galois::PointerWithSize* input_gradient) {
-  assert(layer_phase_ == GNNPhase::kTrain);
-
-  // derivative of activation
-  if (!config_.disable_activation) {
-    ActivationDerivative(input_gradient);
-  }
-
-  if (layer_number_ != 0) {
-    // derivative for update
-    // backout = F'
-    UpdateEmbeddingsDerivative(input_gradient->data(),
-                               p_backward_output_matrix_.data());
-  }
-
-  galois::PointerWithSize input_data;
-  if (!config_.disable_dropout) {
-    // dropout result is currently stored in temp 1
-    // needs to be used before it gets overwritten
-    input_data = p_in_temp_1_;
-  } else {
-    // no dropout = use vanilla input
-    input_data = prev_layer_input;
-  }
-
-  // W' = F^T (FW)'
-  galois::CBlasSGEMM(CblasTrans, CblasNoTrans, layer_dimensions_.input_columns,
-                     layer_dimensions_.input_rows,
-                     layer_dimensions_.output_columns, input_data.data(),
-                     input_gradient->data(), p_layer_weight_gradients_.data());
-  // sync weight gradients; note aggregation sync occurs in the function call
-  // already
-  WeightGradientSyncSum();
-
-  if (!config_.disable_dropout && layer_number_ != 0) {
-    DoDropoutDerivative();
-  }
-
-  return p_backward_output_matrix_;
-}
-
-void galois::DenseLayer::UpdateEmbeddings(const GNNFloat* node_embeddings,
-                                          GNNFloat* output) {
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    /* TODO(lhc) implement this
-    gpu_object_.UpdateEmbeddingsGPU(
-        layer_dimensions_.input_rows, layer_dimensions_.input_columns,
-        layer_dimensions_.output_columns, node_embeddings,
-        base_gpu_object_.layer_weights(), output);
-    */
-  } else {
-#endif
-    // CPU version is just a call into CBlas
-    galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows,
-                       layer_dimensions_.input_columns,
-                       layer_dimensions_.output_columns, node_embeddings,
-                       layer_weights_.data(), output);
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
-
-void galois::DenseLayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients,
-                                                    GNNFloat* output) {
-  assert(p_layer_weights_.size() ==
-         layer_dimensions_.input_columns * layer_dimensions_.output_columns);
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    /* TODO(lhc) implement this
-    gpu_object_.UpdateEmbeddingsDerivativeGPU(
-        layer_dimensions_.input_rows, layer_dimensions_.input_columns,
-        layer_dimensions_.output_columns, gradients,
-        base_gpu_object_.layer_weights(), output);
-    */
-  } else {
-#endif
-    // difference is Trans for B matrix (data) to get z by y (weights is y by z
-    // normally); result is x by y
-    galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows,
-                       layer_dimensions_.output_columns,
-                       layer_dimensions_.input_columns, gradients,
-                       layer_weights_.data(), output);
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp
index 82a864a41d..8b13789179 100644
--- a/libgnn/src/layers/GNNLayer.cpp
+++ b/libgnn/src/layers/GNNLayer.cpp
@@ -1,558 +1 @@
-#include "galois/Logging.h"
-#include "galois/layers/GNNLayer.h"
-#include "galois/layers/GradientSyncStructures.h"
-galois::GNNLayer::GNNLayer(size_t layer_num,
-                           const galois::graphs::GNNGraph& graph,
-                           PointerWithSize* backward_output_matrix,
-                           const GNNLayerDimensions& dimensions,
-                           const GNNLayerConfig& config)
-    : layer_number_(layer_num), graph_(graph), layer_dimensions_(dimensions),
-      config_(config) {
-  // TODO(loc)
-  // this is currently a backward-compatibility hack, need to have caller
-  // set output rows rather than created here
-  layer_dimensions_.output_rows = layer_dimensions_.input_rows;
-
-  if (config_.allocate_weights) {
-    // dropout allocation; dropout is same as input
-    if (!config_.disable_dropout) {
-      dropout_mask_.resize(layer_dimensions_.input_rows *
-                               layer_dimensions_.input_columns,
-                           false);
-    }
-    // allocate memory based on layer dimensions
-    size_t num_weight_elements =
-        layer_dimensions_.input_columns * layer_dimensions_.output_columns;
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", layer weights ", num_weight_elements, " (",
-                  FloatElementsToGB(num_weight_elements), " GB)");
-    layer_weights_.resize(num_weight_elements);
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", layer gradients ", num_weight_elements, " (",
-                  FloatElementsToGB(num_weight_elements), " GB)");
-    layer_weight_gradients_.resize(num_weight_elements, 0);
-#ifdef GALOIS_ENABLE_GPU
-    if (device_personality == DevicePersonality::GPU_CUDA) {
-      base_gpu_object_.InitWeightMemory(num_weight_elements);
-      base_gpu_object_.InitDropoutMemory(layer_dimensions_.input_rows *
-                                         layer_dimensions_.input_columns);
-    }
-#endif
-
-    GlorotBengioInit(&layer_weights_);
-  }
-
-  // TODO(loc) optimize this and layer creation in general
-  // this does not use output_rows and assumes the worst case where
-  // all nodes are generated
-  // for now it's kept as input_rows so as to not break things
-  size_t num_output_elements =
-      layer_dimensions_.input_rows * layer_dimensions_.output_columns;
-
-  if (!config_.disable_output) {
-    galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_,
-                  ", forward output matrix ", num_output_elements, " (",
-                  FloatElementsToGB(num_output_elements), " GB)");
-    forward_output_matrix_.resize(num_output_elements, 0);
-  }
-
-  if (layer_number_ != 0) {
-    GALOIS_LOG_VASSERT(
-        backward_output_matrix->size() ==
-            layer_dimensions_.input_rows * layer_dimensions_.input_columns,
-        "backward output size {} should equal input size {}",
-        backward_output_matrix->size(),
-        layer_dimensions_.input_rows * layer_dimensions_.input_columns);
-  } else {
-    GALOIS_LOG_VASSERT(backward_output_matrix->data() == nullptr,
-                       "layer 0 should null ptr backward output");
-    GALOIS_LOG_VASSERT(backward_output_matrix->size() == 0,
-                       "layer 0 should size 0 backward output");
-  }
-
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    base_gpu_object_.InitInOutMemory(num_output_elements,
-                                     layer_dimensions_.input_rows *
-                                         layer_dimensions_.input_columns);
-
-    // initialize the PointerWithSize wrappers
-    p_layer_weights_ = PointerWithSize(
-        base_gpu_object_.layer_weights(), layer_weights_.size());
-    p_layer_weight_gradients_ =
-        PointerWithSize(base_gpu_object_.layer_weight_gradients(),
-                        layer_weight_gradients_.size());
-    p_forward_output_matrix_ = PointerWithSize(
-        base_gpu_object_.forward_output(), forward_output_matrix_.size());
-    p_backward_output_matrix_ = PointerWithSize(
-        base_gpu_object_.backward_output(), backward_output_matrix->size());
-    // TODO can clear the cpu side vectors/don't use .size() since optimally
-    // they aren't initialized
-  } else {
-#endif
-    // initialize the PointerWithSize wrappers
-    p_layer_weights_ = PointerWithSize(layer_weights_);
-    p_layer_weight_gradients_ =
-        PointerWithSize(layer_weight_gradients_);
-    p_forward_output_matrix_ =
-        PointerWithSize(forward_output_matrix_);
-    p_backward_output_matrix_ = *backward_output_matrix;
-#ifdef GALOIS_ENABLE_GPU
-  }
-#endif
-}
-
-void galois::GNNLayer::ResizeOutputMatrix(size_t new_output_row) {
-  size_t num_output_elements =
-      new_output_row * layer_dimensions_.output_columns;
-
-  if (!config_.disable_output &&
-      (forward_output_matrix_.size() < num_output_elements)) {
-    galois::gInfo(graph_.host_prefix(), "Resizing layer ", layer_number_,
-                  ", forward output matrix to ", num_output_elements, " (",
-                  FloatElementsToGB(num_output_elements), " GB)");
-    // resize with a bit of a buffer to prevent possible future resizes
-    size_t buffer_size = (num_output_elements * 0.02);
-    forward_output_matrix_.resize(num_output_elements + buffer_size, 0);
-  }
-
-  // XXX(hochan) GPU end
-#ifdef GALOIS_ENABLE_GPU
-  // XXX(hochan)
-#endif
-  // reinitialize the PointerWithSize wrappers
-  p_forward_output_matrix_ = PointerWithSize(forward_output_matrix_);
-#ifdef GALOIS_ENABLE_GPU
-  // XXX(hochan)
-#endif
-}
-
-void galois::GNNLayer::GlorotBengioInit(std::vector* vector_to_init) {
-  float max = std::sqrt(6.0) / std::sqrt(layer_dimensions_.output_columns +
-                                         layer_dimensions_.input_columns);
-  std::default_random_engine rng(1 + layer_number_);
-  std::uniform_real_distribution dist(-max, max);
-
-  for (size_t i = 0; i < vector_to_init->size(); i++) {
-    (*vector_to_init)[i] = dist(rng);
-  }
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    CopyLayerWeightsToGPU();
-  }
-#endif
-}
-
-void galois::GNNLayer::PairGlorotBengioInit(std::vector* vector1,
-                                            std::vector* vector2) {
-  // multiplied by 2 here because 2 pieces are 1 unit
-  float max =
-      std::sqrt(6.0) / std::sqrt((2 * layer_dimensions_.output_columns) +
-                                 layer_dimensions_.input_columns);
-  assert(vector1->size() ==
-         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
-  assert(vector2->size() ==
-         (layer_dimensions_.input_columns * layer_dimensions_.output_columns));
-  std::default_random_engine rng(1 + layer_number_);
-  std::uniform_real_distribution dist(-max, max);
-
-  for (size_t i = 0; i < vector1->size(); i++) {
-    (*vector1)[i] = dist(rng);
-  }
-  for (size_t i = 0; i < vector2->size(); i++) {
-    (*vector2)[i] = dist(rng);
-  }
-
-#ifdef GALOIS_ENABLE_GPU
-  if (device_personality == DevicePersonality::GPU_CUDA) {
-    CopyLayerWeightsToGPU();
-  }
-#endif
-}
-
-void galois::GNNLayer::RandomInitVector(std::vector* vector_to_init) {
-  galois::do_all(
-      galois::iterate(static_cast(0), vector_to_init->size()),
-      [&](size_t i) {
- // pull from the class's per thread RNG - (*vector_to_init)[i] = random_init_rng_.GetRandomNumber(); - }, - galois::loopname("RandomInitVector")); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - CopyLayerWeightsToGPU(); - } -#endif -} - -void galois::GNNLayer::DoDropoutCPU( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - // TODO This (and dropout in general) may not work in the sampling setting - size_t num_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; - - // determine which parts to drop - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - dropout_mask_[i] = dropout_rng_.DoBernoulli(config_.dropout_rate); - }, - galois::loopname("LayerDropoutRNG")); - - // create new matrix with non-dropped input + some scaling - // TODO save scaling elsewhere? - GNNFloat scale = 1. / (1. - config_.dropout_rate); - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - (*output_matrix)[i] = input_to_dropout[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("LayerDropout")); -} - -void galois::GNNLayer::DoDropout( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - galois::StatTimer timer("ForwardDropout", "GNNLayer"); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.DoDropoutGPU(input_to_dropout, *output_matrix, - config_.dropout_rate); - } else { -#endif - DoDropoutCPU(input_to_dropout, output_matrix); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::ReconstructDropoutMatrix( - const PointerWithSize input_to_dropout, - PointerWithSize* output_matrix) { - galois::StatTimer timer("ReconstructDropoutMatrix", "GNNLayer"); - TimerStart(&timer); - // reuse the dropout mask from a previous dropout call - size_t num_elements = output_matrix->size(); - GNNFloat scale = 1. / (1. - config_.dropout_rate); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ReconstructDropoutMatrixGPU( - input_to_dropout, output_matrix, num_elements, scale); - } else { -#endif - galois::do_all( - galois::iterate(static_cast(0), num_elements), - [&](size_t i) { - (*output_matrix)[i] = input_to_dropout[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("ReconstructDropout")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::DoDropoutDerivative() { - galois::StatTimer timer("BackwardDropout", "GNNLayer"); - TimerStart(&timer); - assert(p_backward_output_matrix_.size() == dropout_mask_.size()); - GNNFloat scale = 1. / (1. 
- config_.dropout_rate); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.DoDropoutDerivativeGPU(p_backward_output_matrix_.size(), - scale); - } else { -#endif - // use dropout mask to figure out derivative - galois::do_all( - galois::iterate(static_cast(0), - p_backward_output_matrix_.size()), - [&](size_t i) { - p_backward_output_matrix_[i] = - p_backward_output_matrix_[i] * - static_cast(dropout_mask_[i]) * scale; - }, - galois::loopname("LayerDropoutDerivative")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::Activation() { - galois::StatTimer timer("ForwardActivation", "GNNLayer"); - TimerStart(&timer); - - // TODO only does relu at the moment; should check user specified activation - // and act accordingly -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ActivationGPU(p_forward_output_matrix_.size()); - } else { -#endif - if (activation_memo_.size() != p_forward_output_matrix_.size()) { - activation_memo_.resize(p_forward_output_matrix_.size()); - } - activation_memo_.reset(); - assert(activation_memo_.size() == p_forward_output_matrix_.size()); - assert(layer_dimensions_.output_rows * layer_dimensions_.output_columns <= - p_forward_output_matrix_.size()); - - galois::do_all(galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - if (p_forward_output_matrix_[i] > 0.0) { - // do nothing, keep value; set the memo though - activation_memo_.set(i); - } else { - p_forward_output_matrix_[i] = 0; - } - }); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::ActivationDerivative( - PointerWithSize* gradient) { - galois::StatTimer timer("BackwardActivation", "GNNLayer"); - TimerStart(&timer); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.ActivationDerivativeGPU(gradient->data(), - gradient->size()); - } else { -#endif - assert(gradient->size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - // TODO only does relu at the moment; should check user specified activation - // and act accordingly - // keep gradient if the original output was greater than 0 - galois::do_all( - galois::iterate(static_cast(0), - layer_dimensions_.output_rows * - layer_dimensions_.output_columns), - [&](size_t i) { - // it was <= 0 before; set back to 0 - if (!activation_memo_.test(i)) { - (*gradient)[i] = 0; - } - }, - galois::loopname("ReLU-Derivative")); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::GNNLayer::WeightGradientSyncSum() { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - TimerStart(&clubbed_timer); - galois::StatTimer t("Sync_WeightGradientsSum", "GNNLayer"); - TimerStart(&t); - int weight_size = static_cast(p_layer_weight_gradients_.size()); - - // TODO(loc) remove this limitation later; can just do a loop over the weight - // matrix - if (p_layer_weight_gradients_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); - } -#ifdef GALOIS_ENABLE_GPU - // TODO(lhc) make this clang option later - bool gpu_direct_enabled = false; - if (device_personality == DevicePersonality::GPU_CUDA && - !gpu_direct_enabled) { - base_gpu_object_.CopyWeightGradientsToCPU(&layer_weight_gradients_); - MPI_Allreduce(MPI_IN_PLACE, 
layer_weight_gradients_.data(), weight_size, - MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); - base_gpu_object_.CopyToWeightGradients(layer_weight_gradients_); - } else { -#endif - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&t); - TimerStop(&clubbed_timer); -} - -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, - size_t max_rows) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.input_columns; - assert(start_node * row_index <= input->size()); - assert(end_node * row_index <= input->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - }, - galois::loopname("MaskInputNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskInputNonMasters(PointerWithSize* input, - size_t max_rows, - const galois::DynamicBitSet& bs) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.input_columns; - assert(start_node * row_index <= input->size()); - assert(end_node * row_index <= input->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(input, start_node, end_node, row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - if (!bs.test(non_master)) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*input)[non_master * row_index + i] = 0; - } - } - }, - galois::loopname("MaskInputNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient, size_t max_rows) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.output_columns; - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - assert(start_node * row_index <= gradient->size()); - assert(end_node * row_index <= gradient->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, - row_index); - } else { -#endif - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - }, - 
galois::loopname("MaskGradientNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::GNNLayer::MaskGradientNonMasters( - PointerWithSize* gradient, size_t max_rows, - const galois::DynamicBitSet& bs) { - assert(*(graph_.begin_owned()) == 0); - size_t start_node = *(graph_.end_owned()); - size_t end_node = graph_.active_size(); - - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - - size_t row_index = layer_dimensions_.output_columns; - if (start_node > max_rows) { - start_node = max_rows; - } - if (end_node > max_rows) { - end_node = max_rows; - } - assert(start_node * row_index <= gradient->size()); - assert(end_node * row_index <= gradient->size()); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - base_gpu_object_.MaskNonMastersGPU(gradient, start_node, end_node, - row_index); - } else { -#endif - // galois::gInfo(start_node, " to ", end_node); - galois::do_all( - galois::iterate(start_node, end_node), - [&](size_t non_master) { - // if something is not a master, kill it - if (!bs.test(non_master)) { - // galois::gInfo("don't keep ", non_master); - // TODO(loc) use a std function for this for max efficiency - for (size_t i = 0; i < row_index; i++) { - (*gradient)[non_master * row_index + i] = 0; - } - } - }, - galois::loopname("MaskGradientNonMasters")); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} diff --git a/libgnn/src/layers/GraphConvolutionalLayer.cpp b/libgnn/src/layers/GraphConvolutionalLayer.cpp index b9a9c2120c..e69de29bb2 100644 --- a/libgnn/src/layers/GraphConvolutionalLayer.cpp +++ b/libgnn/src/layers/GraphConvolutionalLayer.cpp @@ -1,459 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" -#include "galois/layers/GraphConvolutionalLayer.h" - -galois::GraphConvolutionalLayer::GraphConvolutionalLayer( - size_t layer_num, const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), - input_column_intermediates_(dimensions.input_columns), - output_column_intermediates_(dimensions.output_columns) { - galois::gWarn( - "GCN layer not up to date with new subgraph/sampling changes; " - "do not use until updated to reflect changes (see GraphSAGE layer)"); - - size_t num_input_elements = - layer_dimensions_.input_rows * layer_dimensions_.input_columns; - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 1 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_input_elements); - } else { -#endif - in_temp_1_.resize(num_input_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN input temp var 2 ", num_input_elements, " (", - FloatElementsToGB(num_input_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - 
gpu_object_.AllocateInTemp2(num_input_elements); - } else { -#endif - in_temp_2_.resize(num_input_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - size_t num_output_elements = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - // xform matrix first to work with a smaller output size - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", GCN output temp var ", num_output_elements, " (", - FloatElementsToGB(num_output_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_elements); - } else { -#endif - out_temp_.resize(num_output_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - layer_type_ = galois::GNNLayerType::kGraphConvolutional; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // init pointers with size - p_in_temp_1_ = - PointerWithSize(gpu_object_.in_temp_1(), num_input_elements); - p_in_temp_2_ = - PointerWithSize(gpu_object_.in_temp_2(), num_input_elements); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), num_output_elements); - } else { -#endif - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - GALOIS_LOG_VERBOSE("Conv layer initialized"); -} - -const galois::PointerWithSize -galois::GraphConvolutionalLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { - galois::StatTimer timer("ForwardPhase", kRegionName); - timer.start(); - GALOIS_LOG_VERBOSE("Calling forward phase"); - assert(input_embeddings.size() == - (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() == - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - // pointer to input to operate on - const GNNFloat* input_data = input_embeddings.data(); - GNNFloat* agg_data; - // first, dropout - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &p_in_temp_1_); - input_data = p_in_temp_1_.data(); - agg_data = p_in_temp_2_.data(); - } else { - agg_data = p_in_temp_1_.data(); - } - - // flip aggregate/update if dimensions favor it (do less work) - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, - &input_column_intermediates_); - UpdateEmbeddings(agg_data, p_forward_output_matrix_.data()); - } else { - // update to aggregate - // FW - UpdateEmbeddings(input_data, p_out_temp_.data()); - // A(FW) - AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), - p_forward_output_matrix_.data(), - &output_column_intermediates_); - } - - if (!config_.disable_activation) { - GALOIS_LOG_VERBOSE("Doing activation"); - Activation(); - } - - assert(p_forward_output_matrix_.size() == - (layer_dimensions_.input_rows * layer_dimensions_.output_columns)); - timer.stop(); - - return p_forward_output_matrix_; -} - -galois::PointerWithSize -galois::GraphConvolutionalLayer::BackwardPhase( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::StatTimer timer("BackwardPhase", kRegionName); - 
galois::StatTimer weight_gradient_timer("BackwardPhaseWeight", kRegionName); - galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); - timer.start(); - - assert(layer_phase_ == GNNPhase::kTrain); - - // derivative of activation - if (!config_.disable_activation) { - ActivationDerivative(input_gradient); - } - - // AFW = O - galois::PointerWithSize input_data; - galois::PointerWithSize agg_data; - if (!config_.disable_dropout) { - // dropout result is currently stored in temp 1 - // needs to be used before it gets overwritten - input_data = p_in_temp_1_; - agg_data = p_in_temp_2_; - } else { - // no dropout = use vanilla input - input_data = prev_layer_input; - agg_data = p_in_temp_1_; - } - - // NOTE: PREV LAYER INPUT AND BACKWARDOUTPUT ARE THE SAME MEMORY LOCATION; - // BEWARE OF DEPENDENCIES - - // derivative of aggregation/update - // TODO clean up logic here to reduce nesting - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggdata can == p_intemp1; in other words, need to use before overwrite - // mask it, then use it - MaskInputNonMasters(&agg_data); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, agg_data.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); - } else { -#endif - weight_gradient_timer.start(); - // temp 2 holds aggregated feature vectors from forward phase - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - agg_data.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); - weight_gradient_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // gradient isn't masked here; only temp1, which has already been - // overwritten = fine - if (layer_number_ != 0) { - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() == - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - } - } else { - // TODO at this point, out_temp contains memoized FW - // can use it to get A' = O' (FW)^T - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - - // done after above because input_data = p_backward_output_matrix in some - // cases; use first before overwriting here if layer # doesn't = 0, it means - // I can mess with the input data itself instad of masking the gradients I - // can mask the input - if (layer_number_ != 0) { - MaskInputNonMasters(&input_data); - } else { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - MaskGradientNonMasters(&p_out_temp_); - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - 
layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - } else { -#endif - weight_gradient_timer.start(); - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, - layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - weight_gradient_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - if (layer_number_ != 0) { - // can now overwrite p_backward without issue; since input gradient - // is untouched if layer number isn't 0 this will be correct - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data()); - } - } - - // sync weight gradients; note aggregation sync occurs in the function call - // already - weight_gradient_sync_timer.start(); - WeightGradientSyncSum(); - weight_gradient_sync_timer.stop(); - - if (!config_.disable_dropout && layer_number_ != 0) { - DoDropoutDerivative(); - } - - timer.stop(); - return p_backward_output_matrix_; -} - -void galois::GraphConvolutionalLayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts) { - AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); -} - -void galois::GraphConvolutionalLayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts, - bool is_backward) { - std::string agg_timer_name = "Aggregate"; - if (!is_backward) { - agg_timer_name += "Forward"; - } else { - agg_timer_name += "Backward"; - } - galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - timer.start(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - size_t last_master = *(graph_.end_owned()); - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization, - config_.disable_self_aggregate, last_master); - graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); - } else { -#endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} - -void galois::GraphConvolutionalLayer::AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*) { - galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = graph_.size(); - size_t last_master = *(graph_.end_owned()); - assert(0 == *(graph_.begin_owned())); - - galois::do_all( - galois::iterate(static_cast(0), num_nodes), - [&](size_t src) { - size_t index_to_src_feature = src * column_length; - // zero out src feature first - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = 0; - } - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsSampledLayer()) { - // XXX(loc) - GALOIS_LOG_WARN( - "Edge sampling not yet implemented for GCN; only SAGE"); - // check if node is part of sampled graph; ignore after 0'ing if not - // sampled - if (!graph_.IsInSampledGraph(src)) - return; - } - } - - GNNFloat source_norm = 0.0; - if (!config_.disable_normalization) { - source_norm = graph_.GetGCNNormFactor(src); - } - - // init to self - if 
(!config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set(src); - // only aggregate self once on master - if (src < last_master) { - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = - node_embeddings[index_to_src_feature + i] * source_norm * - source_norm; - } - } - } - - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); e++) { - size_t dst = graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(src); - - if (layer_phase_ == GNNPhase::kTrain) { - if (IsSampledLayer()) { - // ignore non-sampled nodes - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(dst)) - continue; - } - } - - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm * graph_.GetGCNNormFactor(dst); - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("ConvolutionalAggregateAll")); - // aggregate sync - aggregate_all_sync_timer.start(); - graph_.AggregateSync(aggregate_output, column_length); - aggregate_all_sync_timer.stop(); -} - -void galois::GraphConvolutionalLayer::UpdateEmbeddings( - const GNNFloat* node_embeddings, GNNFloat* output) { - galois::StatTimer timer("ForwardXform", kRegionName); - timer.start(); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - layer_weights_.data(), output); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} - -void galois::GraphConvolutionalLayer::UpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { - galois::StatTimer timer("BackwardXform", kRegionName); - timer.start(); - - assert(p_layer_weights_.size() == - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_.data(), output); -#ifdef GALOIS_ENABLE_GPU - } -#endif - timer.stop(); -} diff --git a/libgnn/src/layers/L2NormLayer.cpp b/libgnn/src/layers/L2NormLayer.cpp index 0d566f0b66..e69de29bb2 100644 --- a/libgnn/src/layers/L2NormLayer.cpp +++ b/libgnn/src/layers/L2NormLayer.cpp @@ -1,124 +0,0 @@ -#include 
"galois/layers/L2NormLayer.h" -const galois::PointerWithSize -galois::L2NormLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - // TODO -#endif - GALOIS_LOG_FATAL( - "L2 layer has not been kept up to date for months; do not use"); - return ForwardPhaseCPU(input_embeddings); -} - -const galois::PointerWithSize -galois::L2NormLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - // for each row, get square root of squared sums then normalize - const size_t feature_length = layer_dimensions_.input_columns; - // TODO(loc) make sure this works in distributed setting as well - galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), - [&](const unsigned row) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) - return; - } - - if (graph_.IsValidForPhase(row, layer_phase_)) { - size_t row_offset = row * feature_length; - float running_square_sum = 0.0; - // get square sums - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - running_square_sum += std::pow(input_embeddings[row_index], 2); - } - - // make sure running sum isn't too small - running_square_sum = - (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; - - // sqrt of sums, then divide row by it - float sqrt_squares = std::pow(running_square_sum, 0.5); - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - forward_output_matrix_[row_index] = - input_embeddings[row_index] / sqrt_squares; - } - } - }, - galois::loopname("L2ForwardNormalization")); - - return forward_output_matrix_; -} - -galois::PointerWithSize galois::L2NormLayer::BackwardPhase( - PointerWithSize prev_layer_input, - PointerWithSize* input_gradient) { -#ifdef GALOIS_ENABLE_GPU - // TODO -#endif - return BackwardPhaseCPU(prev_layer_input, input_gradient); -} - -galois::PointerWithSize galois::L2NormLayer::BackwardPhaseCPU( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - const size_t feature_length = layer_dimensions_.input_columns; - - // derivative of some x_1 is sum of gradient w.r.t. 
x_1 for all elements of - // the row (since l2 norm affects entire row) - // The math itself can be derived using quotient/chain rule on each element - // of the normalized row - galois::do_all( - galois::iterate(graph_.begin_owned(), graph_.end_owned()), - [&](const unsigned row) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && !graph_.IsInSampledGraph(row)) - return; - } - - if (graph_.IsValidForPhase(row, layer_phase_)) { - size_t row_offset = row * feature_length; - // note: if you work this out on paper it turns out that terms that - // seem extra in the way this is calculated below simply get canceled - // out, so this ends up working out This implementation is taken from - // the IPDPS GraphSAINT implementation: I (loc) have confirmed the - // math checks out - float running_square_sum = 0.0; - float mult_with_input = 0.0; - - // get square sums - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - running_square_sum += std::pow(prev_layer_input[row_index], 2); - // gradient multiplied with corresponding input; subtraction because - // derivative math ends up working out that way - mult_with_input -= - prev_layer_input[row_index] * (*input_gradient)[row_index]; - } - running_square_sum = - (running_square_sum < 1.0e-12) ? 10e-12 : running_square_sum; - assert(running_square_sum != 0.0); - - // denominator for all gradients is just the square sum to the -3/2'd - // power since this is -, all we have to do is multiply it later - // rather than divide - float denominator = std::pow(running_square_sum, -1.5); - assert(denominator != 0.0); - - for (size_t row_index = row_offset; - row_index < (row_offset + feature_length); row_index++) { - p_backward_output_matrix_[row_index] = - denominator * - (prev_layer_input[row_index] * mult_with_input + - (*input_gradient)[row_index] * running_square_sum); - } - } - }, - galois::loopname("L2Backward")); - - return p_backward_output_matrix_; -} diff --git a/libgnn/src/layers/SAGELayer.cpp b/libgnn/src/layers/SAGELayer.cpp index 032478745d..99d0ffc5f0 100644 --- a/libgnn/src/layers/SAGELayer.cpp +++ b/libgnn/src/layers/SAGELayer.cpp @@ -1,880 +1 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" #include "galois/layers/SAGELayer.h" - -galois::SAGELayer::SAGELayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, - PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config, - const SAGELayerConfig& sage_config) - : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, config), - sage_config_(sage_config), - input_column_intermediates_(dimensions.input_columns), - output_column_intermediates_(dimensions.output_columns) { - if (!sage_config_.disable_concat) { - // there are now 2 weight matrices used: one for self, one for aggregation - // abstractly it's one matrix: W = W1 | W2 - size_t num_weight_elements = - layer_dimensions_.input_columns * layer_dimensions_.output_columns; - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE second layer weights ", num_weight_elements, " (", - FloatElementsToGB(num_weight_elements), " GB)"); - // TODO(lhc) for now, allocate dummy cpu weight2 for copying to GPU - layer_weights_2_.resize(num_weight_elements); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateWeight2(num_weight_elements); - } -#endif - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE second 
layer gradients ", num_weight_elements, " (", - FloatElementsToGB(num_weight_elements), " GB)"); - layer_weight_gradients_2_.resize(num_weight_elements, 0); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateWeightGradient2(num_weight_elements); - } -#endif - - // reinit both weight matrices as one unit - PairGlorotBengioInit(&layer_weights_, &layer_weights_2_); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // copy weight2 to GPU - gpu_object_.CopyToWeights2(layer_weights_2_); - p_layer_weights_2_ = PointerWithSize( - gpu_object_.layer_weights_2(), num_weight_elements); - p_layer_weight_gradients_2_ = PointerWithSize( - gpu_object_.layer_weight_gradients_2(), num_weight_elements); - } else { -#endif - // update the pointers to them as well as realloc will require it - p_layer_weights_2_ = PointerWithSize(layer_weights_2_); - p_layer_weight_gradients_2_ = - PointerWithSize(layer_weight_gradients_2_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - std::vector weight_size = {num_weight_elements}; - // initialize the optimizer - second_weight_optimizer_ = std::make_unique(weight_size, 1); - } - - // TODO(loc) dropout uses input rows; this won't work if dropout is enabled - size_t num_in_temp_elements = - layer_dimensions_.output_rows * layer_dimensions_.input_columns; - - // if (layer_number_ == 0) { - // // set this to true for layer 0; it avoids aggregation completely - // // in the last layer for the backward phase - // config_.disable_aggregate_after_update = true; - // // TODO this *will* hurt test evaluation because test eval has no - // // backward phase, so the end-to-end benefits do not exist there - // // Solution to this is to allocate all intermediate structures for both - // // cases + make sure resize handles both cases - // } - - // if in temp is smaller than out temp, or if dropout exists - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 1 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_in_temp_elements); - } else { -#endif - in_temp_1_.resize(num_in_temp_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE input temp var 2 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_in_temp_elements); - } else { -#endif - in_temp_2_.resize(num_in_temp_elements, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - size_t num_out_temp = - layer_dimensions_.input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - galois::gInfo(graph_.host_prefix(), "Creating layer ", layer_number_, - ", SAGE output temp var ", num_out_temp, " (", - 
FloatElementsToGB(num_out_temp), " GB)"); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_out_temp); - } else { -#endif - out_temp_.resize(num_out_temp, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - layer_type_ = galois::GNNLayerType::kSAGE; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // init pointers with size - p_in_temp_1_ = PointerWithSize(gpu_object_.in_temp_1(), - num_in_temp_elements); - p_in_temp_2_ = PointerWithSize(gpu_object_.in_temp_2(), - num_in_temp_elements); - p_out_temp_ = - PointerWithSize(gpu_object_.out_temp(), num_output_elements); - } else { -#endif - p_in_temp_1_ = PointerWithSize(in_temp_1_); - p_in_temp_2_ = PointerWithSize(in_temp_2_); - p_out_temp_ = PointerWithSize(out_temp_); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - GALOIS_LOG_VERBOSE("SAGE layer initialized"); -} - -void galois::SAGELayer::ResizeIntermediates(size_t new_input_rows, - size_t new_output_rows) { - size_t num_in_temp_elements = - new_output_rows * layer_dimensions_.input_columns; - // galois::gDebug(graph_.host_prefix(), "Layer num ", layer_number_, " ", - // in_temp_1_.size(), " and ", num_in_temp_elements, " ", - // layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns); - - // if in temp is smaller than out temp, or if dropout exists - if (!config_.disable_dropout || config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (in_temp_1_.size() < num_in_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE input temp var 1 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); - size_t buffer_size = num_in_temp_elements * 0.02; -#ifdef GALOIS_ENABLE_GPU - // XXX(hochan) - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp1(num_in_temp_elements + buffer_size); - } else { -#endif - in_temp_1_.resize(num_in_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - // XXX(hochan) GPU - p_in_temp_1_ = PointerWithSize(in_temp_1_); - } - } - - // only on in dropout case + if in temp is smaller than out temp - if (!config_.disable_dropout && - (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns)) { - if (in_temp_2_.size() < num_in_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE input temp var 2 ", num_in_temp_elements, " (", - FloatElementsToGB(num_in_temp_elements), " GB)"); - size_t buffer_size = num_in_temp_elements * 0.02; -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateInTemp2(num_in_temp_elements + buffer_size); - } else { -#endif - in_temp_2_.resize(num_in_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - // XXX(hochan) GPU - p_in_temp_2_ = PointerWithSize(in_temp_2_); - } - } - - size_t num_output_temp_elements = - new_input_rows * layer_dimensions_.output_columns; - // only needed if out temp would be smaller than intemp - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - if (out_temp_.size() < num_output_temp_elements) { - galois::gInfo(graph_.host_prefix(), "Resize layer ", layer_number_, - ", SAGE output temp var ", num_output_temp_elements, " (", - FloatElementsToGB(num_output_temp_elements), " GB)"); - size_t 
buffer_size = (num_output_temp_elements * 0.02); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.AllocateOutTemp(num_output_temp_elements + buffer_size); - } else { -#endif - out_temp_.resize(num_output_temp_elements + buffer_size, 0); -#ifdef GALOIS_ENABLE_GPU - } -#endif - p_out_temp_ = PointerWithSize(out_temp_); - } - } -} - -void galois::SAGELayer::WeightGradientSyncSum2() { - galois::StatTimer clubbed_timer("Sync_BackwardSync", "Gluon"); - TimerStart(&clubbed_timer); - galois::StatTimer t("Sync_WeightGradientsSum2", kRegionName); - TimerStart(&t); - int weight_size = static_cast(p_layer_weight_gradients_2_.size()); - -#ifdef GALOIS_ENABLE_GPU - bool gpu_direct_enabled = false; - if (device_personality == DevicePersonality::GPU_CUDA && - !gpu_direct_enabled) { - gpu_object_.CopyWeight2GradientsToCPU(&layer_weight_gradients_2_); - MPI_Allreduce(MPI_IN_PLACE, - static_cast(layer_weight_gradients_2_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); - gpu_object_.CopyToWeight2Gradients(layer_weight_gradients_2_); - } else { -#endif - // TODO(loc) remove this limitation later; can just do a loop over the - // weight matrix - if (p_layer_weight_gradients_2_.size() > - size_t{std::numeric_limits::max()}) { - GALOIS_LOG_FATAL("Weight sync code does not handle size larger than max " - "int at the moment"); - } - MPI_Allreduce(MPI_IN_PLACE, - static_cast(p_layer_weight_gradients_2_.data()), - weight_size, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&t); - TimerStop(&clubbed_timer); -} - -const galois::PointerWithSize galois::SAGELayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { - // galois::gDebug( - // "Layer ", layer_number_, " dims: ", layer_dimensions_.input_rows, " ", - // layer_dimensions_.output_rows, " ", layer_dimensions_.input_columns, " - // ", layer_dimensions_.output_columns, " ", input_embeddings.size(), " ", - // layer_dimensions_.input_rows * layer_dimensions_.input_columns); - galois::StatTimer timer("ForwardPhase", kRegionName); - TimerStart(&timer); - - assert(input_embeddings.size() >= - (layer_dimensions_.input_rows * layer_dimensions_.input_columns)); - assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - - // pointer to input to operate on - const GNNFloat* input_data = input_embeddings.data(); - GNNFloat* agg_data; - // first, dropout - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - DoDropout(input_embeddings, &p_in_temp_1_); - input_data = p_in_temp_1_.data(); - agg_data = p_in_temp_2_.data(); - } else { - agg_data = p_in_temp_1_.data(); - } - - // O = FW1 + AFW2 is what is done if concat is on: below is the AFW2 part - // which is done regardless - - // flip aggregate/update if dimensions favor it (do less work) - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - if (!config_.disable_dropout && (layer_phase_ == GNNPhase::kTrain)) { - assert(p_in_temp_2_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.input_columns); - } else { - assert(p_in_temp_1_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.input_columns); - } - - // aggregation and update - AggregateAll(layer_dimensions_.input_columns, input_data, agg_data, - &input_column_intermediates_); - assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - 
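// Illustrative sketch (not from the patch itself): the host-side gradient sum
// that WeightGradientSyncSum2 performs. Every host contributes its local
// second-weight-matrix gradient and receives the element-wise sum in place, so
// all hosts apply the same update. Assumes MPI was initialized elsewhere; the
// count must fit in an int, which is why the real code fatals on larger sizes.
#include <mpi.h>
#include <vector>

void AllReduceGradientsSketch(std::vector<float>& gradients) {
  MPI_Allreduce(MPI_IN_PLACE, gradients.data(),
                static_cast<int>(gradients.size()), MPI_FLOAT, MPI_SUM,
                MPI_COMM_WORLD);
}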
UpdateEmbeddings(agg_data, p_forward_output_matrix_.data(), true); - } else { - assert(p_out_temp_.size() >= - layer_dimensions_.input_rows * layer_dimensions_.output_columns); - - // update to aggregate - // FW - UpdateEmbeddings(input_data, p_out_temp_.data(), false); - - // A(FW) - assert(p_forward_output_matrix_.size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - AggregateAll(layer_dimensions_.output_columns, p_out_temp_.data(), - p_forward_output_matrix_.data(), - &output_column_intermediates_); - } - - if (!sage_config_.disable_concat) { - // FW1 is unaffected by the agg/update flip, so can to it - // separately - SelfFeatureUpdateEmbeddings(input_data, p_forward_output_matrix_.data()); - } - - if (!config_.disable_activation) { - GALOIS_LOG_VERBOSE("Doing activation"); - Activation(); - } - - assert(p_forward_output_matrix_.size() >= - (layer_dimensions_.output_rows * layer_dimensions_.output_columns)); - - TimerStop(&timer); - - return p_forward_output_matrix_; -} - -galois::PointerWithSize galois::SAGELayer::BackwardPhase( - galois::PointerWithSize prev_layer_input, - galois::PointerWithSize* input_gradient) { - galois::StatTimer timer("BackwardPhase", kRegionName); - galois::StatTimer weight_gradient_sync_timer("BackwardPhaseWeightSync", kRegionName); - galois::StatTimer weight_gradient_sync_timer2("BackwardPhaseWeight2Sync", kRegionName); - TimerStart(&timer); - - assert(layer_phase_ == GNNPhase::kTrain || layer_phase_ == GNNPhase::kBatch); - - // derivative of activation - if (!config_.disable_activation) { - ActivationDerivative(input_gradient); - } - - // if dropout was used, use the dropout matrix for the input - galois::PointerWithSize input_data; - galois::PointerWithSize agg_data; - if (!config_.disable_dropout) { - // dropout result is currently stored in temp 1 - // needs to be used before it gets overwritten - input_data = p_in_temp_1_; - agg_data = p_in_temp_2_; - } else { - // no dropout = use vanilla input - input_data = prev_layer_input; - agg_data = p_in_temp_1_; - } - - // aggregate this here before gradient starts to get overwritten - // this is xform ffirst - if (!config_.disable_aggregate_after_update && - layer_dimensions_.input_columns > layer_dimensions_.output_columns) { - // aggregate occurs regardless of layer being equal to 0 because it is - // required in this case for the weight gradient calculation - // this is (FW)' - // TODO: this is absolutely terrible performance wise as well; keep - // in mind - AggregateAll(layer_dimensions_.output_columns, input_gradient->data(), - p_out_temp_.data(), &output_column_intermediates_, true); - } - - if (!sage_config_.disable_concat) { - if (layer_number_ != 0) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); - } - } else { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - if (graph_.IsSubgraphOn()) { - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskGradientNonMasters(input_gradient, layer_dimensions_.output_rows); - } - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateWeight2DerivativeGPU( - layer_dimensions_.input_columns, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - 
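// Illustrative sketch (not from the patch itself): the aggregate/update "flip"
// the forward phase applies. The layer output is A * F * W (A = normalized
// adjacency, F = features, W = weights); aggregation costs O(|E| * width), so
// it is done at whichever feature width is smaller. The real layer can also
// force one order via disable_aggregate_after_update. The callables stand in
// for AggregateAll and UpdateEmbeddings.
#include <cstddef>
#include <functional>

void ForwardOrderSketch(
    size_t input_columns, size_t output_columns, const float* features,
    float* scratch, float* output,
    const std::function<void(size_t, const float*, float*)>& aggregate,
    const std::function<void(const float*, float*)>& update) {
  if (input_columns <= output_columns) {
    aggregate(input_columns, features, scratch); // A * F   (narrow side first)
    update(scratch, output);                     // (A F) W
  } else {
    update(features, scratch);                   // F W     (shrink width first)
    aggregate(output_columns, scratch, output);  // A (F W)
  }
}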
input_gradient->data(), p_layer_weight_gradients_2_.data()); - } else { -#endif - // input data (prev layer input or temp1) or gradient need mask - // can mask gradient if layer == 0 - // otherwise must mask other - - galois::StatTimer concat_grad_timer("ConcatGradMultiply", kRegionName); - TimerStart(&concat_grad_timer); - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.output_rows, layer_dimensions_.output_columns, - input_data.data(), input_gradient->data(), - p_layer_weight_gradients_2_.data()); - TimerStop(&concat_grad_timer); - -#ifdef GALOIS_ENABLE_GPU - } -#endif - } - - weight_gradient_sync_timer2.start(); - WeightGradientSyncSum2(); - weight_gradient_sync_timer2.stop(); - - // derivative of aggregation/update - // TODO clean up logic here to reduce nesting - if (config_.disable_aggregate_after_update || - layer_dimensions_.input_columns <= layer_dimensions_.output_columns) { - // aggdata can == p_intemp1; in other words, need to use before overwrite - // mask it, then use it - // XXX masking may not be required in sampling case where rows change - if (layer_number_ != 0 || sage_config_.disable_concat) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&agg_data, layer_dimensions_.output_rows); - } - } - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - // XXX output rows - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, agg_data.data(), - input_gradient->data(), p_layer_weight_gradients_.data()); - } else { -#endif - // agg data holds aggregated feature vectors from forward phase - galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); - TimerStart(&normal_grad_timer); - galois::CBlasSGEMM( - CblasTrans, CblasNoTrans, layer_dimensions_.input_columns, - layer_dimensions_.output_rows, layer_dimensions_.output_columns, - agg_data.data(), input_gradient->data(), - p_layer_weight_gradients_.data()); - TimerStop(&normal_grad_timer); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // 0 means input gradient shouldn't get masked - if (layer_number_ != 0) { - // NOTE: this is super nice because it avoids aggregation completely - // in the layer 0 setting - // ---unmasked--- - // transposed sgemm for derivative; in_temp is output - assert(input_gradient->size() >= - layer_dimensions_.output_rows * layer_dimensions_.output_columns); - // pintemp1 contains (AF)' - // overwrites the dropout matrix that was in ptemp1 (needed for second - // weight matrix) - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data(), - true); - - // pback contains F' - // derivative of aggregate is the same due to symmetric graph - AggregateAll(layer_dimensions_.input_columns, p_in_temp_1_.data(), - p_backward_output_matrix_.data(), - &input_column_intermediates_, true); - } - } else { - // xform first - - // --unmasked-- - - // disable concat is part of condition because otherwise this mask - // should have gotten done elsewhere - if (layer_number_ != 0 && sage_config_.disable_concat) { - if (graph_.IsSubgraphOn()) { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskInputNonMasters(&input_data, layer_dimensions_.input_rows); - } - } - - // layer number 0 means output needs to be masked because input cannot - // be masked - if 
(layer_number_ == 0) { - // if 0 then no input to mask: mask the gradient - // this is fine because gradient won't be used to get feature gradients - if (graph_.IsSubgraphOn()) { - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows, - graph_.GetNonLayerZeroMasters()); - } else { - MaskGradientNonMasters(&p_out_temp_, layer_dimensions_.input_rows); - } - } - - // W' = F^T (FW)' - // TODO put this in a function -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.GetWeightGradientsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - } else { -#endif - // input col x input row * input row x output col - galois::StatTimer normal_grad_timer("NormalGradMultiply", kRegionName); - TimerStart(&normal_grad_timer); - galois::CBlasSGEMM(CblasTrans, CblasNoTrans, - layer_dimensions_.input_columns, - layer_dimensions_.input_rows, - layer_dimensions_.output_columns, input_data.data(), - p_out_temp_.data(), p_layer_weight_gradients_.data()); - TimerStop(&normal_grad_timer); -#ifdef GALOIS_ENABLE_GPU - } -#endif - - // to get a correct result out temp mask cannot be masked; - // outtemp will only be masked if layer number is 0, so this - // is safe in all other cases - if (layer_number_ != 0) { - // derivative for update - // backout = F' - UpdateEmbeddingsDerivative(p_out_temp_.data(), - p_backward_output_matrix_.data(), false); - } - } - - weight_gradient_sync_timer.start(); - WeightGradientSyncSum(); - weight_gradient_sync_timer.stop(); - - // full gradient needed here; should occur after all updates - if (layer_number_ != 0) { - // deal with feature gradients for the self feature here - // this function will sum directly into the backward matrix - // input gradient never gets masked if layer number != 0 - SelfFeatureUpdateEmbeddingsDerivative(input_gradient->data(), - p_backward_output_matrix_.data()); - } - - if (!config_.disable_dropout && layer_number_ != 0) { - DoDropoutDerivative(); - } - - TimerStop(&timer); - return p_backward_output_matrix_; -} - -void galois::SAGELayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts) { - AggregateAll(column_length, node_embeddings, aggregate_output, pts, false); -} - -void galois::SAGELayer::AggregateAll( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - [[maybe_unused]] galois::substrate::PerThreadStorage>* - pts, - bool is_backward) { - std::string agg_timer_name = "AggregateCompute"; - std::string agg_sync_timer_name = "AggregateSync"; - size_t num_rows_to_handle; - if (!is_backward) { - agg_timer_name += "Forward"; - agg_sync_timer_name += "Forward"; - num_rows_to_handle = layer_dimensions_.output_rows; - } else { - agg_timer_name += "Backward"; - agg_sync_timer_name += "Backward"; - num_rows_to_handle = layer_dimensions_.input_rows; - } - galois::StatTimer timer(agg_timer_name.c_str(), kRegionName); - galois::StatTimer aggregate_all_sync_timer(agg_sync_timer_name.c_str(), kRegionName); - TimerStart(&timer); - -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - if (!IsSampledLayer()) { - gpu_object_.AggregateAllGPU( - graph_.GetGPUGraph(), graph_.size(), column_length, node_embeddings, - aggregate_output, !config_.disable_normalization, is_backward); - } else { - // TODO(hochan) - 
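// Illustrative sketch (not from the patch itself): the weight-gradient product
// the backward phase computes with a transposed SGEMM, W_grad = F^T * G, where
// F is the (rows x input_columns) input or aggregated features and G is the
// (rows x output_columns) incoming gradient. Plain row-major CBLAS is used
// here instead of the galois::CBlasSGEMM wrapper; assumes an OpenBLAS-style
// cblas.h.
#include <cblas.h>

void WeightGradientSketch(int rows, int input_columns, int output_columns,
                          const float* features, const float* grad,
                          float* weight_grad) {
  // C(input_columns x output_columns) = A^T(input_columns x rows) * B(rows x output_columns)
  cblas_sgemm(CblasRowMajor, CblasTrans, CblasNoTrans,
              /*M=*/input_columns, /*N=*/output_columns, /*K=*/rows,
              1.0f, features, input_columns, grad, output_columns,
              0.0f, weight_grad, output_columns);
}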
GALOIS_LOG_FATAL("SAMPLING IMPLEMENTATION"); - } - graph_.AggregateSyncGPU(aggregate_output, column_length, layer_number_); - } else { -#endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, - is_backward); - TimerStop(&timer); - - // aggregate sync - aggregate_all_sync_timer.start(); - graph_.AggregateSync(aggregate_output, column_length, is_backward, - num_rows_to_handle); - aggregate_all_sync_timer.stop(); -#ifdef GALOIS_ENABLE_GPU - } -#endif -} - -void galois::SAGELayer::AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*, - bool is_backward) { - // aggregation causes a row count change - size_t num_rows_to_handle; - if (!is_backward) { - num_rows_to_handle = layer_dimensions_.output_rows; - } else { - num_rows_to_handle = layer_dimensions_.input_rows; - } - - galois::do_all( - galois::iterate(*(graph_.begin()), num_rows_to_handle), - [&](size_t src) { - size_t index_to_src_feature = src * column_length; - // zero out src feature first - for (size_t i = 0; i < column_length; i++) { - aggregate_output[index_to_src_feature + i] = 0; - } - - GNNFloat source_norm = 0.0; - if (!config_.disable_normalization) { - source_norm = graph_.GetDegreeNorm(src, graph_user_layer_number_); - } - - if (!is_backward) { - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.edge_begin(src); e != graph_.edge_end(src); - e++) { - if (layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) { - // XXX - // galois::gDebug("In here"); - if (IsSampledLayer()) { - if (!graph_.IsEdgeSampled(e, graph_user_layer_number_)) { - continue; - } - } - } - size_t dst = graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = source_norm; - assert(norm_scale != 0); - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - } else { - // loop through all destinations to grab the feature to aggregate - for (auto e = graph_.in_edge_begin(src); e != graph_.in_edge_end(src); - e++) { - if (layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) { - // XXX - if (IsSampledLayer()) { - if (!graph_.IsInEdgeSampled(e, graph_user_layer_number_)) { - continue; - } - } - } - size_t dst = graph_.GetInEdgeDest(e); - graphs::bitset_graph_aggregate.set(graph_.ConvertToLID(src)); - - // input row x output row in backward means that i shouldn't be - // touching nodes past output rows; the above sample check - // should deal with this where this matters - assert(dst < layer_dimensions_.output_rows); - - size_t index_to_dst_feature = dst * column_length; - - if (!config_.disable_normalization) { - GNNFloat norm_scale = - graph_.GetDegreeNorm(dst, graph_user_layer_number_); - - assert(norm_scale != 0); - - galois::VectorMulAdd( - column_length, &aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], norm_scale, - &aggregate_output[index_to_src_feature]); - } else { - // add dst feature to aggregate output - galois::VectorAdd(column_length, - 
&aggregate_output[index_to_src_feature], - &node_embeddings[index_to_dst_feature], - &aggregate_output[index_to_src_feature]); - } - } - } - }, - galois::chunk_size<1>(), galois::steal(), - galois::loopname("SAGEAggregateAll")); -} - -void galois::SAGELayer::UpdateEmbeddings(const GNNFloat* node_embeddings, - GNNFloat* output, bool after) { - galois::StatTimer timer("ForwardXForm", kRegionName); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - // TODO self change - // XXX(hochan) output rows - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // galois::gDebug("Layer ", graph_user_layer_number_, " ", - // layer_dimensions_.output_rows, " ", - // layer_dimensions_.input_columns, " ", - // layer_dimensions_.output_columns); - // CPU version is just a call into CBlas - if (after) { - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, p_layer_weights_.data(), output); - } else { - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.input_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, p_layer_weights_.data(), output); - } -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::SelfFeatureUpdateEmbeddings( - const GNNFloat* node_embeddings, GNNFloat* output) { - galois::StatTimer timer("SelfForwardXForm", kRegionName); - TimerStart(&timer); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.SelfFeatureUpdateEmbeddingsGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, node_embeddings, output); - } else { -#endif - // note use of layer weights 2 differentiates this from above - galois::CBlasSGEMM( - CblasNoTrans, CblasNoTrans, layer_dimensions_.output_rows, - layer_dimensions_.input_columns, layer_dimensions_.output_columns, - node_embeddings, layer_weights_2_.data(), output, true); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::UpdateEmbeddingsDerivative(const GNNFloat* gradients, - GNNFloat* output, - bool after) { - galois::StatTimer timer("BackwardXForm", kRegionName); - TimerStart(&timer); - - assert(p_layer_weights_.size() >= - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.UpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.input_columns, - layer_dimensions_.output_columns, gradients, - base_gpu_object_.layer_weights(), output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // note input rows is used here due to transpose of aggregation - if (after) { - galois::CBlasSGEMM( - CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, - layer_dimensions_.output_columns, layer_dimensions_.input_columns, - gradients, p_layer_weights_.data(), output); - } else { - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.input_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - p_layer_weights_.data(), output); - } -#ifdef GALOIS_ENABLE_GPU - } 
-#endif - TimerStop(&timer); -} - -void galois::SAGELayer::SelfFeatureUpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { - galois::StatTimer timer("SelfBackwardXForm", kRegionName); - TimerStart(&timer); - - assert(p_layer_weights_.size() >= - layer_dimensions_.input_columns * layer_dimensions_.output_columns); -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.SelfFeatureUpdateEmbeddingsDerivativeGPU( - layer_dimensions_.input_rows, layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, output); - } else { -#endif - // difference is Trans for B matrix (data) to get z by y (weights is y by z - // normally); result is x by y - // true at end -> accumulate - galois::CBlasSGEMM(CblasNoTrans, CblasTrans, layer_dimensions_.output_rows, - layer_dimensions_.output_columns, - layer_dimensions_.input_columns, gradients, - layer_weights_2_.data(), output, true); -#ifdef GALOIS_ENABLE_GPU - } -#endif - TimerStop(&timer); -} - -void galois::SAGELayer::OptimizeLayer(BaseOptimizer* optimizer, - size_t trainable_layer_number) { - galois::StatTimer total_gradient_timer("GradientDescent", kRegionName); - total_gradient_timer.start(); - optimizer->GradientDescent(p_layer_weight_gradients_, p_layer_weights_, - trainable_layer_number); - if (!sage_config_.disable_concat) { - second_weight_optimizer_->GradientDescent(p_layer_weight_gradients_2_, - p_layer_weights_2_, 0); - } - total_gradient_timer.stop(); -} diff --git a/libgnn/src/layers/SigmoidLayer.cpp b/libgnn/src/layers/SigmoidLayer.cpp index 595fd5c023..372751f052 100644 --- a/libgnn/src/layers/SigmoidLayer.cpp +++ b/libgnn/src/layers/SigmoidLayer.cpp @@ -1,114 +1 @@ #include "galois/layers/SigmoidLayer.h" -#include "galois/GNNMath.h" -#include - -// TODO(loc) GPU support - -const galois::PointerWithSize -galois::SigmoidLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - galois::gWarn( - "Sigmoid layer has not been kept up to date; do not use unless sure" - " it works with new changes"); - - input_loss_.assign(input_loss_.size(), 0.0); - forward_output_matrix_.assign(forward_output_matrix_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; - node_count_.reset(); - float_accumulator_.reset(); - - galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned local_node) { - if (graph_.IsValidForPhase(local_node, layer_phase_)) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(local_node)) - return; - } - - node_count_ += 1; - - size_t node_offset = feature_length * local_node; - // sigmoid the values for this node - for (unsigned index = 0; index < feature_length; index++) { - // splitting in half is done for numerical stability of log - if (input_embeddings[node_offset + index] >= 0) { - forward_output_matrix_[node_offset + index] = - 1.0 / (1.0 + expf(-input_embeddings[node_offset + index])); - } else { - forward_output_matrix_[node_offset + index] = - expf(input_embeddings[node_offset + index]) / - (1.0 + expf(input_embeddings[node_offset + index])); - } - } - - input_loss_[local_node] = GNNCrossEntropy( - feature_length, graph_.GetMultiClassLabel(local_node), - &forward_output_matrix_[node_offset]); - // TODO(loc) normalize the loss - float_accumulator_ += input_loss_[local_node]; - } - }, - galois::steal(), galois::loopname("SigmoidForward")); - - galois::gPrint("Average loss is ", - float_accumulator_.reduce() / 
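// Illustrative sketch (not from the patch itself): the two-branch sigmoid the
// sigmoid forward pass uses for numerical stability. Evaluating expf only on a
// non-positive argument keeps it from overflowing for large |x|; both branches
// are the same function algebraically.
#include <cmath>

inline float StableSigmoid(float x) {
  if (x >= 0.0f)
    return 1.0f / (1.0f + std::exp(-x)); // exp(-x) <= 1 here, no overflow
  const float e = std::exp(x);           // x < 0, so e <= 1
  return e / (1.0f + e);
}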
node_count_.reduce(), "\n"); - return forward_output_matrix_; -} - -const galois::PointerWithSize -galois::SigmoidLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - // TODO(loc) when GPU needs it - printf("%p\n", input_embeddings.data()); - return p_layer_weights_; -#else - return ForwardPhaseCPU(input_embeddings); -#endif -} - -galois::PointerWithSize -galois::SigmoidLayer::BackwardPhaseCPU() { - const size_t feature_length = layer_dimensions_.input_columns; - galois::do_all(galois::iterate(size_t{0}, p_backward_output_matrix_.size()), - [&](size_t i) { p_backward_output_matrix_[i] = 0; }); - - galois::do_all( - galois::iterate(graph_.begin(), graph_.end()), - [&](const unsigned local_node) { - if (graph_.IsValidForPhase(local_node, layer_phase_)) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraph(local_node)) - return; - } - - // derivative cross entropy into norm grad - const GNNLabel* ground_truth = graph_.GetMultiClassLabel(local_node); - size_t node_offset = feature_length * local_node; - // sigmoid-cross-entropy derivative: turns out all it is is simple - // subtraction - for (unsigned index = 0; index < feature_length; index++) { - p_backward_output_matrix_[node_offset + index] = - forward_output_matrix_[node_offset + index] - - ground_truth[index]; - } - } - }, - galois::steal(), galois::loopname("SigmoidBackward")); - - return p_backward_output_matrix_; -} - -galois::PointerWithSize -galois::SigmoidLayer::BackwardPhase(PointerWithSize, - PointerWithSize*) { -#ifdef GALOIS_ENABLE_GPU - // TODO(loc) when GPU needs it - return p_layer_weights_; -#else - return BackwardPhaseCPU(); -#endif -} diff --git a/libgnn/src/layers/SoftmaxLayer.cpp b/libgnn/src/layers/SoftmaxLayer.cpp deleted file mode 100644 index aebbb3dd9b..0000000000 --- a/libgnn/src/layers/SoftmaxLayer.cpp +++ /dev/null @@ -1,139 +0,0 @@ -#include "galois/Logging.h" -#include "galois/GNNMath.h" -#include "galois/layers/SoftmaxLayer.h" - -const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhaseCPU( - const galois::PointerWithSize input_embeddings) { - galois::StatTimer timer("SoftmaxForward", "SoftmaxLayer"); - TimerStart(&timer); - - // note: p_backward == input_embeddings - input_loss_.assign(input_loss_.size(), 0.0); - const size_t feature_length = layer_dimensions_.input_columns; -#ifndef NDEBUG - galois::DGAccumulator loss_accum; - galois::DGAccumulator handled; - loss_accum.reset(); - handled.reset(); -#endif - - galois::do_all( - galois::iterate(size_t{0}, layer_dimensions_.input_rows), - [&](const unsigned i) { - if (IsSampledLayer()) { - if ((layer_phase_ == GNNPhase::kTrain || - layer_phase_ == GNNPhase::kBatch) && - !graph_.IsInSampledGraphSubgraph(i)) { - // XXX - VectorZero(feature_length, - &p_backward_output_matrix_[i * feature_length]); - return; - } - } - - // do softmax - GNNSoftmax(feature_length, &input_embeddings[feature_length * i], - &p_backward_output_matrix_[feature_length * i]); - // create ground truth vector for this LID - std::vector* ground_truth_vec = - ground_truth_vectors_.getLocal(); - assert(ground_truth_vec->size() == feature_length); - ground_truth_vec->assign(ground_truth_vec->size(), 0.0); - // single class label is an index; set the correct one - (*ground_truth_vec)[static_cast( - graph_.GetSingleClassLabel(i))] = 1.0; - - // calculate loss for this LID (note not all i will be filled) - input_loss_[i] = - GNNCrossEntropy(feature_length, ground_truth_vec->data(), - 
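// Illustrative sketch (not from the patch itself): a standard numerically
// stable softmax of the kind GNNSoftmax computes per node before the
// cross-entropy loss; the actual GNNSoftmax implementation may differ in
// details. Subtracting the row maximum before exponentiating avoids overflow,
// and the result sums to 1.
#include <algorithm>
#include <cmath>
#include <cstddef>

void SoftmaxSketch(size_t n, const float* logits, float* out) {
  const float max_val = *std::max_element(logits, logits + n);
  float sum = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    out[i] = std::exp(logits[i] - max_val);
    sum += out[i];
  }
  for (size_t i = 0; i < n; ++i)
    out[i] /= sum;
}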
&p_backward_output_matrix_[feature_length * i]); -#ifndef NDEBUG - loss_accum += input_loss_[i]; - handled += 1; -#endif - }, - // TODO chunk size? - // steal on as some threads may have nothing to work on - // galois::steal(), galois::loopname("SoftmaxForward")); - galois::steal()); -#ifndef NDEBUG - GNNFloat reduced_loss = loss_accum.reduce(); - size_t t = handled.reduce(); - galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); -#endif - - TimerStop(&timer); - return p_backward_output_matrix_; -} - -const galois::PointerWithSize -galois::SoftmaxLayer::ForwardPhase( - const galois::PointerWithSize input_embeddings) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.ForwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - input_embeddings.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; - } -#endif - return ForwardPhaseCPU(input_embeddings); -} - -galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhaseCPU() { - galois::StatTimer timer("SoftmaxBackward", "SoftmaxLayer"); - TimerStart(&timer); - - const size_t feature_length = layer_dimensions_.input_columns; - - galois::do_all( - galois::iterate(size_t{0}, layer_dimensions_.input_rows), - [&](const unsigned node) { - if (IsSampledLayer()) { - if (layer_phase_ == GNNPhase::kTrain && - !graph_.IsInSampledGraphSubgraph(node)) - return; - } - - size_t correct = graph_.GetSingleClassLabel(node); - // See here for explanation for why this works - // https://gombru.github.io/2018/05/23/cross_entropy_loss/ - // Derivation of full combined derivative isn't there, but some - // emperical inspection tells me this is likely correct - // TODO(loc) work it out myself - for (size_t idx = 0; idx < feature_length; idx++) { - if (idx == correct) { - // positive class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx] - 1; - } else { - // negative class - p_backward_output_matrix_[node * feature_length + idx] = - p_backward_output_matrix_[node * feature_length + idx]; - } - } - }, - galois::steal(), galois::loopname("SoftmaxBackward")); - - TimerStop(&timer); - - return p_backward_output_matrix_; -} - -galois::PointerWithSize -galois::SoftmaxLayer::BackwardPhase(PointerWithSize, - PointerWithSize*) { -#ifdef GALOIS_ENABLE_GPU - if (device_personality == DevicePersonality::GPU_CUDA) { - gpu_object_.BackwardPhaseGPU( - layer_phase_, graph_.size(), layer_dimensions_.input_columns, - p_backward_output_matrix_.data(), p_backward_output_matrix_.data()); - return p_backward_output_matrix_; - } -#endif - return BackwardPhaseCPU(); -} - -// TODO function for getting loss diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index e646259f87..00aa14bce6 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -2,6 +2,7 @@ find_package(OpenMP) add_executable(mkl_micro mkl_micro.cpp) target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) +target_link_directories(mkl_micro PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_include_directories(mkl_micro PUBLIC ${MKL_INCLUDE_DIRS} ) @@ -9,6 +10,7 @@ target_link_libraries(mkl_micro ${INTEL_LIBS}) add_executable(mkl_micro_omp mkl_micro.cpp) target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) +target_link_directories(mkl_micro_omp PUBLIC ${INTEL_COMPILER_LIBRARIES}) target_include_directories(mkl_micro_omp PUBLIC ${MKL_INCLUDE_DIRS} ) diff --git a/libgnn/test/accuracy-test.cpp 
b/libgnn/test/accuracy-test.cpp index 6d26284325..f2d34c0403 100644 --- a/libgnn/test/accuracy-test.cpp +++ b/libgnn/test/accuracy-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional}; @@ -26,7 +26,7 @@ int main() { std::vector adam_sizes = {21}; auto adam = std::make_unique(adam_sizes, 1); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); diff --git a/libgnn/test/aggregate-sync-test.cpp b/libgnn/test/aggregate-sync-test.cpp index d95931a798..549e6c7c53 100644 --- a/libgnn/test/aggregate-sync-test.cpp +++ b/libgnn/test/aggregate-sync-test.cpp @@ -9,8 +9,8 @@ int main() { GALOIS_LOG_WARN("This test should be run with multiple hosts/processes!"); } - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // print edges for sanity for (size_t node = 0; node < test_graph->size(); node++) { @@ -42,8 +42,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique( + std::unique_ptr> layer_0 = + std::make_unique>( 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -125,8 +125,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// - std::unique_ptr layer_1 = - std::make_unique( + std::unique_ptr> layer_1 = + std::make_unique>( 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -206,8 +206,8 @@ int main() { } } ////////////////////////////////////////////////////////////////////////////// - auto test_graph_2 = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph_2 = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); // print edges for sanity for (size_t node = 0; node < test_graph_2->size(); node++) { for (auto e = test_graph_2->edge_begin(node); @@ -232,7 +232,7 @@ int main() { l_config.DebugConfig(); // create the layer, no norm factor - layer_0 = std::make_unique( + layer_0 = std::make_unique>( 0, *(test_graph_2.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); @@ -300,7 +300,7 @@ int main() { std::vector back_matrix_2(test_graph_2->size() * 3); galois::PointerWithSize p_back_2(back_matrix_2); - layer_1 = std::make_unique( + layer_1 = std::make_unique>( 1, *(test_graph_2.get()), &p_back_2, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1_forward_output = diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index 480058f6ae..6229c9288c 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -11,8 +11,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - 
galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); for (size_t row = 0; row < test_graph.size(); row++) { @@ -70,8 +70,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 5902d059fa..1bec3b4b31 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); @@ -60,8 +60,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -125,8 +125,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = @@ -202,8 +202,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_2 = + std::make_unique>(1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/epoch-test.cpp b/libgnn/test/epoch-test.cpp index ed665684f1..c0b4ede716 100644 --- a/libgnn/test/epoch-test.cpp +++ b/libgnn/test/epoch-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load graph - auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph = std::make_unique>( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -34,7 +34,7 @@ int main() { 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/f1-test.cpp b/libgnn/test/f1-test.cpp index 64935bc235..363c12861b 100644 --- a/libgnn/test/f1-test.cpp +++ b/libgnn/test/f1-test.cpp @@ -8,8 +8,8 @@ int main() { galois::DistMemSys G; // load test graph; false at end = multilabel - 
galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); // perfect precision and recall std::vector prediction = { diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index 69c64105f6..da0e6bd3f9 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // 2 layer test with softmax std::vector layer_types = { @@ -28,8 +28,8 @@ int main() { std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - galois::GraphNeuralNetwork gnn(std::move(test_graph), std::move(adam), - std::move(gnn_config)); + galois::GraphNeuralNetwork + gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnnfb-test.cpp b/libgnn/test/gnnfb-test.cpp index b99c8aeb8d..eb74ffb78a 100644 --- a/libgnn/test/gnnfb-test.cpp +++ b/libgnn/test/gnnfb-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // 2 layer test with softmax std::vector layer_types = { @@ -36,7 +36,7 @@ int main() { // middle 2 are trainable so 12 and 28 std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); // for constancy set everything to 1 gnn->SetAllLayerWeightsTo1(); @@ -171,13 +171,13 @@ int main() { GALOIS_LOG_VERBOSE("Running with different congifuration"); - test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GraphNeuralNetworkConfig gnn_config2( 2, layer_types, layer_output_sizes, galois::GNNOutputLayerType::kSoftmax, dcon); auto adam2 = std::make_unique(adam_sizes, 2); - auto gnn2 = std::make_unique( + auto gnn2 = std::make_unique>( std::move(test_graph), std::move(adam2), std::move(gnn_config2)); // run to make sure no crashes occur gnn2->DoInference(); diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index 6e12b13899..e4451a4900 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -16,11 +16,11 @@ int main() { // note multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, - true); + galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, + true, false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, - true); + galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, + true, false); // below for when I want to 
check the remapper // galois::graphs::GNNGraph remapper("ogbn-papers100M", diff --git a/libgnn/test/gpu-adam-test.cpp b/libgnn/test/gpu-adam-test.cpp index 58da1d3b68..646cba3b16 100644 --- a/libgnn/test/gpu-adam-test.cpp +++ b/libgnn/test/gpu-adam-test.cpp @@ -26,8 +26,8 @@ int main() { // make this layer to get access to a gpu helper function; TODO // need a helper alloc function - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; dimension_0.input_columns = test_graph.GetNumLabelClasses(); diff --git a/libgnn/test/gpu-aggregate-sync-test.cpp b/libgnn/test/gpu-aggregate-sync-test.cpp index 3a0ee7f3d4..e8d0b9b683 100644 --- a/libgnn/test/gpu-aggregate-sync-test.cpp +++ b/libgnn/test/gpu-aggregate-sync-test.cpp @@ -16,8 +16,8 @@ int main() { gpudevice = galois::runtime::getSystemNetworkInterface().ID; SetCUDADeviceId(gpudevice); - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // create same layer from convlayer-test and make sure result is the same even // in multi-host environment @@ -42,8 +42,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique( + std::unique_ptr> layer_0 = + std::make_unique>( 0, *(test_graph.get()), &p_null, dimension_0, l_config); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -130,8 +130,8 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // layer 1 to check backward output ////////////////////////////////////////////////////////////////////////////// - std::unique_ptr layer_1 = - std::make_unique( + std::unique_ptr> layer_1 = + std::make_unique>( 1, *(test_graph.get()), &p_back, dimension_0, l_config); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph->GetLocalFeatures()); diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp index c089ffb698..2df78d694d 100644 --- a/libgnn/test/gpu-back-conv-test.cpp +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -20,8 +20,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kCVC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kCVC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = test_graph.size(); @@ -53,8 +53,8 @@ int main() { galois::PointerWithSize output_layer(output_matrix); // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->InitAllWeightsTo1(); diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index 3a822cf9c5..a36740b5e3 100644 --- a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -15,8 +15,8 @@ int main() { num_threads); device_personality = DevicePersonality::GPU_CUDA; // load test graph - galois::graphs::GNNGraph 
test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::PointerWithSize feats = test_graph.GetLocalFeatures(); @@ -52,8 +52,8 @@ int main() { dimension_0.input_columns, dimension_0.output_columns); // create the layer, no norm factor - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner @@ -112,8 +112,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -194,8 +194,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - std::unique_ptr layer_2 = - std::make_unique(2, test_graph, &p_back, + std::unique_ptr> layer_2 = + std::make_unique>(2, test_graph, &p_back, dimension_0, config); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification diff --git a/libgnn/test/gpu-epoch-test.cpp b/libgnn/test/gpu-epoch-test.cpp index 8b71b81e3f..71a227416c 100644 --- a/libgnn/test/gpu-epoch-test.cpp +++ b/libgnn/test/gpu-epoch-test.cpp @@ -14,8 +14,8 @@ int main() { device_personality = DevicePersonality::GPU_CUDA; // load graph - auto test_graph = std::make_unique( - "cora", galois::graphs::GNNPartitionScheme::kCVC, true); + auto test_graph = std::make_unique>( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -36,7 +36,7 @@ int main() { std::vector cpu_pred; cpu_pred.resize(test_graph->GetNumLabelClasses() * test_graph->size()); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp index 7cec3b9a2b..7af3808c85 100644 --- a/libgnn/test/gpu-sage-layer-test.cpp +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -21,8 +21,8 @@ int main() { dimension_0.output_columns = 2; // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); unsigned num_layers = 3; test_graph.ResizeGPULayerVector(num_layers); test_graph.InitLayerVectorMetaObjects( @@ -46,8 +46,8 @@ int main() { galois::SAGELayerConfig scon; scon.disable_concat = false; - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, dimension_0, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self @@ -121,7 +121,7 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, &p_back, + auto layer_1 = std::make_unique>(1, 
test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -217,7 +217,7 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(2, test_graph, &p_back, + auto layer_2 = std::make_unique>(2, test_graph, &p_back, dimension_0, config, scon); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& l2_fo = diff --git a/libgnn/test/gpu-softmaxlayer-test.cpp b/libgnn/test/gpu-softmaxlayer-test.cpp index 64b7c9e6f0..96875feffa 100644 --- a/libgnn/test/gpu-softmaxlayer-test.cpp +++ b/libgnn/test/gpu-softmaxlayer-test.cpp @@ -14,8 +14,8 @@ int main() { device_personality = DevicePersonality::GPU_CUDA; // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -29,7 +29,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); // input to softmax std::vector softmax_input(49, 0.0); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index ca30c99ac0..6d6b30942e 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -10,8 +10,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -38,7 +38,7 @@ int main() { std::vector back_matrix(14); galois::PointerWithSize p_back(back_matrix); - auto l2_layer = std::make_unique(2, test_graph, &p_back, + auto l2_layer = std::make_unique>(2, test_graph, &p_back, dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/multilabel-epoch-test.cpp b/libgnn/test/multilabel-epoch-test.cpp index 7626abda1d..b0a2430bd1 100644 --- a/libgnn/test/multilabel-epoch-test.cpp +++ b/libgnn/test/multilabel-epoch-test.cpp @@ -13,8 +13,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load graph - auto test_graph = std::make_unique( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + auto test_graph = std::make_unique>( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); std::vector layer_types = { galois::GNNLayerType::kGraphConvolutional, @@ -34,7 +34,7 @@ int main() { 16 * test_graph->GetNumLabelClasses()}; auto adam = std::make_unique(adam_sizes, 2); - auto gnn = std::make_unique( + auto gnn = std::make_unique>( std::move(test_graph), std::move(adam), std::move(gnn_config)); ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/test/multilabel-read.cpp b/libgnn/test/multilabel-read.cpp index 83debfa2bc..56b8b42071 100644 --- a/libgnn/test/multilabel-read.cpp +++ b/libgnn/test/multilabel-read.cpp @@ -8,8 +8,8 @@ int main() { galois::DistMemSys G; // load test graph; false at end = multilabel - galois::graphs::GNNGraph test_graph( 
- "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); const galois::GNNLabel* labels = test_graph.GetMultiClassLabel(0); unsigned i = 0; diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp index 830e147a7c..8551126d37 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -14,8 +14,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -32,8 +32,8 @@ int main() { std::vector back_matrix(21); galois::PointerWithSize p_back(back_matrix); - std::unique_ptr layer_0 = - std::make_unique(0, test_graph, &p_null, dimension_0, + std::unique_ptr> layer_0 = + std::make_unique>(0, test_graph, &p_null, dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self @@ -113,7 +113,7 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique(1, test_graph, &p_back, + auto layer_1 = std::make_unique>(1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -205,7 +205,7 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique(1, test_graph, &p_back, + auto layer_2 = std::make_unique>(1, test_graph, &p_back, dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/sample-bit-test.cpp b/libgnn/test/sample-bit-test.cpp index f603578c13..b53860d950 100644 --- a/libgnn/test/sample-bit-test.cpp +++ b/libgnn/test/sample-bit-test.cpp @@ -13,8 +13,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); - galois::graphs::GNNGraph graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); graph.InitializeSamplingData(3, false); // first, assert all edges are not sampled (should come with all 0s) diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 3540582ade..927f4b0e9f 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -17,8 +17,8 @@ int main() { galois::runtime::getSystemNetworkInterface().ID, num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); galois::GNNLayerDimensions dimension_0; dimension_0.input_rows = 7; @@ -43,8 +43,8 @@ int main() { std::vector back_matrix(21); galois::PointerWithSize p_back(back_matrix); - std::unique_ptr layer_1 = - std::make_unique(1, test_graph, &p_back, + std::unique_ptr> layer_1 = + std::make_unique>(1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); @@ -145,7 +145,7 @@ int main() { std::vector back_matrix_2(49); galois::PointerWithSize p_back_2(back_matrix_2); - auto output_layer = std::make_unique( + 
auto output_layer = std::make_unique>( 3, test_graph, &p_back_2, dimension_out); output_layer->EnableSampling(); galois::PointerWithSize prediction_distribution = @@ -186,10 +186,10 @@ int main() { ////////////////////////////////////////////////////////////////////////////// // sigmoid ////////////////////////////////////////////////////////////////////////////// - galois::graphs::GNNGraph multi_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph multi_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); - auto sigmoid_layer = std::make_unique( + auto sigmoid_layer = std::make_unique>( 3, multi_graph, &p_back_2, dimension_out); sigmoid_layer->EnableSampling(); // reuse softmax input; only thing interested in is checking for 0s diff --git a/libgnn/test/sigmoidlayer-test.cpp b/libgnn/test/sigmoidlayer-test.cpp index 0bc2cd7252..9fd861deff 100644 --- a/libgnn/test/sigmoidlayer-test.cpp +++ b/libgnn/test/sigmoidlayer-test.cpp @@ -15,8 +15,8 @@ int main() { galois::setActiveThreads(1); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, false); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, false, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -51,7 +51,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); output_layer->ForwardPhase(softmax_input); diff --git a/libgnn/test/softmaxlayer-test.cpp b/libgnn/test/softmaxlayer-test.cpp index 66c4e557bc..1ca2740729 100644 --- a/libgnn/test/softmaxlayer-test.cpp +++ b/libgnn/test/softmaxlayer-test.cpp @@ -17,8 +17,8 @@ int main() { GALOIS_LOG_VERBOSE("Num threads is {}", num_threads); // load test graph - galois::graphs::GNNGraph test_graph( - "tester", galois::graphs::GNNPartitionScheme::kOEC, true); + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); // input/output columns must be same in softmax galois::GNNLayerDimensions dimension_0; @@ -43,7 +43,7 @@ int main() { galois::PointerWithSize p_back(back_matrix); // train mode - auto output_layer = std::make_unique( + auto output_layer = std::make_unique>( 3, test_graph, &p_back, dimension_0); galois::PointerWithSize prediction_distribution = output_layer->ForwardPhase(softmax_input); diff --git a/lonestar/gnn/distributed/gcn/gcn-dist.cpp b/lonestar/gnn/distributed/gcn/gcn-dist.cpp index f33fd89c38..60e9fe75b4 100644 --- a/lonestar/gnn/distributed/gcn/gcn-dist.cpp +++ b/lonestar/gnn/distributed/gcn/gcn-dist.cpp @@ -8,8 +8,9 @@ int main(int argc, char* argv[]) { galois::StatTimer init_timer("InitializationTime"); init_timer.start(); - std::unique_ptr gnn = - InitializeGraphNeuralNetwork(); + std::unique_ptr< + galois::GraphNeuralNetwork> gnn = + InitializeGraphNeuralNetwork(); gnn->SetLayerPhases(galois::GNNPhase::kTrain); init_timer.stop(); diff --git a/lonestar/gnn/include/DistributedGraphLoader.h b/lonestar/gnn/include/DistributedGraphLoader.h index 87b12de63d..0bce4b5819 100644 --- a/lonestar/gnn/include/DistributedGraphLoader.h +++ b/lonestar/gnn/include/DistributedGraphLoader.h @@ -108,7 +108,7 @@ extern cll::opt dataset; //! partitioning scheme to use extern cll::opt partitionScheme; //! 
true if input graph file format is SHAD WMD -extern cll::opt useShad; +extern cll::opt useWMD; // @todo command line argument for read balancing across hosts @@ -136,22 +136,22 @@ std::unique_ptr> constructSymmetricGraph(std::vect case OEC: case IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case HOVC: case HIVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case CART_VCUT: case CART_VCUT_IEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case GNN_OEC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); case GNN_CVC: return cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, ""); + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, ""); default: GALOIS_DIE("Error: partition scheme specified is invalid"); return nullptr; diff --git a/lonestar/gnn/src/DistributedGraphLoader.cpp b/lonestar/gnn/src/DistributedGraphLoader.cpp index 611a7c3e50..5e1a2dbe81 100644 --- a/lonestar/gnn/src/DistributedGraphLoader.cpp +++ b/lonestar/gnn/src/DistributedGraphLoader.cpp @@ -42,7 +42,7 @@ cll::opt partitionScheme( "gnn cvc: train nodes evenly distributed")), cll::init(GNN_OEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), cll::init(false)); diff --git a/lonestar/libdistbench/include/DistBench/Input.h b/lonestar/libdistbench/include/DistBench/Input.h index 396b01a983..d7e9cb8568 100644 --- a/lonestar/libdistbench/include/DistBench/Input.h +++ b/lonestar/libdistbench/include/DistBench/Input.h @@ -100,7 +100,7 @@ extern cll::opt symmetricGraph; //! partitioning scheme to use extern cll::opt partitionScheme; //! true if input graph file format is SHAD WMD -extern cll::opt useShad; +extern cll::opt useWMD; ////! path to vertex id map for custom edge cut // extern cll::opt vertexIDMapFileName; //! 
true if you want to read graph structure from a file @@ -145,18 +145,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case OEC: case IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose, mastersFile); case HOVC: case HIVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); case CART_VCUT: case CART_VCUT_IEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); // case CEC: @@ -166,18 +166,18 @@ constructSymmetricGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: case GINGER_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad ,true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD ,true, inputFileTranspose); case FENNEL_O: case FENNEL_I: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, true, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, inputFileTranspose); default: GALOIS_DIE("partition scheme specified is invalid: ", partitionScheme); @@ -206,19 +206,19 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { auto& net = galois::runtime::getSystemNetworkInterface(); if (net.Num == 1) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("incoming edge cut requires transpose graph"); @@ -227,12 +227,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("incoming hybrid cut requires transpose graph"); @@ -241,13 +241,13 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, 
inputFileTranspose); } else { GALOIS_DIE("cvc incoming cut requires transpose graph"); @@ -260,12 +260,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -274,12 +274,12 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSR, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -288,7 +288,7 @@ constructGraph(std::vector& GALOIS_UNUSED(scaleFactor)) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, false, inputFileTranspose); default: @@ -320,7 +320,7 @@ DistGraphPtr constructGraph(std::vector&) { if (net.Num == 1) { if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { fprintf(stderr, "WARNING: Loading transpose graph through in-memory " @@ -328,7 +328,7 @@ DistGraphPtr constructGraph(std::vector&) { "graph with -graphTranspose to avoid unnecessary " "overhead.\n"); return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } } @@ -336,12 +336,12 @@ DistGraphPtr constructGraph(std::vector&) { switch (partitionScheme) { case OEC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose, mastersFile); case IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose, mastersFile); } else { GALOIS_DIE("iec requires transpose graph"); @@ -350,12 +350,12 @@ DistGraphPtr constructGraph(std::vector&) { case HOVC: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case HIVC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("hivc requires transpose graph"); @@ -364,13 +364,13 @@ DistGraphPtr constructGraph(std::vector&) { case CART_VCUT: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, 
galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case CART_VCUT_IEC: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph(inputFile, galois::CUSP_CSC, - galois::CUSP_CSC, useShad, + galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { @@ -380,12 +380,12 @@ DistGraphPtr constructGraph(std::vector&) { case GINGER_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case GINGER_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Ginger requires transpose graph"); @@ -394,12 +394,12 @@ DistGraphPtr constructGraph(std::vector&) { case FENNEL_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); case FENNEL_I: if (inputFileTranspose.size()) { return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSC, galois::CUSP_CSC, useWMD, false, inputFileTranspose); } else { GALOIS_DIE("Fennel requires transpose graph"); @@ -408,7 +408,7 @@ DistGraphPtr constructGraph(std::vector&) { case SUGAR_O: return galois::cuspPartitionGraph( - inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useShad, false, + inputFile, galois::CUSP_CSR, galois::CUSP_CSC, useWMD, false, inputFileTranspose); default: diff --git a/lonestar/libdistbench/include/DistBench/Output.h b/lonestar/libdistbench/include/DistBench/Output.h index 51df733952..e15bbc45c6 100644 --- a/lonestar/libdistbench/include/DistBench/Output.h +++ b/lonestar/libdistbench/include/DistBench/Output.h @@ -1,6 +1,7 @@ #ifndef GALOIS_DISTBENCH_OUTPUT_H #define GALOIS_DISTBENCH_OUTPUT_H +#include #include #include #include "galois/gIO.h" diff --git a/lonestar/libdistbench/src/Input.cpp b/lonestar/libdistbench/src/Input.cpp index 04321bd14e..844591506f 100644 --- a/lonestar/libdistbench/src/Input.cpp +++ b/lonestar/libdistbench/src/Input.cpp @@ -60,9 +60,9 @@ cll::opt partitionScheme( "fennel, incoming edge cut, using CuSP")), cll::init(OEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." + " Otheriwse, set false."), cll::init(false)); cll::opt readFromFile("readFromFile", diff --git a/lonestar/libgnnbench/include/GNNBench/Input.h b/lonestar/libgnnbench/include/GNNBench/Input.h index bb417a90f2..50713cae67 100644 --- a/lonestar/libgnnbench/include/GNNBench/Input.h +++ b/lonestar/libgnnbench/include/GNNBench/Input.h @@ -14,9 +14,168 @@ extern llvm::cl::opt input_directory; extern llvm::cl::opt input_name; //! 
Scheme used to partition the graph extern llvm::cl::opt partition_scheme; +extern llvm::cl::opt num_layers; +extern llvm::cl::opt layer_size; +extern llvm::cl::opt learning_rate; +extern llvm::cl::opt output_layer_type; +extern llvm::cl::opt multiclass_labels; +extern llvm::cl::opt do_graph_sampling; +extern llvm::cl::opt useWMD; +extern llvm::cl::opt use_train_subgraph; +extern llvm::cl::opt minibatch_test_interval; +extern llvm::cl::opt test_interval; +extern llvm::cl::opt val_interval; +extern llvm::cl::opt train_minibatch_size; +extern llvm::cl::opt test_minibatch_size; +extern llvm::cl::opt inductive_subgraph; const char* GNNPartitionToString(galois::graphs::GNNPartitionScheme s); +std::vector CreateLayerTypesVector(); + +template +std::vector +CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { + // set layer sizes for intermdiate and output layers + std::vector layer_sizes_vector; + + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // for (size_t i = 0; i < num_layers; i++) { + // layer_sizes_vector.emplace_back(layer_sizes[i]); + // } + // // verify user satisfies last intermediate layer needing to have same size + // // as # label classes + // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { + // galois::gWarn( + // "Size of last layer (", layer_sizes_vector.back(), + // ") is not equal to # label classes: forcefully changing it to ", + // gnn_graph->GetNumLabelClasses()); + // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); + // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); + // } + + // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == + // gnn_graph->GetNumLabelClasses()); + //} else { + // // default 16 for everything until last 2 + // for (size_t i = 0; i < num_layers - 1; i++) { + // layer_sizes_vector.emplace_back(16); + // } + // // last 2 sizes must be equivalent to # label classes; this is the last + // // intermediate layer + // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + //} + + for (size_t i = 0; i < num_layers - 1; i++) { + layer_sizes_vector.emplace_back(layer_size); + } + // last 2 sizes must be equivalent to # label classes; this is the last + // intermediate layer + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + // TODO + // for now only softmax layer which dictates the output size of the last + // intermediate layer + size of the output layer + // output layer at the moment required to be same as # label classes + layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); + + return layer_sizes_vector; +} + +galois::GNNLayerConfig CreateLayerConfig(); + +template +std::unique_ptr +CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { + std::vector opt_sizes; + + // optimizer sizes are based on intermediate layer sizes, input feats, and + // # label classes + // if (layer_sizes.size()) { + // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); + // // assumption here is that if it reached this point then layer sizes were + // // already sanity checked previously (esp. 
last layer) + // for (size_t i = 1; i < num_layers; i++) { + // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); + // } + //} else { + // // everything is size 16 until last + // if (num_layers == 1) { + // // single layer requires a bit of special handling + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * + // gnn_graph->GetNumLabelClasses()); + // } else { + // // first + // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); + // for (size_t i = 1; i < num_layers - 1; i++) { + // opt_sizes.emplace_back(16 * 16); + // } + // // last + // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); + // } + //} + + // everything is size 16 until last + if (num_layers == 1) { + // single layer requires a bit of special handling + opt_sizes.emplace_back(gnn_graph->node_feature_length() * + gnn_graph->GetNumLabelClasses()); + } else { + // first + opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); + for (size_t i = 1; i < num_layers - 1; i++) { + opt_sizes.emplace_back(layer_size * layer_size); + } + // last + opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); + } + GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); + + galois::AdamOptimizer::AdamConfiguration adam_config; + adam_config.alpha = learning_rate; + + // TODO only adam works right now, add the others later + return std::make_unique(adam_config, opt_sizes, + num_layers); +} + +std::vector CreateFanOutVector(); + //! Using command line args above, create a GNN using some specified layer type //! as the intermediate layer. -std::unique_ptr InitializeGraphNeuralNetwork(); +template +std::unique_ptr> +InitializeGraphNeuralNetwork() { + // partition/load graph + auto gnn_graph = std::make_unique>( + input_directory, input_name, partition_scheme, !multiclass_labels, + useWMD); + + // create layer types vector + std::vector layer_types = CreateLayerTypesVector(); + // sizes + std::vector layer_sizes_vector = + CreateLayerSizesVector(gnn_graph.get()); + // layer config object + galois::GNNLayerConfig layer_config = CreateLayerConfig(); + // GNN config object + galois::GraphNeuralNetworkConfig gnn_config( + num_layers, layer_types, layer_sizes_vector, output_layer_type, + do_graph_sampling, layer_config); + gnn_config.use_train_subgraph_ = use_train_subgraph; + gnn_config.validation_interval_ = val_interval; + gnn_config.test_interval_ = test_interval; + gnn_config.train_minibatch_size_ = train_minibatch_size; + gnn_config.test_minibatch_size_ = test_minibatch_size; + gnn_config.minibatch_test_interval_ = minibatch_test_interval; + gnn_config.inductive_subgraph_ = inductive_subgraph; + gnn_config.fan_out_vector_ = CreateFanOutVector(); + + // optimizer + std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); + + // create the gnn + return std::make_unique>( + std::move(gnn_graph), std::move(opt), std::move(gnn_config)); +} diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index d1685b8e2b..44b11cfa9b 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -26,9 +26,9 @@ llvm::cl::opt partition_scheme( "Original Cartesian Vertex-Cut")), cll::init(galois::graphs::GNNPartitionScheme::kOEC)); -cll::opt useShad("useShad", cll::desc("true if the input graph is" - " SHAD WMD graph format." - " Otheriwse, set false."), +cll::opt useWMD("useWMD", cll::desc("true if the input graph is" + " SHAD WMD graph format." 
+ " Otheriwse, set false."), cll::init(false)); llvm::cl::opt num_layers( @@ -206,55 +206,6 @@ std::vector CreateLayerTypesVector() { return layer_types; } -//! Initializes the vector of layer sizes from command line args + graph -std::vector -CreateLayerSizesVector(const galois::graphs::GNNGraph* gnn_graph) { - // set layer sizes for intermdiate and output layers - std::vector layer_sizes_vector; - - // if (layer_sizes.size()) { - // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - // for (size_t i = 0; i < num_layers; i++) { - // layer_sizes_vector.emplace_back(layer_sizes[i]); - // } - // // verify user satisfies last intermediate layer needing to have same size - // // as # label classes - // if (layer_sizes_vector.back() != gnn_graph->GetNumLabelClasses()) { - // galois::gWarn( - // "Size of last layer (", layer_sizes_vector.back(), - // ") is not equal to # label classes: forcefully changing it to ", - // gnn_graph->GetNumLabelClasses()); - // layer_sizes_vector.back() = gnn_graph->GetNumLabelClasses(); - // layer_sizes[num_layers - 1] = gnn_graph->GetNumLabelClasses(); - // } - - // GALOIS_LOG_ASSERT(layer_sizes_vector.back() == - // gnn_graph->GetNumLabelClasses()); - //} else { - // // default 16 for everything until last 2 - // for (size_t i = 0; i < num_layers - 1; i++) { - // layer_sizes_vector.emplace_back(16); - // } - // // last 2 sizes must be equivalent to # label classes; this is the last - // // intermediate layer - // layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - //} - - for (size_t i = 0; i < num_layers - 1; i++) { - layer_sizes_vector.emplace_back(layer_size); - } - // last 2 sizes must be equivalent to # label classes; this is the last - // intermediate layer - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - // TODO - // for now only softmax layer which dictates the output size of the last - // intermediate layer + size of the output layer - // output layer at the moment required to be same as # label classes - layer_sizes_vector.emplace_back(gnn_graph->GetNumLabelClasses()); - - return layer_sizes_vector; -} - //! Setup layer config struct based on cli args galois::GNNLayerConfig CreateLayerConfig() { galois::GNNLayerConfig layer_config; @@ -267,61 +218,6 @@ galois::GNNLayerConfig CreateLayerConfig() { return layer_config; } -std::unique_ptr -CreateOptimizer(const galois::graphs::GNNGraph* gnn_graph) { - std::vector opt_sizes; - - // optimizer sizes are based on intermediate layer sizes, input feats, and - // # label classes - // if (layer_sizes.size()) { - // GALOIS_LOG_ASSERT(layer_sizes.size() == num_layers); - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_sizes[0]); - // // assumption here is that if it reached this point then layer sizes were - // // already sanity checked previously (esp. 
last layer) - // for (size_t i = 1; i < num_layers; i++) { - // opt_sizes.emplace_back(layer_sizes[i] * layer_sizes[i - 1]); - // } - //} else { - // // everything is size 16 until last - // if (num_layers == 1) { - // // single layer requires a bit of special handling - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * - // gnn_graph->GetNumLabelClasses()); - // } else { - // // first - // opt_sizes.emplace_back(gnn_graph->node_feature_length() * 16); - // for (size_t i = 1; i < num_layers - 1; i++) { - // opt_sizes.emplace_back(16 * 16); - // } - // // last - // opt_sizes.emplace_back(16 * gnn_graph->GetNumLabelClasses()); - // } - //} - - // everything is size 16 until last - if (num_layers == 1) { - // single layer requires a bit of special handling - opt_sizes.emplace_back(gnn_graph->node_feature_length() * - gnn_graph->GetNumLabelClasses()); - } else { - // first - opt_sizes.emplace_back(gnn_graph->node_feature_length() * layer_size); - for (size_t i = 1; i < num_layers - 1; i++) { - opt_sizes.emplace_back(layer_size * layer_size); - } - // last - opt_sizes.emplace_back(layer_size * gnn_graph->GetNumLabelClasses()); - } - GALOIS_LOG_ASSERT(opt_sizes.size() == num_layers); - - galois::AdamOptimizer::AdamConfiguration adam_config; - adam_config.alpha = learning_rate; - - // TODO only adam works right now, add the others later - return std::make_unique(adam_config, opt_sizes, - num_layers); -} - std::vector CreateFanOutVector() { std::vector fan_out; // fan out only matters if graph sampling is enabled @@ -342,37 +238,3 @@ std::vector CreateFanOutVector() { } return fan_out; } - -std::unique_ptr InitializeGraphNeuralNetwork() { - // partition/load graph - auto gnn_graph = std::make_unique( - input_directory, input_name, partition_scheme, !multiclass_labels, - useShad); - - // create layer types vector - std::vector layer_types = CreateLayerTypesVector(); - // sizes - std::vector layer_sizes_vector = - CreateLayerSizesVector(gnn_graph.get()); - // layer config object - galois::GNNLayerConfig layer_config = CreateLayerConfig(); - // GNN config object - galois::GraphNeuralNetworkConfig gnn_config( - num_layers, layer_types, layer_sizes_vector, output_layer_type, - do_graph_sampling, layer_config); - gnn_config.use_train_subgraph_ = use_train_subgraph; - gnn_config.validation_interval_ = val_interval; - gnn_config.test_interval_ = test_interval; - gnn_config.train_minibatch_size_ = train_minibatch_size; - gnn_config.test_minibatch_size_ = test_minibatch_size; - gnn_config.minibatch_test_interval_ = minibatch_test_interval; - gnn_config.inductive_subgraph_ = inductive_subgraph; - gnn_config.fan_out_vector_ = CreateFanOutVector(); - - // optimizer - std::unique_ptr opt = CreateOptimizer(gnn_graph.get()); - - // create the gnn - return std::make_unique( - std::move(gnn_graph), std::move(opt), std::move(gnn_config)); -} diff --git a/lonestar/libgnnbench/src/Start.cpp b/lonestar/libgnnbench/src/Start.cpp index daff6ad114..ed928374cc 100644 --- a/lonestar/libgnnbench/src/Start.cpp +++ b/lonestar/libgnnbench/src/Start.cpp @@ -117,6 +117,33 @@ void GNNBenchStart(int argc, char** argv, const char* app, const char* desc, galois::runtime::reportParam("GNNBench", "IsGraphSampled", do_graph_sampling); galois::runtime::reportParam("GNNBench", "LearningRate", learning_rate); + + if (useWMD && + partition_scheme != galois::graphs::GNNPartitionScheme::kOCVC) { + // cvc/oec (GNN-CVC, GNN-OEC in CuSP), not ocvc, are variants + // of the default CuSP cvc/oec partitioning policies. 
+ // The original partitioning policies (including ocvc) only + // consider and attempt to balance the number of master nodes + // for each host. + // However, Galois-GNN chooses training vertices from the original graph, + // and extracts, constructs, uses a subgraph only with them for training. + // In this case, especially Galois-GNN typically chooses a consecutive + // range of vertices as the training vertices. + // This method might cause load imbalancing among hosts since most of the + // training master nodes are skewed to the few hosts. + // In order to alleviate this issue, Galois-GNN provides those variant + // partitioning policies. They consider and attempt to balance the + // number of master "training" nodes for each host. + // SHAD-GNN on WMD graphs is not necessarily constrained to this design. + // SHAD-GNN has the specific number of training vertices, and randomly + // selects vertices from a graph as that, which means that Galois-GNN + // could avoid vertex imbalancing due to the skewness if it chooses + // vertices in balance manner. + // To sum up, we do not support the specialized partitioning policies, + // but choose vertices in balance manner. + GALOIS_LOG_FATAL("Gnn CVC and OEC are not supported for WMD graphs {}", + GNNPartitionToString(partition_scheme)); + } } char name[256]; diff --git a/scripts/shad-gnn/check_feature_construction.py b/scripts/shad-gnn/check_feature_construction.py new file mode 100644 index 0000000000..62538431a6 --- /dev/null +++ b/scripts/shad-gnn/check_feature_construction.py @@ -0,0 +1,51 @@ +import csv + +""" +@autor: Hochan Lee (hochan.lee@amd.com) + +Requirement: + +The below two files should exist on the directory +where this script runs. + +1) solution.csv is the solution file. +2) 2hop.[host id].feat is the results of the feature construction +that we want to check correctness. + +Command: +python check_feature_construction.py + +""" +num_hosts = 4 + +solution = {} +with open("solution.csv", "r") as f: + reader = csv.reader(f) + for row in reader: + rlen = len(row) + feat = [] + for i in range(1, rlen): + feat.append(int(row[i])) + solution[row[0]] = feat + +fail = False +for i in range(0, num_hosts): + with open("2hop."+str(i)+".feat", "r") as f: + reader = csv.reader(f) + for row in reader: + rlen = len(row) + feat = [] + for j in range(1, rlen): + feat.append(int(row[j])) + key = row[0] + + solution_feat = solution[key] + for j in range(0, rlen-1): + if solution_feat[j] != feat[j]: + print(key, " failed at ", j, " on host:", i) + fail = True + +if fail: + print("Verification failed") +else: + print("Verification succeeded") From 40609e2182badbc2bbabfdc85d617088c6f66c3c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Fri, 15 Sep 2023 20:45:30 -0500 Subject: [PATCH 605/660] Add ego graph construction to GCN This commit adds ego graph construction for each epoch to GCN layer. The original GCN paper did not use ego graphs, but the whole graph, and so, Galois-GNN didn't implement that intentionally. To follow SHAD GCN reference code, we now added that. This commit only contains a single host execution unit test because designing and implementing multi-host execution is a time consuming task and so, I decided to do that later. But, without the unit test, I confirmed correctness of the multi-host execution based on the below changes. 1. Set all layer weights to 1, instead of random values. 2. Used nodes within global node ID range for training. 
(So, the nodes are deterministic) (The original code uses random selection to match SHAD's one) 3. Compared 1-host and multi hosts, like 2 and 4 hosts, accuracy results on the graph sampling mode. 4. They should be same if the GCN graph sampling is correct. --- libgnn/include/galois/graphs/GNNGraph.h | 74 ++++++++- .../galois/layers/GraphConvolutionalLayer.h | 133 ++++++++++++---- libgnn/test/CMakeLists.txt | 10 +- libgnn/test/gcn-sample-edge-test.cpp | 148 ++++++++++++++++++ libgnn/test/sample-test.cpp | 16 ++ 5 files changed, 342 insertions(+), 39 deletions(-) create mode 100644 libgnn/test/gcn-sample-edge-test.cpp diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index ad41def334..146daf24b3 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -297,7 +297,6 @@ class GNNGraph { return edge_sample_status_[layer_num].test(ei); } else { return subgraph_->OutEdgeSampled(ei, layer_num, *this); - return false; } }; bool IsEdgeSampled(EdgeIterator ei, size_t layer_num) const { @@ -545,7 +544,6 @@ class GNNGraph { continue; } } - MakeEdgeSampled(edge_iter, agg_layer_num); uint32_t dest = partitioned_graph_->getEdgeDst(edge_iter); if (!IsInSampledGraph(dest)) { @@ -1072,7 +1070,57 @@ class GNNGraph { bool MoreTestMinibatches() { return !test_batcher_->NoMoreMinibatches(); }; ////////////////////////////////////////////////////////////////////////////// - GNNFloat GetGCNNormFactor(GraphNode lid) const { + + /** + * @brief Normalization factor calculation for GCN without graph sampling + * + * @detail This function calculates normalization factor for nodes + * on a GCN layer, but not with graph sampling (ego graph construction). + * This normalization is proposed in GCN paper, and its equation is + * D^(-1/2)*A*D^(-1/2). + * XXX(hc): This degraded accuracy when graph sampling was enabled. + * That could be many reasons for that, for example, a graph was already + * small, and so, sampled graphs across layers are too small to normalize, + * or, it might be theoretical design reason as the original GCN + * did not consider ego graph construction. + * For example, the one possible reason is that backward phase and + * forward phase edge iterators are also different and maybe need to + * use different iterators. + * For now, I stopped this analysis and + * just enabled this method for only GCN without graph + * sampling. With graph sampling, I used SAGE's graph normalization. 
+ */ + GNNFloat GetGCNNormFactor(GraphNode lid + /*, size_t graph_user_layer_num*/) const { +#if 0 + if (use_subgraph_ || use_subgraph_view_) { + size_t degree; + if (!subgraph_choose_all_) { + // case because degrees in each layer differ + degree = + sampled_out_degrees_[graph_user_layer_num][ + subgraph_->SIDToLID(lid)]; + } else { + // XXX if inductive + // degree = global_train_degrees_[subgraph_->SIDToLID(n)]; + degree = global_degrees_[subgraph_->SIDToLID(lid)]; + } + if (degree) { + return 1.0 / std::sqrt(static_cast(degree) + 1); + } else { + return 0; + } + } else { + if (global_degrees_[lid]) { + if (this->size() != this->active_size()) { + std::cout << lid << " does not match\n"; + } + return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); + } else { + return 0.0; + } + } +#endif if (global_degrees_[lid]) { return 1.0 / std::sqrt(static_cast(global_degrees_[lid]) + 1); } else { @@ -1556,6 +1604,26 @@ class GNNGraph { return non_layer_zero_masters_; } + // TODO(hc): `ResizeSamplingBitsets()` and + // `GetDefinitelySampledNodesBset()` expose private member variables + // for unit tests. Other than them, these should not be used. + + void ResizeSamplingBitsets() { + if (!bitset_sampled_degrees_.size()) { + bitset_sampled_degrees_.resize(partitioned_graph_->size()); + } + if (!bitset_sample_flag_.size()) { + bitset_sample_flag_.resize(size()); + } + if (!definitely_sampled_nodes_.size()) { + definitely_sampled_nodes_.resize(partitioned_graph_->size()); + } + } + + galois::DynamicBitSet& GetDefinitelySampledNodesBset() { + return definitely_sampled_nodes_; + } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 2c7a41ecab..be882647a1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -218,7 +218,25 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.output_columns) { // aggdata can == p_intemp1; in other words, need to use before overwrite // mask it, then use it - this->MaskInputNonMasters(&agg_data); + if (this->layer_number_ != 0) { + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&agg_data, + this->layer_dimensions_.output_rows); + } + } else { + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(input_gradient, + this->layer_dimensions_.output_rows); + } + } #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { @@ -231,9 +249,10 @@ class GraphConvolutionalLayer : public GNNLayer { #endif weight_gradient_timer.start(); // temp 2 holds aggregated feature vectors from forward phase + // use output rows since gcn can use subgraphs galois::CBlasSGEMM( CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, - this->layer_dimensions_.input_rows, + this->layer_dimensions_.output_rows, this->layer_dimensions_.output_columns, agg_data.data(), input_gradient->data(), this->p_layer_weight_gradients_.data()); weight_gradient_timer.stop(); @@ -249,7 +268,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows * 
this->layer_dimensions_.output_columns); // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative( + input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -271,11 +291,24 @@ class GraphConvolutionalLayer : public GNNLayer { // means I can mess with the input data itself instad of masking the // gradients I can mask the input if (this->layer_number_ != 0) { - this->MaskInputNonMasters(&input_data); + if (this->graph_.IsSubgraphOn()) { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskInputNonMasters(&input_data, + this->layer_dimensions_.input_rows); + } } else { // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients - this->MaskGradientNonMasters(&p_out_temp_); + if (this->graph_.IsSubgraphOn()) { + this->MaskGradientNonMasters(&p_out_temp_, + this->layer_dimensions_.input_rows, + this->graph_.GetNonLayerZeroMasters()); + } else { + this->MaskGradientNonMasters(&p_out_temp_); + } } #ifdef GALOIS_ENABLE_GPU @@ -347,14 +380,18 @@ class GraphConvolutionalLayer : public GNNLayer { void AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*) { + galois::substrate::PerThreadStorage>*, + bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = this->graph_.size(); + size_t num_nodes = (is_backward)? + this->layer_dimensions_.input_rows : + this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); + assert(0 == *(this->graph_.begin_owned())); galois::do_all( - galois::iterate(static_cast(0), num_nodes), + galois::iterate(*(this->graph_.begin()), num_nodes), [&](size_t src) { size_t index_to_src_feature = src * column_length; // zero out src feature first @@ -364,24 +401,31 @@ class GraphConvolutionalLayer : public GNNLayer { if (this->layer_phase_ == GNNPhase::kTrain) { if (this->IsSampledLayer()) { - // XXX(loc) - GALOIS_LOG_WARN( - "Edge sampling not yet implemented for GCN; only SAGE"); - // check if node is part of sampled graph; ignore after 0'ing if - // not sampled - if (!this->graph_.IsInSampledGraph(src)) + // Check if node is part of sampled graph; ignore after + // 0'ing if it is not sampled. 
+ // TODO(hc): check if SAGE also checks this + if (!this->graph_.IsInSampledGraph(src)) { return; + } } } - GNNFloat source_norm = 0.0; + GNNFloat source_norm = 1.0; if (!this->config_.disable_normalization) { - source_norm = this->graph_.GetGCNNormFactor(src); + if (this->graph_.IsSubgraphOn() || + this->graph_.IsSubgraphViewOn()) { + source_norm = + this->graph_.GetDegreeNorm( + src, this->graph_user_layer_number_); + } else { + source_norm = this->graph_.GetGCNNormFactor(src); + } } // init to self if (!this->config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set(src); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -393,25 +437,44 @@ class GraphConvolutionalLayer : public GNNLayer { } // loop through all destinations to grab the feature to aggregate - for (auto e = this->graph_.edge_begin(src); - e != this->graph_.edge_end(src); e++) { - size_t dst = this->graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set(src); - - if (this->layer_phase_ == GNNPhase::kTrain) { + auto e_beg = (is_backward)? + this->graph_.in_edge_begin(src) : this->graph_.edge_begin(src); + auto e_end = (is_backward)? + this->graph_.in_edge_end(src) : this->graph_.edge_end(src); + for (auto e = e_beg; e != e_end; e++) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { - // ignore non-sampled nodes - if (this->layer_phase_ == GNNPhase::kTrain && - !this->graph_.IsInSampledGraph(dst)) + bool is_sampled = (is_backward)? + this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_) : + this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_); + // ignore non-sampled nodes and edges + if (!is_sampled) { continue; + } } } - + size_t dst = (is_backward)? + this->graph_.GetInEdgeDest(e) : this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set( + this->graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!this->config_.disable_normalization) { - GNNFloat norm_scale = - source_norm * this->graph_.GetGCNNormFactor(dst); + GNNFloat norm_scale; + if (this->graph_.IsSubgraphOn() || + this->graph_.IsSubgraphViewOn()) { + norm_scale = (is_backward)? 
+ this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_) + : source_norm; + } else { + norm_scale = + source_norm * this->graph_.GetGCNNormFactor(dst); + } + galois::VectorMulAdd( column_length, &aggregate_output[index_to_src_feature], &node_embeddings[index_to_dst_feature], norm_scale, @@ -429,7 +492,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::loopname("ConvolutionalAggregateAll")); // aggregate sync aggregate_all_sync_timer.start(); - this->graph_.AggregateSync(aggregate_output, column_length); + this->graph_.AggregateSync(aggregate_output, column_length, + is_backward, num_nodes); aggregate_all_sync_timer.stop(); } @@ -469,7 +533,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_number_); } else { #endif - AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts); + AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, + is_backward); #ifdef GALOIS_ENABLE_GPU } #endif @@ -495,7 +560,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows, this->layer_dimensions_.input_columns, this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), output); + node_embeddings, this->layer_weights_.data(), + output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -503,7 +569,8 @@ class GraphConvolutionalLayer : public GNNLayer { } //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { + void UpdateEmbeddingsDerivative( + const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXform", kRegionName); timer.start(); diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 00aa14bce6..d005ddd6bc 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -145,13 +145,17 @@ if (NOT GALOIS_ENABLE_GPU) target_link_libraries(f1-test galois_gnn) add_test(NAME f1-test COMMAND f1-test) - add_executable(sample-test sample-test.cpp) - target_link_libraries(sample-test galois_gnn) - add_test(NAME sample-test COMMAND sample-test) + #add_executable(sample-test sample-test.cpp) + #target_link_libraries(sample-test galois_gnn) + #add_test(NAME sample-test COMMAND sample-test) add_executable(sample-bit-test sample-bit-test.cpp) target_link_libraries(sample-bit-test galois_gnn) add_test(NAME sample-bit-test COMMAND sample-bit-test) + + add_executable(gcn-sample-edge-test gcn-sample-edge-test.cpp) + target_link_libraries(gcn-sample-edge-test galois_gnn) + add_test(NAME gcn-sample-edge-test COMMAND gcn-sample-edge-test) else() add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) target_link_libraries(gpu-sage-layer-test galois_gnn) diff --git a/libgnn/test/gcn-sample-edge-test.cpp b/libgnn/test/gcn-sample-edge-test.cpp new file mode 100644 index 0000000000..8bb4e74f9a --- /dev/null +++ b/libgnn/test/gcn-sample-edge-test.cpp @@ -0,0 +1,148 @@ +/** + * This test checks correctness by comparing hand calculation + * of the forward and backward phases. + * This is implemented to check correctness of GCN layer. + * Below is the process: + * 1. Mark and check nodes and edges to be initially sampled. + * 2. Nodes adjacent to the sampled edges are sampled. + * 3. Perform forward/backward phases and compare the results + * with hand calculation results. + */ + +// TODO(hc): Designing and implementing multi-host execution is +// a time consuming task and so, I will work on that later. 
+// But, without test, I confirmed correctness of the multi-host +// execution based on the below changes. +// +// 1. Set all layer weights to 1, instead of random values. +// 2. Used nodes within global node ID range for training. +// (So, the nodes are deterministic) +// (The original code uses random selection to match SHAD's one) +// 3. Compared 1-host and multi hosts, like 2 and 4 hosts, +// accuracy results on the graph sampling mode. +// 4. They should be same if the GCN graph sampling is correct. +// (It was on the test done on 09/15/2023) + +#include "galois/layers/GraphConvolutionalLayer.h" +#include "galois/layers/SAGELayer.h" + +int main() { + galois::DistMemSys G; + + size_t num_threads = 1; + // tester graph: 0 - 1 - 2 - 3 - 4 - 5 - 6 + galois::graphs::GNNGraph test_graph( + "tester", galois::graphs::GNNPartitionScheme::kOEC, true, false); + test_graph.InitializeSamplingData(); + + galois::GNNLayerConfig dcon; + dcon.disable_aggregate_after_update = false; + dcon.disable_normalization = false; + dcon.DebugConfig(); + // Choose a few sample nodes + test_graph.SetSampledNode(0); + test_graph.SetSampledNode(4); + test_graph.UnsetSampledNode(1); + test_graph.UnsetSampledNode(2); + test_graph.UnsetSampledNode(3); + test_graph.UnsetSampledNode(5); + test_graph.UnsetSampledNode(6); + + test_graph.ResizeSamplingBitsets(); + test_graph.SampleAllEdges(0, false, 1); + + // After the above lines, nodes 0, 1, 3, 4, 5 and + // edges 0, 7, 8 should be sampled. + // So, + // 0 -> 1, 2 <- 3 -> 4 + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(0)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(1)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(3)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(4)); + GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(5)); + + GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(7)); + GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(8)); + + + galois::DynamicBitSet& bset = + test_graph.GetDefinitelySampledNodesBset(); + bset.ParallelReset(); + bset.set(0); + bset.set(1); + bset.set(3); + bset.set(4); + bset.set(5); + test_graph.ConstructSampledSubgraph(1); + test_graph.EnableSubgraph(); + + galois::GNNLayerDimensions dimension_0; + dimension_0.input_rows = 5; + dimension_0.input_columns = 3; + dimension_0.output_columns = 2; + + // Layer declaration + std::vector back_matrix(15); + galois::PointerWithSize p_back(back_matrix); + std::unique_ptr> layer_1 = + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); + + layer_1->InitAllWeightsTo1(); + layer_1->EnableSampling(); + galois::PointerWithSize features = + test_graph.GetLocalFeatures(); + + galois::PointerWithSize layer_1_forward_output = + layer_1->ForwardPhase(features); + + GALOIS_LOG_ASSERT(layer_1_forward_output[0] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[1] == 3); + GALOIS_LOG_ASSERT(layer_1_forward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[3] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[4] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[5] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[6] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[7] == 24); + GALOIS_LOG_ASSERT(layer_1_forward_output[8] == 0); + GALOIS_LOG_ASSERT(layer_1_forward_output[9] == 0); + + // Dummy gradients + std::vector dummy_ones_v(10, 1); + galois::PointerWithSize dummy_ones(dummy_ones_v); + dummy_ones_v.assign(10, 1); + dummy_ones_v[4] = 0; + dummy_ones_v[5] = 0; + + galois::PointerWithSize layer_1_backward_output = + layer_1->BackwardPhase( + test_graph.GetLocalFeatures(), &dummy_ones); + 
+ GALOIS_LOG_ASSERT(layer_1_backward_output[0] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[1] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[2] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[3] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[4] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[5] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[6] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[7] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[8] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[9] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[10] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[11] == 0); + GALOIS_LOG_ASSERT(layer_1_backward_output[12] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[13] == 2); + GALOIS_LOG_ASSERT(layer_1_backward_output[14] == 2); + + galois::PointerWithSize layer_1_weight_gradients = + layer_1->GetLayerWeightGradients(); + + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[3] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[4] == 6); + GALOIS_LOG_ASSERT(layer_1_weight_gradients[5] == 6); + + return 0; +} diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 927f4b0e9f..0bda9d81a8 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -1,6 +1,22 @@ //! @file sample-test.cpp //! Sampling tester +/// TODO(hc): This test is deprecated as GCN layer now supports +/// edge sampling, as well as node sampling. +/// The previous GCN only checks if node is sampled, but +/// now it checks edge sampling and for that, it utilizes +/// a bitset to mark sampled edges. +/// If that bitset is not set, the corresponding edge is ignored. +/// However, this test currently does not consider this case, +/// and doesn't work. +/// To satisfy the previous assumption and make this test work, +/// we should mark the entire adjacent edges of the sampled nodes. +/// In this case, we should not mark the edges' destination nodes as +/// sampled nodes, and so, let src node iterator skip those nodes +/// but only allow to iterate them as outgoing destinations. +/// We can reuse this code later, and so, I will not remove this +/// from the current source tree. 
+ #include "galois/Logging.h" #include "galois/GNNMath.h" #include "galois/layers/GraphConvolutionalLayer.h" From e9a2a03d13c6329185820f28e9983faa95877ca3 Mon Sep 17 00:00:00 2001 From: marcopolo4096 Date: Wed, 20 Sep 2023 13:16:03 -0500 Subject: [PATCH 606/660] Fixed cpuinf parsing bug (#10) Co-authored-by: Marko --- libgalois/src/HWTopoLinux.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libgalois/src/HWTopoLinux.cpp b/libgalois/src/HWTopoLinux.cpp index 0835b0070e..486e707230 100644 --- a/libgalois/src/HWTopoLinux.cpp +++ b/libgalois/src/HWTopoLinux.cpp @@ -100,7 +100,7 @@ unsigned getNumaNode(cpuinfo& c) { std::vector parseCPUInfo() { std::vector vals; - const int len = 1024; + const int len = 4096; std::array line; std::ifstream procInfo("/proc/cpuinfo"); From 31e32c210ee5da23d0d68d882e071b258511e12c Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Thu, 21 Sep 2023 20:40:57 -0500 Subject: [PATCH 607/660] Add ReLU layer (#9) Co-authored-by: Hochan Lee --- libgnn/include/galois/GraphNeuralNetwork.h | 6 + libgnn/include/galois/layers/GNNLayer.h | 5 +- libgnn/include/galois/layers/ReLULayer.h | 126 +++++++++++++++++++++ lonestar/libgnnbench/src/Input.cpp | 1 + 4 files changed, 136 insertions(+), 2 deletions(-) create mode 100644 libgnn/include/galois/layers/ReLULayer.h diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index c63175f65e..88a48f961c 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -14,6 +14,7 @@ #include "galois/layers/DenseLayer.h" #include "galois/layers/GraphConvolutionalLayer.h" #include "galois/layers/L2NormLayer.h" +#include "galois/layers/ReLULayer.h" #include "galois/layers/SAGELayer.h" #include "galois/layers/SigmoidLayer.h" #include "galois/layers/SoftmaxLayer.h" @@ -227,6 +228,11 @@ class GraphNeuralNetwork { i, *graph_, &prev_output_layer, layer_dims, config_.default_layer_config()))); break; + case GNNLayerType::kReLU: + gnn_layers_.push_back(std::move(std::make_unique>( + i, *graph_, &prev_output_layer, layer_dims, + config_.default_layer_config()))); + break; case GNNLayerType::kDense: gnn_layers_.push_back(std::move(std::make_unique>( i, *graph_, &prev_output_layer, layer_dims, diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 9ac6b925ae..6929eb70a2 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -26,7 +26,9 @@ enum class GNNLayerType { //! Dense linear xform layer kDense, //! L2 normalization layer - kL2Norm + kL2Norm, + //! ReLU layer + kReLU // TODO GAT }; @@ -647,7 +649,6 @@ class GNNLayer { void ActivationDerivative(PointerWithSize* gradient) { galois::StatTimer timer("BackwardActivation", "GNNLayer"); TimerStart(&timer); - #ifdef GALOIS_ENABLE_GPU if (device_personality == DevicePersonality::GPU_CUDA) { base_gpu_object_.ActivationDerivativeGPU(gradient->data(), diff --git a/libgnn/include/galois/layers/ReLULayer.h b/libgnn/include/galois/layers/ReLULayer.h new file mode 100644 index 0000000000..879c462330 --- /dev/null +++ b/libgnn/include/galois/layers/ReLULayer.h @@ -0,0 +1,126 @@ +#pragma once +#include "galois/layers/GNNLayer.h" +#include "galois/GNNMath.h" + +// XXX(hc): We don't have GPU ReLU implementation. + +// TODO(hc): All intermediate layers in Galois-GNN have internal ReLU +// layer. So, this is not yet being used. 
+// BUT, I would like to leave this for the future. + +namespace galois { + +//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in a row. +//! Currently this only works with **single class* labels and is coded as such. +template +class ReLULayer : public GNNLayer { +public: + ReLULayer(size_t layer_num, + const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, + const GNNLayerDimensions& dimensions) + : ReLULayer( + layer_num, graph, backward_output_matrix, dimensions, + GNNLayerConfig{.allocate_weights = false, .disable_output = true}) + {} + + ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, + PointerWithSize* backward_output_matrix, + const GNNLayerDimensions& dimensions, + const GNNLayerConfig& config) : GNNLayer(layer_num, graph, + backward_output_matrix, dimensions, config) { + this->layer_type_ = galois::GNNLayerType::kReLU; + GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); + GALOIS_LOG_VERBOSE("ReLU initialized"); + } + + //! Perform max(0, input) to each row of input + const PointerWithSize + ForwardPhase(const PointerWithSize input_embeddings) final { + return ForwardPhaseCPU(input_embeddings); + } + + const PointerWithSize + ForwardPhaseCPU(const PointerWithSize input_embeddings) { + galois::StatTimer Timer("ReLULayer", "ReLULayer"); + this->TimerStart(&Timer); + + // note: p_backward == input_embeddings + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && + !this->graph_.IsInSampledGraphSubgraph(row)) { + return; + } + } + + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + // TODO(hc): SHAD uses inplace update but Galois-GNN uses + // separate vector for outputs. + // Revisit this if there is performance differences. + this->forward_output_matrix_[row_index] = + std::max(float{0}, input_embeddings[row_index]); + } + } + }, + // TODO chunk size? + // steal on as some threads may have nothing to work on + // galois::steal(), galois::loopname("ReLUForward")); + galois::steal()); + this->TimerStop(&Timer); + return this->forward_output_matrix_; + } + + PointerWithSize BackwardPhaseCPU( + PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) { + galois::StatTimer Timer("ReLUBackward", "ReLULayer"); + this->TimerStart(&Timer); + + const size_t feature_length = this->layer_dimensions_.input_columns; + + galois::do_all( + galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), + [&](const unsigned row) { + if (this->IsSampledLayer()) { + if (this->layer_phase_ == GNNPhase::kTrain && + !this->graph_.IsInSampledGraphSubgraph(row)) + return; + } + // Even though ReLU is non-differentiable at 0, + // PyTorch's ReLU returns 0 for the derivative of 0. + if (this->graph_.IsValidForPhase(row, this->layer_phase_)) { + size_t row_offset = row * feature_length; + for (size_t row_index = row_offset; + row_index < (row_offset + feature_length); row_index++) { + this->p_backward_output_matrix_[row_index] = + (prev_layer_input[row_index] > 0? 
1 : 0) * + (*input_gradients)[row_index]; + } + } + }, + galois::steal(), galois::loopname("ReLUBackward")); + + this->TimerStop(&Timer); + + return this->p_backward_output_matrix_; + } + + //! Get gradients to fix distribution such that it leans more towards single + //! class ground truth. + PointerWithSize + BackwardPhase(PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) final { + return BackwardPhaseCPU(prev_layer_input, input_gradients); + } +}; + +} // namespace galois diff --git a/lonestar/libgnnbench/src/Input.cpp b/lonestar/libgnnbench/src/Input.cpp index 44b11cfa9b..c1da754222 100644 --- a/lonestar/libgnnbench/src/Input.cpp +++ b/lonestar/libgnnbench/src/Input.cpp @@ -63,6 +63,7 @@ llvm::cl::opt cl_layer_type( clEnumValN(galois::GNNLayerType::kSAGE, "sage", "SAGE layer (GCN with concat + mean)"), clEnumValN(galois::GNNLayerType::kL2Norm, "l2norm", "L2 norm layer"), + clEnumValN(galois::GNNLayerType::kReLU, "ReLU", "ReLU norm layer"), clEnumValN(galois::GNNLayerType::kDense, "dense", "Dense layer")), cll::init(galois::GNNLayerType::kSAGE)); From 9c74645629cc456ff20232bcd17316f4e3888be4 Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Mon, 2 Oct 2023 15:58:35 -0500 Subject: [PATCH 608/660] feat: Add pre-commit to the repo and contributing guidelines --- .pre-commit-config.yaml | 25 +++++++++++++++++++++++++ .tool-versions | 1 + CONTRIBUTING.md | 31 +++++++++++++++++++++++++++++++ Makefile | 20 ++++++++++++++++++++ 4 files changed, 77 insertions(+) create mode 100644 .pre-commit-config.yaml create mode 100644 .tool-versions create mode 100644 CONTRIBUTING.md create mode 100644 Makefile diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 0000000000..c30b4276e2 --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,25 @@ +--- +files: ^libcusp|^libdeepgalois|^libdist|^libgalois|^libgluon|^libgnn|^libwmd +exclude: ^scripts|^python|^inputs +repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.2.0 + hooks: + - id: end-of-file-fixer + - id: mixed-line-ending + - id: trailing-whitespace + - repo: https://github.com/Lucas-C/pre-commit-hooks + rev: v1.2.0 + hooks: + - id: forbid-tabs + exclude: ^scripts|^python + - id: remove-tabs + exclude: ^scripts|^python + args: [--whitespaces-count, '2'] + - repo: https://github.com/pocc/pre-commit-hooks + rev: v1.3.5 + hooks: + - id: clang-format + args: [-i] + # - id: clang-tidy + # args: [--fix, -p=build/compile_commands.json] diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000000..c00efa2d48 --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +pre-commit 2.19.0 diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md new file mode 100644 index 0000000000..2297468d67 --- /dev/null +++ b/CONTRIBUTING.md @@ -0,0 +1,31 @@ +# Contributing + +Contributors must run quality checks on code. In place of CI we +recommend using `pre-commit` (described below) instead of running +tools like `clang-format` manually. + +Code should be clear and documented where needed. + +## Tools + +### [asdf](https://asdf-vm.com) + +Provides a declarative set of tools pinned to +specific versions for environmental consistency. + +These tools are defined in `.tool-versions`. +Run `make dependencies` to initialize a new environment. + +### [pre-commit](https://pre-commit.com) + +A left shifting tool to consistently run a set of checks on the code repo. +Our checks enforce syntax validations and formatting. +We encourage contributors to use pre-commit hooks. 
+ +```shell +# install all pre-commit hooks +make hooks + +# run pre-commit on repo once +make pre-commit +``` diff --git a/Makefile b/Makefile new file mode 100644 index 0000000000..2457b3c0a1 --- /dev/null +++ b/Makefile @@ -0,0 +1,20 @@ +dependencies: dependencies-asdf + +dependencies-asdf: + @echo "Updating asdf plugins..." + @asdf plugin update --all >/dev/null 2>&1 || true + @echo "Adding new asdf plugins..." + @cut -d" " -f1 ./.tool-versions | xargs -I % asdf plugin-add % >/dev/null 2>&1 || true + @echo "Installing asdf tools..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf install {}' + @echo "Updating local environment to use proper tool versions..." + @cat ./.tool-versions | xargs -I{} bash -c 'asdf local {}' + @asdf reshim + @echo "Done!" + +hooks: + @pre-commit install --hook-type pre-commit + @pre-commit install-hooks + +pre-commit: + @pre-commit run -a From 57618b1ea1da4755ad5c1eb14a4f1904e939b48e Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Mon, 2 Oct 2023 16:04:54 -0500 Subject: [PATCH 609/660] chore: Run clang-format and pre-commit checks on repo --- libcusp/include/galois/graphs/NewGeneric.h | 30 +- libcusp/test/shad-dist-graph.cpp | 44 +- .../deepgalois/layers/GradientSyncStructs.h | 2 +- .../include/deepgalois/layers/aggregator.h | 2 +- .../deepgalois/layers/graph_conv_layer.h | 2 +- .../include/deepgalois/layers/layer.h | 28 +- .../include/deepgalois/math_functions.hh | 10 +- libdeepgalois/include/deepgalois/optimizer.h | 8 +- libdeepgalois/include/deepgalois/random.h | 2 +- libdeepgalois/include/deepgalois/reader.h | 2 +- libdeepgalois/licensenote.txt | 6 +- libdeepgalois/scripts/run-multi.sh | 34 +- libdeepgalois/scripts/run-single.sh | 34 +- libdeepgalois/src/DistContext.cu | 8 +- libdeepgalois/src/Net.cu | 2 +- libdeepgalois/src/layers/gat_fw.h | 77 +- libdeepgalois/src/layers/graph_conv_layer.cpp | 29 +- libdeepgalois/src/math_functions.cpp | 6 +- libdeepgalois/src/math_functions.cu | 4 +- libdeepgalois/src/utils.cpp | 6 +- libdist/CMakeLists.txt | 2 +- libdist/include/galois/DistGalois.h | 4 +- libdist/include/galois/runtime/Serialize.h | 7 +- libgalois/CMakeLists.txt | 2 +- libgalois/include/galois/Bag.h | 2 +- libgalois/include/galois/FixedSizeRing.h | 4 +- libgalois/include/galois/LargeArray.h | 6 +- libgalois/include/galois/ParallelSTL.h | 4 +- libgalois/include/galois/SharedMemSys.h | 4 +- libgalois/include/galois/Timer.h | 6 +- libgalois/include/galois/gdeque.h | 4 +- .../include/galois/graphs/BufferedGraph.h | 18 +- .../include/galois/graphs/LC_CSR_Graph.h | 6 +- .../include/galois/graphs/LC_CSR_Hypergraph.h | 10 +- .../galois/graphs/LC_InlineEdge_Graph.h | 6 +- .../include/galois/graphs/LC_Linear_Graph.h | 6 +- .../include/galois/graphs/LC_Morph_Graph.h | 6 +- libgalois/include/galois/graphs/MorphGraph.h | 8 +- .../include/galois/graphs/MorphHyperGraph.h | 8 +- .../galois/graphs/Morph_SepInOut_Graph.h | 8 +- libgalois/include/galois/graphs/OCGraph.h | 4 +- libgalois/include/galois/gslist.h | 2 +- .../include/galois/runtime/Executor_ForEach.h | 13 +- libgalois/include/galois/runtime/Mem.h | 4 +- libgalois/include/galois/runtime/Range.h | 4 +- libgalois/include/galois/runtime/SharedMem.h | 4 +- .../include/galois/runtime/ThreadTimer.h | 12 +- .../galois/substrate/PerThreadStorage.h | 4 +- .../include/galois/substrate/SharedMem.h | 4 +- .../include/galois/substrate/ThreadPool.h | 4 +- .../include/galois/worklists/AdaptiveObim.h | 2 +- libgalois/include/galois/worklists/Chunk.h | 4 +- libgalois/include/galois/worklists/WorkList.h | 2 +- 
libgalois/include/shad/DataTypes.h | 785 +++++++++--------- libgalois/include/shad/Graph.h | 212 ++--- libgalois/include/shad/GraphTypes.h | 2 +- libgalois/include/shad/ShadGraphConverter.h | 294 ++++--- libgalois/src/FileGraph.cpp | 2 +- libgalois/test/bandwidth.cpp | 2 +- libgalois/test/move.cpp | 10 +- libgalois/test/reduction.cpp | 4 +- .../include/galois/graphs/GluonSubstrate.h | 23 +- .../include/galois/runtime/SyncStructures.h | 2 +- libgnn/README.md | 18 +- .../galois/graphs/DegreeSyncStructures.h | 2 +- libgnn/include/galois/graphs/GNNGraph.h | 193 +++-- libgnn/include/galois/layers/GNNLayer.h | 4 +- .../galois/layers/GraphConvolutionalLayer.h | 64 +- libgnn/include/galois/layers/ReLULayer.h | 28 +- libgnn/include/galois/layers/SAGELayer.h | 10 +- libgnn/src/GNNMath.cpp | 4 +- libgnn/src/layers/DenseLayer.cpp | 1 - libgnn/src/layers/GNNLayer.cpp | 1 - libgnn/test/CMakeLists.txt | 10 +- libgnn/test/back-conv-test.cpp | 4 +- libgnn/test/convlayer-test.cpp | 12 +- libgnn/test/gcn-sample-edge-test.cpp | 17 +- libgnn/test/gnnconstruct-test.cpp | 4 +- libgnn/test/gnngraph-test.cpp | 8 +- libgnn/test/gpu-back-conv-test.cpp | 4 +- libgnn/test/gpu-convlayer-test.cpp | 12 +- libgnn/test/gpu-sage-layer-test.cpp | 12 +- libgnn/test/l2norm-layer-test.cpp | 4 +- libgnn/test/mkl_micro.cpp | 17 +- libgnn/test/sage-layer-test.cpp | 12 +- libgnn/test/sample-test.cpp | 4 +- libgnn/test/single_mkl_micro.cpp | 120 +-- 87 files changed, 1212 insertions(+), 1209 deletions(-) diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index 9fa37159f1..e8d7e15d8e 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -397,8 +397,8 @@ class NewDistGraphGeneric : public DistGraph { *edgeEnd, base_DistGraph::numGlobalNodes, base_DistGraph::numGlobalEdges); } else { - constructCSRFromSHADGraph( - &bufGraph, &shadConverter, nodeBegin, nodeEnd, host_prefix); + constructCSRFromSHADGraph(&bufGraph, &shadConverter, nodeBegin, nodeEnd, + host_prefix); } graphReadTimer.stop(); @@ -608,10 +608,11 @@ class NewDistGraphGeneric : public DistGraph { template < typename T = NodeTy, typename std::enable_if_t>* = nullptr> - void constructCSRFromSHADGraph( - galois::graphs::BufferedGraph* bufGraph, - shad::ShadGraphConverter* shadConverter, - uint64_t nodeBegin, uint64_t nodeEnd, std::string host_prefix) { + void + constructCSRFromSHADGraph(galois::graphs::BufferedGraph* bufGraph, + shad::ShadGraphConverter* shadConverter, + uint64_t nodeBegin, uint64_t nodeEnd, + std::string host_prefix) { uint32_t numLocalNodes = nodeEnd - nodeBegin; // So, this holds outgoing edge array of a whole (global) graph. uint64_t* outIndexBuffer = shadConverter->getOutIndexBuffer(); @@ -625,14 +626,13 @@ class NewDistGraphGeneric : public DistGraph { // From now on, those arrays store local node information // as a dense memory representation. 
shadConverter->extractLocalOutIndexArray(nodeBegin, nodeEnd); - galois::gInfo(host_prefix, - "Completes local out index array construction"); + galois::gInfo(host_prefix, "Completes local out index array construction"); galois::gInfo(host_prefix, "Starts edge destination/data " "array construction"); uint64_t numLocalEdges = edgeEnd - edgeBegin; shadConverter->constructEdgeArrays(nodeBegin, edgeBegin, numLocalNodes, - numLocalEdges); + numLocalEdges); galois::gInfo(host_prefix, "Completes edge destination/data " "array construction"); @@ -646,12 +646,12 @@ class NewDistGraphGeneric : public DistGraph { } // Disable this method for non-SHAD graph construction. - template < - typename T = NodeTy, - typename std::enable_if_t>* = nullptr> - void constructCSRFromSHADGraph( - galois::graphs::BufferedGraph*, - shad::ShadGraphConverter*, uint64_t, uint64_t, std::string) {} + template >* = + nullptr> + void constructCSRFromSHADGraph(galois::graphs::BufferedGraph*, + shad::ShadGraphConverter*, uint64_t, uint64_t, + std::string) {} /** * @brief Assign a SHAD node type to a node data. diff --git a/libcusp/test/shad-dist-graph.cpp b/libcusp/test/shad-dist-graph.cpp index dedc3c34cb..492bfeb2ad 100644 --- a/libcusp/test/shad-dist-graph.cpp +++ b/libcusp/test/shad-dist-graph.cpp @@ -21,12 +21,12 @@ #include "galois/Galois.h" #include "galois/graphs/CuSPPartitioner.h" -#include "shad/ShadGraphConverter.h" +#include "shad/ShadGraphConverter.h" int main() { galois::DistMemSys G; unsigned M = galois::substrate::getThreadPool().getMaxThreads(); - //M = 1; + // M = 1; galois::setActiveThreads(M); shad::ShadGraphConverter shadConverter; @@ -38,7 +38,8 @@ int main() { std::string filename = "/home/hochan/data.01.csv"; shadConverter.readSHADFile(filename, &numNodes, &numEdges); std::unique_ptr> - graph = galois::cuspPartitionGraph( + graph = galois::cuspPartitionGraph( filename, galois::CUSP_CSR, galois::CUSP_CSR, true, true); std::cout << "Test starts...\n"; @@ -62,31 +63,34 @@ int main() { std::cout << "Num. 
nodes/edges tests has been passed\n"; - uint32_t id = galois::runtime::getSystemNetworkInterface().ID; + uint32_t id = galois::runtime::getSystemNetworkInterface().ID; uint32_t numHosts = galois::runtime::getSystemNetworkInterface().Num; { - std::ofstream fp(std::to_string(id) + ".master"); - for (uint32_t src = 0; src < graph->numMasters(); ++src) { - uint64_t srcglobal = graph->getGID(src); - fp << "node " << srcglobal << ", type: " << graph->getData(src).type << - ", key: " << graph->getData(src).key << "\n"; - for (auto e : graph->edges(src)) { - uint32_t dstlocal = graph->getEdgeDst(e); - uint64_t dstglobal = graph->getGID(dstlocal); - fp << "\t edge dst " << dstglobal << ", type: " << - graph->getEdgeData(e) << "\n"; + std::ofstream fp(std::to_string(id) + ".master"); + for (uint32_t src = 0; src < graph->numMasters(); ++src) { + uint64_t srcglobal = graph->getGID(src); + fp << "node " << srcglobal << ", type: " << graph->getData(src).type + << ", key: " << graph->getData(src).key << "\n"; + for (auto e : graph->edges(src)) { + uint32_t dstlocal = graph->getEdgeDst(e); + uint64_t dstglobal = graph->getGID(dstlocal); + fp << "\t edge dst " << dstglobal << ", type: " << graph->getEdgeData(e) + << "\n"; + } } - } - fp.close(); + fp.close(); } { for (uint32_t host = 0; host < numHosts; ++host) { - if (host == id) { continue; } - std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + ".graph"); + if (host == id) { + continue; + } + std::ofstream fp(std::to_string(id) + "-" + std::to_string(host) + + ".graph"); for (uint32_t i = 0; i < graph->size(); ++i) { - fp << i << ", " << graph->getGID(i) << ", " << - graph->getData(i).type << ", " << graph->getData(i).key << "\n"; + fp << i << ", " << graph->getGID(i) << ", " << graph->getData(i).type + << ", " << graph->getData(i).key << "\n"; } fp.close(); } diff --git a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h index 6f600b40a8..d4c23af1bb 100644 --- a/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h +++ b/libdeepgalois/include/deepgalois/layers/GradientSyncStructs.h @@ -20,7 +20,7 @@ struct GradientSync { } weight += y; // need a post process divide all step - //weight /= 2; + // weight /= 2; return true; } diff --git a/libdeepgalois/include/deepgalois/layers/aggregator.h b/libdeepgalois/include/deepgalois/layers/aggregator.h index 3f2d3c7f1b..8ef845b1d9 100644 --- a/libdeepgalois/include/deepgalois/layers/aggregator.h +++ b/libdeepgalois/include/deepgalois/layers/aggregator.h @@ -13,7 +13,7 @@ void update_all_csrmm(size_t len, Graph& g, const float_t* in, float_t* out, } // namespace deepgalois #else #include "deepgalois/GraphTypes.h" -//#include "graph_gpu.h" +// #include "graph_gpu.h" namespace deepgalois { void update_all(size_t len, GraphGPU& g, const float_t* in, float_t* out, bool norm, const float_t* norm_factor); diff --git a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h index d112ddf785..14c47c9813 100644 --- a/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h +++ b/libdeepgalois/include/deepgalois/layers/graph_conv_layer.h @@ -74,7 +74,7 @@ class graph_conv_layer : public layer { float_t* in_temp1; float_t* trans_data; // y*x mask_t* dropout_mask; // x*y - float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 + float_t epsilon; // LeakyReLU angle of negative slope: set to 0.2 // Glorot & Bengio (AISTATS 2010) inline void 
rand_init_matrix(size_t dim_x, size_t dim_y, vec_t& matrix, diff --git a/libdeepgalois/include/deepgalois/layers/layer.h b/libdeepgalois/include/deepgalois/layers/layer.h index 534d99b821..6e1ac879cc 100644 --- a/libdeepgalois/include/deepgalois/layers/layer.h +++ b/libdeepgalois/include/deepgalois/layers/layer.h @@ -58,23 +58,23 @@ class layer : public deepgalois::node { bool use_mask; vec_t W; // parameters to learn, for vertex v, layer0: D x 16, layer1: 16 x E vec_t Q; // parameters to learn, for vertex v's neighbors, same size as W - vec_t weight_grad; // weight gradient for updating parameters - float_t* d_W; // parameters to learn on device (GPU) + vec_t weight_grad; // weight gradient for updating parameters + float_t* d_W; // parameters to learn on device (GPU) float_t* d_weight_grad; // weight gradient on device (GPU) - vec_t alpha_l; // parameters to learn (H x 1), only used for GAT - vec_t alpha_r; // parameters to learn (H x 1), only used for GAT - vec_t alpha_lgrad; // gradients for updating alpha (GAT only) - vec_t alpha_rgrad; // gradients for updating alpha (GAT only) - mask_t* masks_; // masks to show which samples are valid - mask_t* d_masks_; // masks on device (GPU) - float_t* loss; // error for each vertex: N x 1 + vec_t alpha_l; // parameters to learn (H x 1), only used for GAT + vec_t alpha_r; // parameters to learn (H x 1), only used for GAT + vec_t alpha_lgrad; // gradients for updating alpha (GAT only) + vec_t alpha_rgrad; // gradients for updating alpha (GAT only) + mask_t* masks_; // masks to show which samples are valid + mask_t* d_masks_; // masks on device (GPU) + float_t* loss; // error for each vertex: N x 1 ContextType* context; label_t* labels; - float_t* norm_consts; // normalization score - vec_t scores; // un-normalized scores - vec_t temp_scores; // un-normalized scores - vec_t scores_grad; // gradients of un-normalized scores - vec_t norm_scores; // normalized scores + float_t* norm_consts; // normalization score + vec_t scores; // un-normalized scores + vec_t temp_scores; // un-normalized scores + vec_t scores_grad; // gradients of un-normalized scores + vec_t norm_scores; // normalized scores vec_t norm_scores_grad; // gradients of normalized scores // TODO #ifdef GALOIS_ENABLE_GPU diff --git a/libdeepgalois/include/deepgalois/math_functions.hh b/libdeepgalois/include/deepgalois/math_functions.hh index 38f461620a..e6b5836386 100644 --- a/libdeepgalois/include/deepgalois/math_functions.hh +++ b/libdeepgalois/include/deepgalois/math_functions.hh @@ -81,8 +81,8 @@ void relu_cpu(size_t n, const float_t* in, float_t* out); void d_relu_cpu(size_t n, const float_t* in, const float_t* data, float_t* out); // Leaky ReLU -void leaky_relu(float_t epsilon, float_t in, float_t &out); -void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out); +void leaky_relu(float_t epsilon, float_t in, float_t& out); +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out); void leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, float_t* out); void d_leaky_relu_cpu(size_t n, float_t epsilon, const float_t* in, const float_t* data, float_t* out); @@ -171,8 +171,10 @@ void float_copy_device(int n, float_t* h_ptr, float_t* d_ptr); void uint8_malloc_device(int n, uint8_t*& ptr); void uint8_free_device(uint8_t*& ptr); void uint8_copy_device(int n, uint8_t* h_ptr, uint8_t* d_ptr); -acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* masks, float_t* loss); +acc_t masked_avg_loss_gpu(int begin, int end, int count, mask_t* 
masks, + float_t* loss); acc_t l2_norm_gpu(int n, const float_t* in); void l2_norm_gpu(size_t x, size_t y, const float_t* in, float_t* out); -void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, float_t* out_diff); +void d_l2_norm_gpu(size_t x, size_t y, const float_t* in_data, float_t* in_diff, + float_t* out_diff); #endif diff --git a/libdeepgalois/include/deepgalois/optimizer.h b/libdeepgalois/include/deepgalois/optimizer.h index f5eb4b54ec..694819591c 100644 --- a/libdeepgalois/include/deepgalois/optimizer.h +++ b/libdeepgalois/include/deepgalois/optimizer.h @@ -23,10 +23,10 @@ namespace deepgalois { // usesHessian : true if an optimizer uses hessian (2nd order derivative of loss // function) struct optimizer { - optimizer() = default; - optimizer(const optimizer&) = default; - optimizer(optimizer&&) = default; - optimizer& operator=(const optimizer&) = default; + optimizer() = default; + optimizer(const optimizer&) = default; + optimizer(optimizer&&) = default; + optimizer& operator=(const optimizer&) = default; optimizer& operator=(optimizer&&) = default; virtual ~optimizer() = default; virtual void update(const vec_t& dW, vec_t& W) = 0; diff --git a/libdeepgalois/include/deepgalois/random.h b/libdeepgalois/include/deepgalois/random.h index bf1648bc2a..6e5cb0fe5b 100644 --- a/libdeepgalois/include/deepgalois/random.h +++ b/libdeepgalois/include/deepgalois/random.h @@ -50,4 +50,4 @@ uniform_rand(T min, T max) { std::uniform_real_distribution dst(min, max); return dst(random_generator::get_instance()()); } -} //end of namespace +} // namespace deepgalois diff --git a/libdeepgalois/include/deepgalois/reader.h b/libdeepgalois/include/deepgalois/reader.h index 5e034ec210..c25eeceac2 100644 --- a/libdeepgalois/include/deepgalois/reader.h +++ b/libdeepgalois/include/deepgalois/reader.h @@ -1,6 +1,6 @@ #pragma once #include "deepgalois/lgraph.h" -//#include "galois/DistGalois.h" +// #include "galois/DistGalois.h" namespace deepgalois { class Reader { diff --git a/libdeepgalois/licensenote.txt b/libdeepgalois/licensenote.txt index cf1aeb6caf..d9bf751eac 100644 --- a/libdeepgalois/licensenote.txt +++ b/libdeepgalois/licensenote.txt @@ -33,13 +33,13 @@ committed. LICENSE Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are met: +modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this - list of conditions and the following disclaimer. + list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation - and/or other materials provided with the distribution. + and/or other materials provided with the distribution. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED diff --git a/libdeepgalois/scripts/run-multi.sh b/libdeepgalois/scripts/run-multi.sh index 660fac74b3..da9861fb2e 100755 --- a/libdeepgalois/scripts/run-multi.sh +++ b/libdeepgalois/scripts/run-multi.sh @@ -13,21 +13,21 @@ HIDDENDIM="16 64 128" OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois for GNN in $GNNS; do - for NT in $NTHREADS; do - for GR in $GRAPHS; do - for K in $EPOCHS; do - for DR in $DROPOUT; do - for LR in $LEARNINGRATES; do - for HD in $HIDDENDIM; do - EXEC_DIR=$LONESTARGNN/$GNN - echo $EXEC_DIR - echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" - $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log - echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" - done - done - done - done - done - done + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD -sc=0 &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done done diff --git a/libdeepgalois/scripts/run-single.sh b/libdeepgalois/scripts/run-single.sh index 37a393d788..a6bc223ebd 100755 --- a/libdeepgalois/scripts/run-single.sh +++ b/libdeepgalois/scripts/run-single.sh @@ -13,21 +13,21 @@ HIDDENDIM="16 32 64 128 256 512" OUTDIR=/net/ohm/export/cdgc/cxh/outputs/DeepGalois for GNN in $GNNS; do - for NT in $NTHREADS; do - for GR in $GRAPHS; do - for K in $EPOCHS; do - for DR in $DROPOUT; do - for LR in $LEARNINGRATES; do - for HD in $HIDDENDIM; do - EXEC_DIR=$LONESTARGNN/$GNN - echo $EXEC_DIR - echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" - $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log - echo "Done. Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" - done - done - done - done - done - done + for NT in $NTHREADS; do + for GR in $GRAPHS; do + for K in $EPOCHS; do + for DR in $DROPOUT; do + for LR in $LEARNINGRATES; do + for HD in $HIDDENDIM; do + EXEC_DIR=$LONESTARGNN/$GNN + echo $EXEC_DIR + echo "$EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log" + $EXEC_DIR/$GNN $GR -k=$K -t=$NT -d=$DR -lr=$LR -h=$HD &> $OUTDIR/$GNN-$GR-$K-$DR-$LR-$HD-$NT.log + echo "Done. 
Check out $OUTDIR/$GNN-$GR-$K-$DR-$NT.log" + done + done + done + done + done + done done diff --git a/libdeepgalois/src/DistContext.cu b/libdeepgalois/src/DistContext.cu index b67f0f9125..30704b0748 100644 --- a/libdeepgalois/src/DistContext.cu +++ b/libdeepgalois/src/DistContext.cu @@ -64,9 +64,9 @@ cusparseMatDescr_t DistContext::cusparse_matdescr_ = 0; curandGenerator_t DistContext::curand_generator_ = 0; DistContext::DistContext() : DistContext(true) { - d_labels = NULL; + d_labels = NULL; d_feats = NULL; - d_labels_subg = NULL; + d_labels_subg = NULL; d_feats_subg = NULL; d_normFactors = NULL; d_normFactorsSub = NULL; @@ -110,7 +110,7 @@ size_t DistContext::read_features(std::string dataset_str) { return feat_len; } -size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, +size_t DistContext::read_masks(std::string dataset_str, std::string mask_type, size_t n, size_t& begin, size_t& end, mask_t* masks, DGraph* dGraph) { return reader.read_masks(mask_type, n, begin, end, masks); } @@ -132,7 +132,7 @@ void DistContext::constructSubgraphLabels(size_t m, const mask_t* masks) { for (size_t i = 0; i < this->partitionedGraph->size(); i++) { if (masks[i] == 1) { if (usingSingleClass) h_labels_subg[count] = h_labels[i]; - else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, + else std::copy(h_labels + i * num_classes, h_labels + (i + 1) * num_classes, &h_labels_subg[count * num_classes]); count++; } diff --git a/libdeepgalois/src/Net.cu b/libdeepgalois/src/Net.cu index 2921b81996..ee70e1d578 100644 --- a/libdeepgalois/src/Net.cu +++ b/libdeepgalois/src/Net.cu @@ -191,7 +191,7 @@ void Net::read_test_masks(std::string dataset) { for (size_t i = globalTestBegin; i < globalTestEnd; i++) globalTestMasks[i] = 1; } else { - globalTestCount = distContext->read_masks(dataset, std::string("test"), + globalTestCount = distContext->read_masks(dataset, std::string("test"), globalSamples, globalTestBegin, globalTestEnd, globalTestMasks, NULL); } //copy_test_masks_to_device(); diff --git a/libdeepgalois/src/layers/gat_fw.h b/libdeepgalois/src/layers/gat_fw.h index e9a7bada37..d57f485a8c 100644 --- a/libdeepgalois/src/layers/gat_fw.h +++ b/libdeepgalois/src/layers/gat_fw.h @@ -1,6 +1,6 @@ -//#define USE_GAT +// #define USE_GAT #ifdef USE_GAT -// `Graph Attention Network ` +// `Graph Attention Network ` // NOTE: GAT paper uses "first concatenation then linear projection" // to compute attention scores, while ours is "first projection then // addition", the two approaches are mathematically equivalent: @@ -10,7 +10,7 @@ // save [Wh_i || Wh_j] on edges, which is not memory-efficient. Plus, // addition could be optimized with DGL's built-in function u_add_v, // which further speeds up computation and saves memory footprint. 
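// A brief sketch of that equivalence (assuming alpha is the concatenation
// [alpha_l || alpha_r], with alpha_l and alpha_r each of length `len`):
//   e_ij = alpha^T [W*h_i || W*h_j] = alpha_l^T (W*h_i) + alpha_r^T (W*h_j)
// so each per-edge score decomposes into two per-vertex dot products
// (`src_score` and `dst_score` in the code below), and the concatenated
// vector [W*h_i || W*h_j] never has to be materialized on every edge.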
- + void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, float_t* out) { size_t n = g.size(); @@ -19,34 +19,34 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, auto deg_src = g.get_degree(src); // concatenation, dot product, LeakyReLU - //int i = 0; - //vec_t scores(deg_src); + // int i = 0; + // vec_t scores(deg_src); auto begin = g.edge_begin(src); - auto end = g.edge_end(src); + auto end = g.edge_end(src); // alpha: learnable weight vector (shared by all vertices) float_t src_score = math::dot(len, &alpha_l[0], &in[src_idx]); for (auto e = begin; e != end; e++) { - auto dst = g.getEdgeDst(e); + auto dst = g.getEdgeDst(e); auto dst_idx = dst * len; - //vec_t concat_vec(2*len); - //math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); - //float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); + // vec_t concat_vec(2*len); + // math::concat(len, &in[src_idx], &in[dst_idx], &concat_vec[0]); + // float_t score = math::dot(2*len, &alpha[0], &concat_vec[0]); float_t dst_score = math::dot(len, &alpha_r[0], &in[dst_idx]); - temp_scores[e] = src_score + dst_score; + temp_scores[e] = src_score + dst_score; math::leaky_relu(epsilon, temp_scores[e], scores[e]); } // softmax to normalize the attention scores on each vertexโ€™s incoming edges - //vec_t normalized_scores(deg_src, 0); - //math::softmax(deg_src, &scores[0], &normalized_scores[0]); + // vec_t normalized_scores(deg_src, 0); + // math::softmax(deg_src, &scores[0], &normalized_scores[0]); math::softmax(deg_src, &scores[begin], &norm_scores[begin]); // aggregation: scaled by the attention scores math::clear_cpu(len, &out[src_idx]); for (auto e = begin; e != end; e++) { - auto dst = g.getEdgeDst(e); + auto dst = g.getEdgeDst(e); auto dst_idx = dst * len; - auto score = norm_scores[e]; + auto score = norm_scores[e]; vec_t neighbor(len); math::scale(len, score, &in[dst_idx], &neighbor[0]); math::vadd_cpu(len, &out[src_idx], &neighbor[0], &out[src_idx]); @@ -55,47 +55,48 @@ void graph_conv_layer::aggregate(size_t len, Graph& g, const float_t* in, } void graph_conv_layer::d_compute_scores(size_t len, Graph& g, - const float_t* in_data, - const float_t *out_data, + const float_t* in_data, + const float_t* out_data, const float_t* in_grad) { size_t n = g.size(); // compute gradients for the learnable vector `alpha` - //vec_t temp_grad(n*n); - //math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, + // vec_t temp_grad(n*n); + // math::sgemm_cpu(CblasTrans, CblasNoTrans, n, len, n, 1.0, out_data, // in_grad, 0.0, temp_grad); galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto begin = g.edge_begin(src); - auto end = g.edge_end(src); + auto begin = g.edge_begin(src); + auto end = g.edge_end(src); auto deg_src = g.get_degree(src); - math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], + math::d_softmax(deg_src, &scores[begin], &norm_scores[begin], &scores_grad[begin], &norm_scores_grad[begin]); for (auto e = begin; e != end; e++) { auto dst = g.getEdgeDst(e); - // use norm_scores_grad as temp_scores_grad since its data is useless already - math::d_leaky_relu(epsilon, &scores_grad[e], - &temp_scores[e], &norm_scores_grad[e]); + // use norm_scores_grad as temp_scores_grad since its data is useless + // already + math::d_leaky_relu(epsilon, &scores_grad[e], &temp_scores[e], + &norm_scores_grad[e]); math::scale(len, norm_scores_grad[e], &in_data[src_idx], &alpha_lgrad[0]); math::scale(len, norm_scores_grad[e], &in_data[dst_idx], &alpha_rgrad[0]); } 
}); } -void graph_conv_layer::d_aggregate(size_t len, Graph& g, - const float_t* in_grad, float_t* out_grad) { +void graph_conv_layer::d_aggregate(size_t len, Graph& g, const float_t* in_grad, + float_t* out_grad) { size_t n = g.size(); // aggregation: the derivative is transposed; - // the graph is undirected (structurally symmetric), + // the graph is undirected (structurally symmetric), // but values are not the same for the symmetric positions galois::do_all(galois::iterate(size_t(0), n), [&](const auto src) { - auto src_idx = src * len; + auto src_idx = src * len; auto src_begin = g.edge_begin(src); for (auto e = src_begin; e != g.edge_end(src); e++) { - auto dst = g.getEdgeDst(e); - auto dst_idx = dst * len; + auto dst = g.getEdgeDst(e); + auto dst_idx = dst * len; auto dst_begin = g.edge_begin(dst); - auto score = norm_scores[dst_begin+e-src_begin]; // transposed + auto score = norm_scores[dst_begin + e - src_begin]; // transposed vec_t neighbor(len); math::scale(len, score, &in_grad[dst_idx], &neighbor[0]); math::vadd_cpu(len, &out_grad[src_idx], &neighbor[0], &out_grad[src_idx]); @@ -113,8 +114,8 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // dropout if (dropout_ && phase_ == net_phase::train) { - math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, - dropout_mask, in_temp); + math::dropout_cpu(x, y, scale_, dropout_rate_, in_data, dropout_mask, + in_temp); } else { math::copy_cpu(x * y, in_data, in_temp); } @@ -125,9 +126,10 @@ void graph_conv_layer::forward_propagation(const float_t* in_data, // aggregation aggregate(z, *graph_cpu, out_temp, out_data); - + // ReLU - if (act_) math::relu_cpu(x * z, out_data, out_data); + if (act_) + math::relu_cpu(x * z, out_data, out_data); } void graph_conv_layer::back_propagation(const float_t* in_data, @@ -136,7 +138,8 @@ void graph_conv_layer::back_propagation(const float_t* in_data, size_t x = input_dims[0]; size_t y = input_dims[1]; size_t z = output_dims[1]; - if (act_) math::d_relu_cpu(x * z, out_grad, out_data, out_grad); + if (act_) + math::d_relu_cpu(x * z, out_grad, out_data, out_grad); // compute gradients for alpha (alpha is a learnable vector) d_compute_scores(z, *graph_cpu, in_temp, out_temp, out_grad); diff --git a/libdeepgalois/src/layers/graph_conv_layer.cpp b/libdeepgalois/src/layers/graph_conv_layer.cpp index da9b01dbae..f13b26be25 100644 --- a/libdeepgalois/src/layers/graph_conv_layer.cpp +++ b/libdeepgalois/src/layers/graph_conv_layer.cpp @@ -56,7 +56,7 @@ void graph_conv_layer::malloc_and_init() { // make sure seed consistent across all hosts for weight matrix rand_init_matrix(y, z, W, 1); - //rand_init_matrix(y, z, Q, 1); // for GraphSAGE + // rand_init_matrix(y, z, Q, 1); // for GraphSAGE zero_init_matrix(y, z, layer::weight_grad); @@ -64,12 +64,12 @@ void graph_conv_layer::malloc_and_init() { // alpha is only used for GAT rand_init_matrix(z, 1, alpha_l, 1); rand_init_matrix(z, 1, alpha_r, 1); - alpha_lgrad.resize(2*z); - alpha_rgrad.resize(2*z); + alpha_lgrad.resize(2 * z); + alpha_rgrad.resize(2 * z); std::fill(alpha_lgrad.begin(), alpha_lgrad.end(), 0); std::fill(alpha_rgrad.begin(), alpha_rgrad.end(), 0); auto ne = graph_cpu->sizeEdges(); // number of edges - scores.resize(ne); // a score for each edge + scores.resize(ne); // a score for each edge temp_scores.resize(ne); scores_grad.resize(ne); norm_scores.resize(ne); @@ -77,7 +77,7 @@ void graph_conv_layer::malloc_and_init() { epsilon = 0.2; // LeakyReLU angle of negative slope #endif dropout_ = true; - act_ = false; + act_ = false; if 
(dropout_) dropout_mask = new mask_t[x * y]; @@ -233,7 +233,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, // at this point, out_temp has the derivative of data from last step to // use for both updating gradients for features and gradients for weights // this calculates gradients for the node predictions - if (level_ != 0) {// no need to calculate in_grad for the first layer + if (level_ != 0) { // no need to calculate in_grad for the first layer // derivative of matmul needs transposed matrix math::sgemm_cpu(CblasNoTrans, CblasTrans, x, y, z, 1.0, out_temp, &W[0], 0.0, in_grad); // x*z; z*y -> x*y @@ -254,7 +254,7 @@ void graph_conv_layer::back_propagation(const float_t* in_data, compute_timer.stop(); // sync agg - //galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); + // galois::gPrint(header, "x is ", x, " y is ", y, " z is ", z, "\n"); if (level_ != 0) { deepgalois::_syncVectorSize = y; deepgalois::_dataToSync = in_grad; @@ -275,14 +275,13 @@ void graph_conv_layer::back_propagation(const float_t* in_data, unsigned host_num = galois::runtime::getSystemNetworkInterface().Num; layer::syncSub->sync("Gradients"); galois::do_all( - galois::iterate((size_t)0, (size_t)z), - [&] (size_t i) { - //galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); - layer::weight_grad[i] /= host_num; - //galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); - }, - galois::loopname("sync post process") - ); + galois::iterate((size_t)0, (size_t)z), + [&](size_t i) { + // galois::gPrint("before ", i, " ", layer::weight_grad[i], "\n"); + layer::weight_grad[i] /= host_num; + // galois::gPrint("after ", i, " ", layer::weight_grad[i], "\n"); + }, + galois::loopname("sync post process")); galois::gDebug("[", layer::gradientGraph->myHostID(), "] Sync done"); conv_timer.stop(); diff --git a/libdeepgalois/src/math_functions.cpp b/libdeepgalois/src/math_functions.cpp index aed0ac79b9..b8addfe887 100644 --- a/libdeepgalois/src/math_functions.cpp +++ b/libdeepgalois/src/math_functions.cpp @@ -178,7 +178,7 @@ float_t dot(size_t n, const float_t* x, const float_t* y) { // concatenation of two vectors into one void concat(size_t n, const float_t* x, const float_t* y, float_t* z) { copy_cpu(n, x, z); - copy_cpu(n, y, z+n); + copy_cpu(n, y, z + n); } void clear_cpu(size_t n, float_t* in) { @@ -244,11 +244,11 @@ void d_relu_cpu(size_t n, const float_t* in, const float_t* data, galois::chunk_size<64>(), galois::loopname("d_relu")); } -void leaky_relu(float_t epsilon, float_t in, float_t &out) { +void leaky_relu(float_t epsilon, float_t in, float_t& out) { out = in > 0.0 ? in : epsilon * in; } -void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t &out) { +void d_leaky_relu(float_t epsilon, float_t in, float_t data, float_t& out) { out = in * (data > 0.0 ? 
1.0 : epsilon); } diff --git a/libdeepgalois/src/math_functions.cu b/libdeepgalois/src/math_functions.cu index 9a7c4bc1dd..b9f7686867 100644 --- a/libdeepgalois/src/math_functions.cu +++ b/libdeepgalois/src/math_functions.cu @@ -234,9 +234,9 @@ void csrmm_gpu(const int M, const int N, const int K, const int nnz, const int* A_nnz_idx, const float* B, const float beta, float* transpose_C, float* C) { //std::cout << "[debug] csrmm_gpu m=" << M << ", n=" << N << ", k=" << K << ", nnz=" << nnz << "\n"; - CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), + CUSPARSE_CHECK(cusparseScsrmm2(deepgalois::DistContext::cusparse_handle(), CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_TRANSPOSE, - M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), + M, N, K, nnz, &alpha, deepgalois::DistContext::cusparse_matdescr(), A_nonzeros, A_idx_ptr, A_nnz_idx, B, N, &beta, transpose_C, M)); // transpose C const float one = 1.0; diff --git a/libdeepgalois/src/utils.cpp b/libdeepgalois/src/utils.cpp index 61ff3a2e58..1b237ff7c3 100644 --- a/libdeepgalois/src/utils.cpp +++ b/libdeepgalois/src/utils.cpp @@ -109,9 +109,9 @@ acc_t masked_f1_score(size_t begin, size_t end, size_t, mask_t* masks, double precision_mic = tp_accum + fp_accum > 0 ? (double)tp_accum / (double)(tp_accum + fp_accum) : 0.; - double recall_mic = tp_accum + fn_accum > 0 - ? (double)tp_accum / (double)(tp_accum + fn_accum) - : 0.; + double recall_mic = tp_accum + fn_accum > 0 + ? (double)tp_accum / (double)(tp_accum + fn_accum) + : 0.; double f1_micro = recall_mic + precision_mic > 0. ? 2. * (recall_mic * precision_mic) / (recall_mic + precision_mic) diff --git a/libdist/CMakeLists.txt b/libdist/CMakeLists.txt index 57e6aa1750..2930d37cbf 100644 --- a/libdist/CMakeLists.txt +++ b/libdist/CMakeLists.txt @@ -49,7 +49,7 @@ if (GALOIS_USE_LCI) add_dependencies(galois_dist_async lci) target_link_libraries(galois_dist_async PRIVATE ${LCI_LIBRARY} -lpsm2) - target_include_directories(galois_dist_async PUBLIC + target_include_directories(galois_dist_async PUBLIC $ $ ) diff --git a/libdist/include/galois/DistGalois.h b/libdist/include/galois/DistGalois.h index b87c539f3e..e39f311470 100644 --- a/libdist/include/galois/DistGalois.h +++ b/libdist/include/galois/DistGalois.h @@ -44,10 +44,10 @@ class DistMemSys : public runtime::SharedMem { ~DistMemSys(); - DistMemSys(const DistMemSys&) = delete; + DistMemSys(const DistMemSys&) = delete; DistMemSys& operator=(const DistMemSys&) = delete; - DistMemSys(DistMemSys&&) = delete; + DistMemSys(DistMemSys&&) = delete; DistMemSys& operator=(DistMemSys&&) = delete; }; diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index bfd25c3cf3..a7b83174b7 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -1055,9 +1055,10 @@ inline void gDeserialize(DeSerializeBuffer&) {} * @param data Object to save data in the iterator type into */ template -auto gDeserializeRaw(Iter iter, T& data) -> decltype( - std::declval::value>::type>(), - Iter()) { +auto gDeserializeRaw(Iter iter, T& data) + -> decltype(std::declval::value>::type>(), + Iter()) { unsigned char* pdata = (unsigned char*)&data; for (size_t i = 0; i < sizeof(T); ++i) pdata[i] = *iter++; diff --git a/libgalois/CMakeLists.txt b/libgalois/CMakeLists.txt index 76161160f6..4721bc0261 100644 --- a/libgalois/CMakeLists.txt +++ b/libgalois/CMakeLists.txt @@ -10,7 +10,7 @@ set(sources "${CMAKE_CURRENT_BINARY_DIR}/Version.cpp" 
src/Barrier_Counting.cpp src/Barrier.cpp - src/Barrier_Dissemination.cpp + src/Barrier_Dissemination.cpp src/Barrier_MCS.cpp src/Barrier_Pthread.cpp src/Barrier_Simple.cpp diff --git a/libgalois/include/galois/Bag.h b/libgalois/include/galois/Bag.h index 6592bec529..985fdffcb7 100644 --- a/libgalois/include/galois/Bag.h +++ b/libgalois/include/galois/Bag.h @@ -212,7 +212,7 @@ class InsertBag { return *this; } - InsertBag(const InsertBag&) = delete; + InsertBag(const InsertBag&) = delete; InsertBag& operator=(const InsertBag&) = delete; ~InsertBag() { destruct_parallel(); } diff --git a/libgalois/include/galois/FixedSizeRing.h b/libgalois/include/galois/FixedSizeRing.h index 51e1466011..e1d7896781 100644 --- a/libgalois/include/galois/FixedSizeRing.h +++ b/libgalois/include/galois/FixedSizeRing.h @@ -67,7 +67,7 @@ class FixedSizeBagBase { } } - FixedSizeBagBase(const FixedSizeBagBase& o) = delete; + FixedSizeBagBase(const FixedSizeBagBase& o) = delete; FixedSizeBagBase& operator=(const FixedSizeBagBase& o) = delete; ~FixedSizeBagBase() { clear(); } @@ -284,7 +284,7 @@ class FixedSizeRing { } } - FixedSizeRing(const FixedSizeRing& o) = delete; + FixedSizeRing(const FixedSizeRing& o) = delete; FixedSizeRing& operator=(const FixedSizeRing& o) = delete; ~FixedSizeRing() { clear(); } diff --git a/libgalois/include/galois/LargeArray.h b/libgalois/include/galois/LargeArray.h index 71df3036ff..fe2e99c364 100644 --- a/libgalois/include/galois/LargeArray.h +++ b/libgalois/include/galois/LargeArray.h @@ -172,7 +172,7 @@ class LargeArray { return *this; } - LargeArray(const LargeArray&) = delete; + LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; ~LargeArray() { @@ -305,8 +305,8 @@ class LargeArray { public: LargeArray(void*, size_t) {} - LargeArray() = default; - LargeArray(const LargeArray&) = delete; + LargeArray() = default; + LargeArray(const LargeArray&) = delete; LargeArray& operator=(const LargeArray&) = delete; friend void swap(LargeArray&, LargeArray&) {} diff --git a/libgalois/include/galois/ParallelSTL.h b/libgalois/include/galois/ParallelSTL.h index 4158a6dc5c..f5878686a9 100644 --- a/libgalois/include/galois/ParallelSTL.h +++ b/libgalois/include/galois/ParallelSTL.h @@ -119,7 +119,7 @@ struct sort_helper { RandomAccessIterator pivot = choose_rand(bounds.first, bounds.second); VT pv = *pivot; pivot = std::partition(bounds.first, bounds.second, - std::bind(comp, std::placeholders::_1, pv)); + std::bind(comp, std::placeholders::_1, pv)); // push the lower bit if (bounds.first != pivot) ctx.push(std::make_pair(bounds.first, pivot)); @@ -209,7 +209,7 @@ struct partition_helper { RP high, low; do { RP parts = dual_partition(low.first, low.second, high.first, high.second, - state->pred); + state->pred); low.first = parts.first; high.second = parts.second; if (low.first == low.second) diff --git a/libgalois/include/galois/SharedMemSys.h b/libgalois/include/galois/SharedMemSys.h index 8177a2283a..52459032d1 100644 --- a/libgalois/include/galois/SharedMemSys.h +++ b/libgalois/include/galois/SharedMemSys.h @@ -16,10 +16,10 @@ class SharedMemSys : public runtime::SharedMem { explicit SharedMemSys(); ~SharedMemSys(); - SharedMemSys(const SharedMemSys&) = delete; + SharedMemSys(const SharedMemSys&) = delete; SharedMemSys& operator=(const SharedMemSys&) = delete; - SharedMemSys(SharedMemSys&&) = delete; + SharedMemSys(SharedMemSys&&) = delete; SharedMemSys& operator=(SharedMemSys&&) = delete; }; diff --git a/libgalois/include/galois/Timer.h 
b/libgalois/include/galois/Timer.h index f12c41c6b0..51ab492ff4 100644 --- a/libgalois/include/galois/Timer.h +++ b/libgalois/include/galois/Timer.h @@ -72,10 +72,10 @@ class StatTimer : public TimeAccumulator { StatTimer() : StatTimer(nullptr, nullptr) {} - StatTimer(const StatTimer&) = delete; - StatTimer(StatTimer&&) = delete; + StatTimer(const StatTimer&) = delete; + StatTimer(StatTimer&&) = delete; StatTimer& operator=(const StatTimer&) = delete; - StatTimer& operator=(StatTimer&&) = delete; + StatTimer& operator=(StatTimer&&) = delete; ~StatTimer(); diff --git a/libgalois/include/galois/gdeque.h b/libgalois/include/galois/gdeque.h index 737f989107..19830c0309 100644 --- a/libgalois/include/galois/gdeque.h +++ b/libgalois/include/galois/gdeque.h @@ -35,7 +35,7 @@ namespace galois { // Experimental random access iterator. Slower than old iterator for simple // traversals, so disable for now -//#define _NEW_ITERATOR +// #define _NEW_ITERATOR //! Like std::deque but use Galois memory management functionality template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -216,7 +216,7 @@ class LC_CSR_Graph : template ::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edgeData.set(*nn, {}); } diff --git a/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h b/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h index 7c76391a46..d2ba3aad6f 100644 --- a/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h +++ b/libgalois/include/galois/graphs/LC_CSR_Hypergraph.h @@ -190,13 +190,13 @@ class LC_CSR_Hypergraph : template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -217,7 +217,7 @@ class LC_CSR_Hypergraph : template ::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator nn, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edgeData.set(*nn, {}); } @@ -269,8 +269,8 @@ class LC_CSR_Hypergraph : BOOST_SERIALIZATION_SPLIT_MEMBER() public: - LC_CSR_Hypergraph(LC_CSR_Hypergraph&& rhs) = default; - LC_CSR_Hypergraph() = default; + LC_CSR_Hypergraph(LC_CSR_Hypergraph&& rhs) = default; + LC_CSR_Hypergraph() = default; LC_CSR_Hypergraph& operator=(LC_CSR_Hypergraph&&) = default; /** diff --git a/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h b/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h index c0d8021167..f3db63a7fe 100644 --- a/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h +++ b/libgalois/include/galois/graphs/LC_InlineEdge_Graph.h @@ -186,13 +186,13 @@ class LC_InlineEdge_Graph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } template void acquireNode(GraphNode N, MethodFlag 
mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -220,7 +220,7 @@ class LC_InlineEdge_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, EdgeInfo* edge, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edge->construct(); } diff --git a/libgalois/include/galois/graphs/LC_Linear_Graph.h b/libgalois/include/galois/graphs/LC_Linear_Graph.h index a884bfc91b..f92a0a77de 100644 --- a/libgalois/include/galois/graphs/LC_Linear_Graph.h +++ b/libgalois/include/galois/graphs/LC_Linear_Graph.h @@ -163,13 +163,13 @@ class LC_Linear_Graph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -195,7 +195,7 @@ class LC_Linear_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, EdgeInfo* edge, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { edge->construct(); } diff --git a/libgalois/include/galois/graphs/LC_Morph_Graph.h b/libgalois/include/galois/graphs/LC_Morph_Graph.h index 78cf28b9ae..fdc02c468e 100644 --- a/libgalois/include/galois/graphs/LC_Morph_Graph.h +++ b/libgalois/include/galois/graphs/LC_Morph_Graph.h @@ -240,7 +240,7 @@ class LC_Morph_Graph */ template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(N, mflag); } @@ -254,7 +254,7 @@ class LC_Morph_Graph */ template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(getId(N), mflag); } @@ -288,7 +288,7 @@ class LC_Morph_Graph bool _A2 = LargeArray::has_value> void constructEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { addMultiEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/MorphGraph.h b/libgalois/include/galois/graphs/MorphGraph.h index a52d9dd676..7a7b89bef6 100644 --- a/libgalois/include/galois/graphs/MorphGraph.h +++ b/libgalois/include/galois/graphs/MorphGraph.h @@ -609,9 +609,9 @@ class MorphGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -633,7 +633,7 @@ class MorphGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? 
true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -702,7 +702,7 @@ class MorphGraph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/MorphHyperGraph.h b/libgalois/include/galois/graphs/MorphHyperGraph.h index 1dae113408..f408d9fa9b 100644 --- a/libgalois/include/galois/graphs/MorphHyperGraph.h +++ b/libgalois/include/galois/graphs/MorphHyperGraph.h @@ -620,9 +620,9 @@ class MorphHyperGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -644,7 +644,7 @@ class MorphHyperGraph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -713,7 +713,7 @@ class MorphHyperGraph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h b/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h index db19218240..86b811a914 100644 --- a/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h +++ b/libgalois/include/galois/graphs/Morph_SepInOut_Graph.h @@ -542,9 +542,9 @@ class Morph_SepInOut_Graph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdgeWithReuse(src, e, Directional ? true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdgeWithReuse(dst, e, false, - std::forward(args)...); + std::forward(args)...); } } return boost::make_filter_iterator(is_out_edge(), ii, src->end()); @@ -565,7 +565,7 @@ class Morph_SepInOut_Graph : private boost::noncopyable { dst->acquire(mflag); EdgeTy* e = edgesF.mkEdge(std::forward(args)...); ii = dst->createEdge(src, e, Directional ? 
true : false, - std::forward(args)...); + std::forward(args)...); ii = src->createEdge(dst, e, false, std::forward(args)...); } } @@ -634,7 +634,7 @@ class Morph_SepInOut_Graph : private boost::noncopyable { EdgeTy* constructOutEdgeValue(FileGraph&, typename FileGraph::edge_iterator, GraphNode src, GraphNode dst, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { return createOutEdge(src, dst, galois::MethodFlag::UNPROTECTED); } diff --git a/libgalois/include/galois/graphs/OCGraph.h b/libgalois/include/galois/graphs/OCGraph.h index 02cb9afd9e..5e1c2d7c26 100644 --- a/libgalois/include/galois/graphs/OCGraph.h +++ b/libgalois/include/galois/graphs/OCGraph.h @@ -394,13 +394,13 @@ class OCImmutableEdgeGraph template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if::type* = 0) { + typename std::enable_if::type* = 0) { galois::runtime::acquire(&nodeData[N], mflag); } template void acquireNode(GraphNode N, MethodFlag mflag, - typename std::enable_if<_A1 && !_A2>::type* = 0) { + typename std::enable_if<_A1&& !_A2>::type* = 0) { this->outOfLineAcquire(idFromNode(N), mflag); } diff --git a/libgalois/include/galois/gslist.h b/libgalois/include/galois/gslist.h index dd7fcc8c70..ca8a372515 100644 --- a/libgalois/include/galois/gslist.h +++ b/libgalois/include/galois/gslist.h @@ -174,7 +174,7 @@ class gslist_base { gslist_base() : first(0) {} - gslist_base(const gslist_base&) = delete; + gslist_base(const gslist_base&) = delete; gslist_base& operator=(const gslist_base&) = delete; gslist_base(gslist_base&& other) : first(0) { *this = std::move(other); } diff --git a/libgalois/include/galois/runtime/Executor_ForEach.h b/libgalois/include/galois/runtime/Executor_ForEach.h index 5b40cb6e48..ff17133009 100644 --- a/libgalois/include/galois/runtime/Executor_ForEach.h +++ b/libgalois/include/galois/runtime/Executor_ForEach.h @@ -383,9 +383,10 @@ class ForEachExecutor { public: ForEachExecutor(FunctionTy f, const ArgsTy& args) - : ForEachExecutor(T1{}, f, args, get_trait_value(args).args, - std::make_index_sequence(args).args)>::value>{}) {} + : ForEachExecutor( + T1{}, f, args, get_trait_value(args).args, + std::make_index_sequence(args).args)>::value>{}) {} template void init(const RangeTy&) {} @@ -416,8 +417,10 @@ class ForEachExecutor { }; template -constexpr auto has_with_iterator(int) -> decltype( - std::declval::type>(), bool()) { +constexpr auto has_with_iterator(int) + -> decltype(std::declval< + typename WLTy::template with_iterator::type>(), + bool()) { return true; } diff --git a/libgalois/include/galois/runtime/Mem.h b/libgalois/include/galois/runtime/Mem.h index 994f27ac25..3d2a353f3c 100644 --- a/libgalois/include/galois/runtime/Mem.h +++ b/libgalois/include/galois/runtime/Mem.h @@ -554,8 +554,8 @@ class StaticSingleInstance : private boost::noncopyable { // std::unique_ptr(); template -substrate::PtrLock - StaticSingleInstance::ptr = substrate::PtrLock(); +substrate::PtrLock StaticSingleInstance::ptr = + substrate::PtrLock(); class PageHeap : public StaticSingleInstance { diff --git a/libgalois/include/galois/runtime/Range.h b/libgalois/include/galois/runtime/Range.h index 01632dcd3a..135686e83f 100644 --- a/libgalois/include/galois/runtime/Range.h +++ b/libgalois/include/galois/runtime/Range.h @@ -303,8 +303,8 @@ template class HasLocalIter { template - using CallExprType = typename std::remove_reference().local_begin())>::type; + using CallExprType = typename std::remove_reference< + 
decltype(std::declval().local_begin())>::type; template static std::true_type go(typename std::add_pointer>::type); diff --git a/libgalois/include/galois/runtime/SharedMem.h b/libgalois/include/galois/runtime/SharedMem.h index 34d847d6ed..7389502bd9 100644 --- a/libgalois/include/galois/runtime/SharedMem.h +++ b/libgalois/include/galois/runtime/SharedMem.h @@ -46,10 +46,10 @@ class SharedMem : public galois::substrate::SharedMem { internal::setPagePoolState(nullptr); } - SharedMem(const SharedMem&) = delete; + SharedMem(const SharedMem&) = delete; SharedMem& operator=(const SharedMem&) = delete; - SharedMem(SharedMem&&) = delete; + SharedMem(SharedMem&&) = delete; SharedMem& operator=(SharedMem&&) = delete; }; diff --git a/libgalois/include/galois/runtime/ThreadTimer.h b/libgalois/include/galois/runtime/ThreadTimer.h index 86ae77389d..e75ba51efd 100644 --- a/libgalois/include/galois/runtime/ThreadTimer.h +++ b/libgalois/include/galois/runtime/ThreadTimer.h @@ -49,10 +49,10 @@ class PerThreadTimer : private ThreadTimers { PerThreadTimer(const char* const region, const char* const category) : region_(region), category_(category) {} - PerThreadTimer(const PerThreadTimer&) = delete; - PerThreadTimer(PerThreadTimer&&) = delete; + PerThreadTimer(const PerThreadTimer&) = delete; + PerThreadTimer(PerThreadTimer&&) = delete; PerThreadTimer& operator=(const PerThreadTimer&) = delete; - PerThreadTimer& operator=(PerThreadTimer&&) = delete; + PerThreadTimer& operator=(PerThreadTimer&&) = delete; ~PerThreadTimer() { reportTimes(); } @@ -67,10 +67,10 @@ class PerThreadTimer { public: PerThreadTimer(const char* const, const char* const) {} - PerThreadTimer(const PerThreadTimer&) = delete; - PerThreadTimer(PerThreadTimer&&) = delete; + PerThreadTimer(const PerThreadTimer&) = delete; + PerThreadTimer(PerThreadTimer&&) = delete; PerThreadTimer& operator=(const PerThreadTimer&) = delete; - PerThreadTimer& operator=(PerThreadTimer&&) = delete; + PerThreadTimer& operator=(PerThreadTimer&&) = delete; ~PerThreadTimer() = default; diff --git a/libgalois/include/galois/substrate/PerThreadStorage.h b/libgalois/include/galois/substrate/PerThreadStorage.h index b4a6140dd4..fc43055853 100644 --- a/libgalois/include/galois/substrate/PerThreadStorage.h +++ b/libgalois/include/galois/substrate/PerThreadStorage.h @@ -55,7 +55,7 @@ class PerBackend { public: PerBackend(); - PerBackend(const PerBackend&) = delete; + PerBackend(const PerBackend&) = delete; PerBackend& operator=(const PerBackend&) = delete; ~PerBackend() { @@ -194,7 +194,7 @@ class PerSocketStorage { return *this; } - PerSocketStorage(const PerSocketStorage&) = delete; + PerSocketStorage(const PerSocketStorage&) = delete; PerSocketStorage& operator=(const PerSocketStorage&) = delete; ~PerSocketStorage() { destruct(); } diff --git a/libgalois/include/galois/substrate/SharedMem.h b/libgalois/include/galois/substrate/SharedMem.h index 1c809b52ad..e8a6fe58a4 100644 --- a/libgalois/include/galois/substrate/SharedMem.h +++ b/libgalois/include/galois/substrate/SharedMem.h @@ -48,10 +48,10 @@ class SharedMem { */ ~SharedMem(); - SharedMem(const SharedMem&) = delete; + SharedMem(const SharedMem&) = delete; SharedMem& operator=(const SharedMem&) = delete; - SharedMem(SharedMem&&) = delete; + SharedMem(SharedMem&&) = delete; SharedMem& operator=(SharedMem&&) = delete; }; diff --git a/libgalois/include/galois/substrate/ThreadPool.h b/libgalois/include/galois/substrate/ThreadPool.h index 4158b87321..1ed295d8a0 100644 --- a/libgalois/include/galois/substrate/ThreadPool.h 
+++ b/libgalois/include/galois/substrate/ThreadPool.h @@ -130,10 +130,10 @@ class ThreadPool { public: ~ThreadPool(); - ThreadPool(const ThreadPool&) = delete; + ThreadPool(const ThreadPool&) = delete; ThreadPool& operator=(const ThreadPool&) = delete; - ThreadPool(ThreadPool&&) = delete; + ThreadPool(ThreadPool&&) = delete; ThreadPool& operator=(ThreadPool&&) = delete; //! execute work on all threads diff --git a/libgalois/include/galois/worklists/AdaptiveObim.h b/libgalois/include/galois/worklists/AdaptiveObim.h index 79223cf628..758af8582f 100644 --- a/libgalois/include/galois/worklists/AdaptiveObim.h +++ b/libgalois/include/galois/worklists/AdaptiveObim.h @@ -402,7 +402,7 @@ struct AdaptiveOrderedByIntegerMetric double diff = ((p.maxPrio >> delta) - (p.minPrio >> delta)) >= 1 ? ((p.maxPrio >> delta) - (p.minPrio >> delta)) : 1; - double xx = 16 / diff; + double xx = 16 / diff; if (delta > (unsigned int)(std::floor(std::log2(xx)))) delta -= (unsigned int)(std::floor(std::log2(xx))); else diff --git a/libgalois/include/galois/worklists/Chunk.h b/libgalois/include/galois/worklists/Chunk.h index cf6d697e6a..17398e9ff7 100644 --- a/libgalois/include/galois/worklists/Chunk.h +++ b/libgalois/include/galois/worklists/Chunk.h @@ -145,8 +145,8 @@ struct ChunkMaster { public: typedef T value_type; - ChunkMaster() = default; - ChunkMaster(const ChunkMaster&) = delete; + ChunkMaster() = default; + ChunkMaster(const ChunkMaster&) = delete; ChunkMaster& operator=(const ChunkMaster&) = delete; void flush() { diff --git a/libgalois/include/galois/worklists/WorkList.h b/libgalois/include/galois/worklists/WorkList.h index fab4e80e2a..25eb900785 100644 --- a/libgalois/include/galois/worklists/WorkList.h +++ b/libgalois/include/galois/worklists/WorkList.h @@ -56,7 +56,7 @@ namespace { // don't pollute the symbol table with the example // All classes (should) conform to: template class AbstractWorkList { - AbstractWorkList(const AbstractWorkList&) = delete; + AbstractWorkList(const AbstractWorkList&) = delete; const AbstractWorkList& operator=(const AbstractWorkList&) = delete; public: diff --git a/libgalois/include/shad/DataTypes.h b/libgalois/include/shad/DataTypes.h index 84dc770bee..253d064cbf 100644 --- a/libgalois/include/shad/DataTypes.h +++ b/libgalois/include/shad/DataTypes.h @@ -35,237 +35,245 @@ #include #include - namespace shad { /// @brief Data conversion utilities. -/// +/// /// Please refer to methods specialization to check /// which data types are supported. namespace data_types { - /// @brief Enumeration of supported data types. - /// - /// The enumeration is meant to be used when parsing data - /// (i.e. type information is not known at compile time). 
- enum data_t { - STRING = 0, // string support is currenlty limited - CHARS, // sequence of characters - UINT, // unsigned, binds by default to uint64_t - INT, // int, binds by default to int64_t - FLOAT, // float, binds by default to float - DOUBLE, // double, binds by default to double - BOOL, // bool, binds by default to bool - DATE, // date in "%y-%m-%d" format, binds by default to time_t - USDATE, // date in "%m/%d/%y" format, binds by default to time_t - DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, - // binds by default to time_t - IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t - LIST_UINT, // Sequence of unsigneds, support currently limited - LIST_INT, // Sequence of integers, support currently limited - LIST_DOUBLE, // Sequence of doubles, support currently limited - NONE - }; - - /// @brief Data structures for storing schema information. - /// Given a tuple of data, it associates elements labels and data types - /// to their position in the tuple. - using schema_t = std::vector>; - - /// @brief Encoded null value. - /// @tparam ENC_t encoding type. - /// @return Encoded null value for ENC_t. - template - constexpr ENC_t kNullValue = ENC_t(); - - /// @brief Encoded null value for uint64_t. - /// @return Null encoded value for uint64_t. - template <> - constexpr uint64_t kNullValue = std::numeric_limits::max(); - - /// @brief Encoded null value for time_t (same as long). - /// @return Null encoded value for time_t (same as long). - template <> - constexpr time_t kNullValue = std::numeric_limits::max(); - - /// @brief Encoded null value for double. - /// @return Null encoded value for double. - template <> - constexpr double kNullValue = std::numeric_limits::max(); - - /// @brief Encode Function - /// Available specializations: - /// ENC_t = uint64_t, IN_t = std::string - /// @tparam ENC_t The type to encode to. - /// @tparam IN_t The type (format) of the data to encode. - /// @tparam DT data_types::data_t of the data to encode. - /// @param in Data to encode. - /// @return Encoded data. - template - ENC_t encode(IN_t &in); - - /// @brief Encode Function - /// Available specializations: - /// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t - /// @tparam ENC_t The type to encode to. - /// @tparam IN_t The type of the data to encode. - /// @param in Data to encode. - /// @return Encoded data. - template - ENC_t encode(IN_t &in); - - template - ENC_t encode(IN_t &in, data_t dt); - - template - std::array encode(std::string &str) { - std::array res; - if (str.size() > 0) { - memcpy(res.data(), str.data(), sizeof(ENC_t)*MAX_s); - } else { - res.fill('\0'); - } - return res; - } +/// @brief Enumeration of supported data types. +/// +/// The enumeration is meant to be used when parsing data +/// (i.e. type information is not known at compile time). 
+enum data_t { + STRING = 0, // string support is currenlty limited + CHARS, // sequence of characters + UINT, // unsigned, binds by default to uint64_t + INT, // int, binds by default to int64_t + FLOAT, // float, binds by default to float + DOUBLE, // double, binds by default to double + BOOL, // bool, binds by default to bool + DATE, // date in "%y-%m-%d" format, binds by default to time_t + USDATE, // date in "%m/%d/%y" format, binds by default to time_t + DATE_TIME, // date in "%y-%m-%dT%H:%M:%S" format, + // binds by default to time_t + IP_ADDRESS, // IPv4, binds by default to data_types::ipv4_t + LIST_UINT, // Sequence of unsigneds, support currently limited + LIST_INT, // Sequence of integers, support currently limited + LIST_DOUBLE, // Sequence of doubles, support currently limited + NONE +}; + +/// @brief Data structures for storing schema information. +/// Given a tuple of data, it associates elements labels and data types +/// to their position in the tuple. +using schema_t = std::vector>; + +/// @brief Encoded null value. +/// @tparam ENC_t encoding type. +/// @return Encoded null value for ENC_t. +template +constexpr ENC_t kNullValue = ENC_t(); + +/// @brief Encoded null value for uint64_t. +/// @return Null encoded value for uint64_t. +template <> +constexpr uint64_t kNullValue = std::numeric_limits::max(); + +/// @brief Encoded null value for time_t (same as long). +/// @return Null encoded value for time_t (same as long). +template <> +constexpr time_t kNullValue = std::numeric_limits::max(); + +/// @brief Encoded null value for double. +/// @return Null encoded value for double. +template <> +constexpr double kNullValue = std::numeric_limits::max(); + +/// @brief Encode Function +/// Available specializations: +/// ENC_t = uint64_t, IN_t = std::string +/// @tparam ENC_t The type to encode to. +/// @tparam IN_t The type (format) of the data to encode. +/// @tparam DT data_types::data_t of the data to encode. +/// @param in Data to encode. +/// @return Encoded data. +template +ENC_t encode(IN_t& in); + +/// @brief Encode Function +/// Available specializations: +/// ENC_t = uint64_t, IN_t = default bindings of data_types::data_t +/// @tparam ENC_t The type to encode to. +/// @tparam IN_t The type of the data to encode. +/// @param in Data to encode. +/// @return Encoded data. 
+template +ENC_t encode(IN_t& in); - template - typename std::enable_if<(std::is_arithmetic::value or (sizeof(DEC_t) == sizeof(ENC_t))), DEC_t>::type - decode(ENC_t encvalue) { - DEC_t val; - memcpy(&val, &encvalue, sizeof(DEC_t)); - return val; +template +ENC_t encode(IN_t& in, data_t dt); + +template +std::array encode(std::string& str) { + std::array res; + if (str.size() > 0) { + memcpy(res.data(), str.data(), sizeof(ENC_t) * MAX_s); + } else { + res.fill('\0'); } + return res; +} - template - DEC_t decode(ENC_t value); +template +typename std::enable_if<(std::is_arithmetic::value or + (sizeof(DEC_t) == sizeof(ENC_t))), + DEC_t>::type +decode(ENC_t encvalue) { + DEC_t val; + memcpy(&val, &encvalue, sizeof(DEC_t)); + return val; +} - template - typename std::enable_if<(ST==data_t::INT), int64_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +DEC_t decode(ENC_t value); - template - typename std::enable_if<(ST==data_t::UINT), uint64_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::INT), int64_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::FLOAT), float>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::UINT), uint64_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::DOUBLE), double>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::FLOAT), float>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::BOOL), bool>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } +template +typename std::enable_if<(ST == data_t::DOUBLE), double>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} - template - typename std::enable_if<(ST==data_t::DATE), std::time_t>::type - decode(ENC_t encvalue) { - return decode(encvalue); - } - - template - std::string decode(std::array &val) { - return std::string(reinterpret_cast(val.data())); - } -} // namespace data_types +template +typename std::enable_if<(ST == data_t::BOOL), bool>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} +template +typename std::enable_if<(ST == data_t::DATE), std::time_t>::type +decode(ENC_t encvalue) { + return decode(encvalue); +} + +template +std::string decode(std::array& val) { + return std::string(reinterpret_cast(val.data())); +} +} // namespace data_types // ENCODE METHODS SPECIALIZATION FOR UINT64 ENC_t -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t value; - try { value = std::stoull(str); } - catch(...) { value = kNullValue; } + try { + value = std::stoull(str); + } catch (...) { + value = kNullValue; + } return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval; int64_t value; - try { value = stoll(str); } - catch(...) { return kNullValue; } + try { + value = stoll(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval; float value; - try { value = stof(str); } - catch(...) 
{ return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t data_types::encode( + std::string& str) { uint64_t encval; double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline uint64_t +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; uint64_t encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode( + std::string& str) { uint64_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -273,57 +281,52 @@ uint64_t data_types::encode inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode(std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t data_types::encode( + std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -uint64_t data_types::encode(std::string &str) { +template <> +inline uint64_t +data_types::encode( + std::string& str) { uint64_t value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); @@ -332,89 +335,96 @@ uint64_t data_types::encode inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; uint64_t value; - try { value = std::stoull(str); } - catch(...) { return kNullValue; } + try { + value = std::stoull(str); + } catch (...) 
{ + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; int64_t value; - try { value = stoll(str); } - catch(...) { return kNullValue; } + try { + value = stoll(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval; float value; - try { value = stof(str); } - catch(...) { return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } return value; } -template<> inline -double data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline double +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; double encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double data_types::encode( + std::string& str) { uint64_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -424,57 +434,51 @@ double data_types::encode inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double +data_types::encode(std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) 
{ return kNullValue; } memcpy(&value, &t, sizeof(value)); return value; } -template<> inline -double data_types::encode(std::string &str) { +template <> +inline double data_types::encode( + std::string& str) { double value = 0; - struct tm date{}; + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } memcpy(&value, &t, sizeof(value)); @@ -482,87 +486,94 @@ double data_types::encode inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t value; - try { value = std::stoul(str); } - catch(...) { value = kNullValue; } + try { + value = std::stoul(str); + } catch (...) { + value = kNullValue; + } return value; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { int64_t value; - try { value = stol(str); } - catch(...) { return kNullValue; } + try { + value = stol(str); + } catch (...) { + return kNullValue; + } return value; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval; float value; - try { value = stof(str); } - catch(...) { return kNullValue; } + try { + value = stof(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval; double value; - try { value = stod(str); } - catch(...) { return kNullValue; } + try { + value = stod(str); + } catch (...) { + return kNullValue; + } memcpy(&encval, &value, sizeof(value)); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { - if (str.size() == 0) return kNullValue; +template <> +inline time_t +data_types::encode(std::string& str) { + if (str.size() == 0) + return kNullValue; time_t encval = 1; - if ((str == "F") || (str == "f") || (str == "FALSE") - || (str == "false") || (str == "0")) encval = 0; + if ((str == "F") || (str == "f") || (str == "FALSE") || (str == "false") || + (str == "0")) + encval = 0; return encval; } - -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t +data_types::encode(std::string& str) { time_t encval = 0; memset(&encval, '\0', sizeof(encval)); - memcpy(&encval, str.c_str(), sizeof(encval)-1); + memcpy(&encval, str.c_str(), sizeof(encval) - 1); return encval; } -template<> inline -time_t data_types::encode(std::string &str) { +template <> +inline time_t data_types::encode( + std::string& str) { time_t val, value = 0; std::string::iterator start = str.begin(); - for (unsigned i = 0; i < 4; i ++) { + for (unsigned i = 0; i < 4; i++) { std::string::iterator end = std::find(start, str.end(), '.'); try { val = std::stoull(std::string(start, end)); - } catch(...) { + } catch (...) { return kNullValue; } if (val < 256) { - value = (value << 8) + val; start = end + 1; + value = (value << 8) + val; + start = end + 1; } else { return kNullValue; } @@ -570,165 +581,161 @@ time_t data_types::encode inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t +data_types::encode(std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%d", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) 
{ return kNullValue; } return t; } -template<> inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t +data_types::encode(std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%m/%d/%y", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } return t; } -template<> inline -time_t data_types::encode(std::string &str) { - struct tm date{}; +template <> +inline time_t data_types::encode( + std::string& str) { + struct tm date {}; date.tm_isdst = -1; strptime(str.c_str(), "%Y-%m-%dT%H:%M:%S", &date); time_t t; try { t = mktime(&date); - } - catch(...) { + } catch (...) { return kNullValue; } return t; } template -ENC_t data_types::encode(IN_t &in, data_types::data_t dt) { +ENC_t data_types::encode(IN_t& in, data_types::data_t dt) { switch (dt) { -// case data_types::STRING : -// return data_types::encode(in); -// case data_types::CHARS : -// return data_types::encode(in); - case data_types::UINT : - return data_types::encode(in); - case data_types::INT : - return data_types::encode(in); - case data_types::FLOAT : - return data_types::encode(in); - case data_types::DOUBLE : - return data_types::encode(in); - case data_types::BOOL : - return data_types::encode(in); - case data_types::DATE : - return data_types::encode(in); - case data_types::USDATE : - return data_types::encode(in); - case data_types::DATE_TIME : - return data_types::encode(in); - case data_types::IP_ADDRESS : - return data_types::encode(in); + // case data_types::STRING : + // return data_types::encode(in); + // case data_types::CHARS : + // return data_types::encode(in); + case data_types::UINT: + return data_types::encode(in); + case data_types::INT: + return data_types::encode(in); + case data_types::FLOAT: + return data_types::encode(in); + case data_types::DOUBLE: + return data_types::encode(in); + case data_types::BOOL: + return data_types::encode(in); + case data_types::DATE: + return data_types::encode(in); + case data_types::USDATE: + return data_types::encode(in); + case data_types::DATE_TIME: + return data_types::encode(in); + case data_types::IP_ADDRESS: + return data_types::encode(in); } return data_types::kNullValue; } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; return std::to_string(value); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; int64_t v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; float v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; double v; memcpy(&v, &value, sizeof(v)); return std::to_string(v); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode( + uint64_t value) { std::string ipAddr = ""; uint64_t octets[4]; 
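All of these encode()/decode() specializations follow the same two conventions: the value parsed from text is bit-copied into the fixed-width encoding type with memcpy rather than converted arithmetically, and any parse failure yields the type's kNullValue sentinel. The sketch below is illustrative only and independent of the shad headers (it assumes a 64-bit double); it shows the same round trip for a double packed into a uint64_t and for an IPv4 dotted quad packed one octet per byte, matching the packing used by the IP_ADDRESS specializations here.

#include <cstdint>
#include <cstring>
#include <sstream>
#include <string>

// Bit-copy a double into a uint64_t and back; the byte pattern is preserved.
uint64_t pack_double(double v) {
  uint64_t enc;
  std::memcpy(&enc, &v, sizeof(enc));
  return enc;
}
double unpack_double(uint64_t enc) {
  double v;
  std::memcpy(&v, &enc, sizeof(v));
  return v;
}

// Pack "a.b.c.d" into a uint64_t, most significant octet first; return the
// all-ones null sentinel (as kNullValue<uint64_t> does above) on a bad field.
uint64_t pack_ipv4(const std::string& s) {
  std::istringstream in(s);
  uint64_t value = 0, octet = 0;
  char dot = 0;
  for (int i = 0; i < 4; ++i) {
    if (!(in >> octet) || octet > 255)
      return UINT64_MAX;
    if (i < 3 && (!(in >> dot) || dot != '.'))
      return UINT64_MAX;
    value = (value << 8) + octet;
  }
  return value; // "10.0.0.1" -> 0x0A000001
}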
- for (uint64_t k = 0; k < 4; k ++) {octets[k] = value & 255; value = value >> 8;} - for (uint64_t k = 3; k >= 1; k --) ipAddr += std::to_string(octets[k]) + '.'; + for (uint64_t k = 0; k < 4; k++) { + octets[k] = value & 255; + value = value >> 8; + } + for (uint64_t k = 3; k >= 1; k--) + ipAddr += std::to_string(octets[k]) + '.'; return ipAddr + std::to_string(octets[0]); } -template<> inline -std::string data_types::decode(uint64_t value) { - if (value == kNullValue) return ""; +template <> +inline std::string +data_types::decode(uint64_t value) { + if (value == kNullValue) + return ""; return std::to_string(value); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode(uint64_t value) { time_t t = data_types::decode(value); char dateString[11]; strftime(dateString, 11, "%Y-%m-%d", std::localtime(&t)); return std::string(dateString); } -template<> inline -std::string data_types::decode(uint64_t value) { +template <> +inline std::string +data_types::decode(uint64_t value) { const char* c = reinterpret_cast(&value); return std::string(c); } -template <> inline -uint64_t data_types::decode(uint64_t encvalue) { +template <> +inline uint64_t data_types::decode(uint64_t encvalue) { return encvalue; } -} // namespace shad +} // namespace shad #endif // LIBGALOIS_INCLUDE_SHAD_DATA_TYPES_H_ diff --git a/libgalois/include/shad/Graph.h b/libgalois/include/shad/Graph.h index 9029b1ef32..2c785e53d5 100644 --- a/libgalois/include/shad/Graph.h +++ b/libgalois/include/shad/Graph.h @@ -1,5 +1,5 @@ -//TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan -// to make this public. +// TODO(hc): Upgrade copyright if it is necessary; for now, we have no plan +// to make this public. //===------------------------------------------------------------*- C++ -*-===// // @@ -50,7 +50,7 @@ #include "DataTypes.h" #include "GraphTypes.h" -#define UINT shad::data_types::UINT +#define UINT shad::data_types::UINT #define DOUBLE shad::data_types::DOUBLE #define USDATE shad::data_types::USDATE #define ENCODE shad::data_types::encode @@ -58,112 +58,120 @@ namespace shad { class Vertex { - public: - // Vertex id; initially it is set - // to a local node id while CuSP reads a file and constructs - // this vertex. After each host finishes and synchronizes it to construct - // a full CSR graph, it is updated to a global node id. - uint64_t id; - TYPES type; - uint64_t shadKey; - // Number of edges. - // This is incremented while reads a graph. - uint64_t numEdges{0}; - - Vertex () { - this->id = shad::data_types::kNullValue; - this->type = TYPES::NONE; - this->shadKey = shad::data_types::kNullValue; - } - - Vertex (uint64_t id_, TYPES type_, uint64_t shadKey_) { - this->id = id_; - this->type = type_; - this->shadKey = shadKey_; - } - - void incrNumEdges() { - this->numEdges += 1; - } - - uint64_t getNumEdges() { - return this->numEdges; - } +public: + // Vertex id; initially it is set + // to a local node id while CuSP reads a file and constructs + // this vertex. After each host finishes and synchronizes it to construct + // a full CSR graph, it is updated to a global node id. + uint64_t id; + TYPES type; + uint64_t shadKey; + // Number of edges. + // This is incremented while reads a graph. 
+ uint64_t numEdges{0}; + + Vertex() { + this->id = shad::data_types::kNullValue; + this->type = TYPES::NONE; + this->shadKey = shad::data_types::kNullValue; + } + + Vertex(uint64_t id_, TYPES type_, uint64_t shadKey_) { + this->id = id_; + this->type = type_; + this->shadKey = shadKey_; + } + + void incrNumEdges() { this->numEdges += 1; } + + uint64_t getNumEdges() { return this->numEdges; } }; class Edge { - public: - uint64_t src; // vertex id of src - uint64_t dst; // vertex id of dst - TYPES type; - TYPES src_type; - TYPES dst_type; - uint64_t src_glbid; - uint64_t dst_glbid; - - Edge () { - src = shad::data_types::kNullValue; - dst = shad::data_types::kNullValue; - type = TYPES::NONE; - src_type = TYPES::NONE; - dst_type = TYPES::NONE; +public: + uint64_t src; // vertex id of src + uint64_t dst; // vertex id of dst + TYPES type; + TYPES src_type; + TYPES dst_type; + uint64_t src_glbid; + uint64_t dst_glbid; + + Edge() { + src = shad::data_types::kNullValue; + dst = shad::data_types::kNullValue; + type = TYPES::NONE; + src_type = TYPES::NONE; + dst_type = TYPES::NONE; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } + + Edge(std::vector& tokens) { + if (tokens[0] == "Sale") { + src = ENCODE(tokens[1]); + dst = ENCODE(tokens[2]); + type = TYPES::SALE; + src_type = TYPES::PERSON; + dst_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "Author") { + src = ENCODE(tokens[1]); + type = TYPES::AUTHOR; + src_type = TYPES::PERSON; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") + dst = ENCODE(tokens[3]); + else if (tokens[4] != "") + dst = ENCODE(tokens[4]); + else if (tokens[5] != "") + dst = ENCODE(tokens[5]); + if (tokens[3] != "") + dst_type = TYPES::FORUM; + else if (tokens[4] != "") + dst_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") + dst_type = TYPES::PUBLICATION; + } else if (tokens[0] == "Includes") { + src = ENCODE(tokens[3]); + dst = ENCODE(tokens[4]); + type = TYPES::INCLUDES; + src_type = TYPES::FORUM; + dst_type = TYPES::FORUMEVENT; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + } else if (tokens[0] == "HasTopic") { + dst = ENCODE(tokens[6]); + type = TYPES::HASTOPIC; + dst_type = TYPES::TOPIC; + src_glbid = shad::data_types::kNullValue; + dst_glbid = shad::data_types::kNullValue; + if (tokens[3] != "") + src = ENCODE(tokens[3]); + else if (tokens[4] != "") + src = ENCODE(tokens[4]); + else if (tokens[5] != "") + src = ENCODE(tokens[5]); + if (tokens[3] != "") + src_type = TYPES::FORUM; + else if (tokens[4] != "") + src_type = TYPES::FORUMEVENT; + else if (tokens[5] != "") + src_type = TYPES::PUBLICATION; + } else if (tokens[0] == "HasOrg") { + src = ENCODE(tokens[5]); + dst = ENCODE(tokens[6]); + type = TYPES::HASORG; + src_type = TYPES::PUBLICATION; + dst_type = TYPES::TOPIC; src_glbid = shad::data_types::kNullValue; dst_glbid = shad::data_types::kNullValue; } - - Edge (std::vector & tokens) { - if (tokens[0] == "Sale") { - src = ENCODE(tokens[1]); - dst = ENCODE(tokens[2]); - type = TYPES::SALE; - src_type = TYPES::PERSON; - dst_type = TYPES::PERSON; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } else if (tokens[0] == "Author") { - src = ENCODE(tokens[1]); - type = TYPES::AUTHOR; - src_type = TYPES::PERSON; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; 
- if (tokens[3] != "") dst = ENCODE(tokens[3]); - else if (tokens[4] != "") dst = ENCODE(tokens[4]); - else if (tokens[5] != "") dst = ENCODE(tokens[5]); - if (tokens[3] != "") dst_type = TYPES::FORUM; - else if (tokens[4] != "") dst_type = TYPES::FORUMEVENT; - else if (tokens[5] != "") dst_type = TYPES::PUBLICATION; - } else if (tokens[0] == "Includes") { - src = ENCODE(tokens[3]); - dst = ENCODE(tokens[4]); - type = TYPES::INCLUDES; - src_type = TYPES::FORUM; - dst_type = TYPES::FORUMEVENT; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } else if (tokens[0] == "HasTopic") { - dst = ENCODE(tokens[6]); - type = TYPES::HASTOPIC; - dst_type = TYPES::TOPIC; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - if (tokens[3] != "") src = ENCODE(tokens[3]); - else if (tokens[4] != "") src = ENCODE(tokens[4]); - else if (tokens[5] != "") src = ENCODE(tokens[5]); - if (tokens[3] != "") src_type = TYPES::FORUM; - else if (tokens[4] != "") src_type = TYPES::FORUMEVENT; - else if (tokens[5] != "") src_type = TYPES::PUBLICATION; - } else if (tokens[0] == "HasOrg") { - src = ENCODE(tokens[5]); - dst = ENCODE(tokens[6]); - type = TYPES::HASORG; - src_type = TYPES::PUBLICATION; - dst_type = TYPES::TOPIC; - src_glbid = shad::data_types::kNullValue; - dst_glbid = shad::data_types::kNullValue; - } - } + } }; -} // namespace agile::workflow1 +} // namespace shad #endif // GRAPH_H diff --git a/libgalois/include/shad/GraphTypes.h b/libgalois/include/shad/GraphTypes.h index eb84e123c2..e9f7afc0ab 100644 --- a/libgalois/include/shad/GraphTypes.h +++ b/libgalois/include/shad/GraphTypes.h @@ -66,6 +66,6 @@ enum class TYPES { NONE }; -} // namespace agile::workflow1 +} // namespace shad #endif // GRAPHTYPES_H diff --git a/libgalois/include/shad/ShadGraphConverter.h b/libgalois/include/shad/ShadGraphConverter.h index 4b1c0351db..87cef93a93 100644 --- a/libgalois/include/shad/ShadGraphConverter.h +++ b/libgalois/include/shad/ShadGraphConverter.h @@ -25,8 +25,7 @@ using ShadEdgeTy = uint64_t; class ShadGraphConverter { public: - ShadGraphConverter() : - nodeDataBuffer(nullptr) {} + ShadGraphConverter() : nodeDataBuffer(nullptr) {} ~ShadGraphConverter() { // BufferedGraph holds these arrays. 
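For reference, the Edge(tokens) constructor in Graph.h above keys everything off which columns of a row are non-empty: token 0 names the relation, tokens 1 and 2 carry person keys, token 3 a forum key, token 4 a forum-event key, token 5 a publication key, and token 6 a topic key. That layout is inferred from the constructor's branches, and the keys in the example below are invented purely for illustration (it also assumes Graph.h and GraphTypes.h are included):

// Hypothetical "Author" row: person 1234 authored publication 9999.
// tokens[3] and tokens[4] are empty, so the Author branch falls through to
// dst = ENCODE(tokens[5]) and dst_type = TYPES::PUBLICATION.
std::vector<std::string> tokens = {"Author", "1234", "", "", "",
                                   "9999",   "",     "", "", ""};
shad::Edge e(tokens);
// e.type == shad::TYPES::AUTHOR, e.src_type == shad::TYPES::PERSON,
// e.dst_type == shad::TYPES::PUBLICATION; e.src and e.dst hold the
// UINT-encoded keys, and the global ids stay at the null sentinel.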
@@ -43,15 +42,15 @@ class ShadGraphConverter { std::ofstream fp("shad_graph.out"); for (size_t i = 0; i < this->verticeIdKeyMapping.size(); ++i) { uint64_t key = this->verticeIdKeyMapping[i]; - Vertex v = this->vertices[key]; - fp << "node " << i << ", type: " << to_underlying(v.type) << ", key: " << - key << "\n"; + Vertex v = this->vertices[key]; + fp << "node " << i << ", type: " << to_underlying(v.type) + << ", key: " << key << "\n"; auto edgeRange = this->edges.equal_range(key); - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei) { Edge& edge = ei->second; Vertex dst = this->vertices[edge.dst]; - fp << "\t edge dst " << dst.id << ", type: " << - to_underlying(edge.type) << ", key: " << dst.shadKey << "\n"; + fp << "\t edge dst " << dst.id << ", type: " << to_underlying(edge.type) + << ", key: " << dst.shadKey << "\n"; } } fp.close(); @@ -71,7 +70,7 @@ class ShadGraphConverter { * @param numEdges number of edges that this method reads */ void InspectGraph(const std::string& filename, size_t* numNodes, - size_t* numEdges) { + size_t* numEdges) { // TODO(hc): Get the number of nodes and edges from file // For example, it reads {SALE, Author, Includes, HasTopic, HasOrg} as // edges. So we just count how many they exist in the file. @@ -85,9 +84,10 @@ class ShadGraphConverter { while (!file.eof()) { getline(file, line); // Skip comments. - if (line[0] == '#') continue; + if (line[0] == '#') + continue; // Delimiter and # tokens set for WMD data file. - std::vector tokens = splitTokens(line, ',', 10); + std::vector tokens = splitTokens(line, ',', 10); if (this->isTokenNodeType(tokens[0])) { ++(*numNodes); @@ -96,15 +96,15 @@ class ShadGraphConverter { } } - std::cout << "Number of nodes:" << *numNodes << ", number of edges:" << - *numEdges << "\n"; + std::cout << "Number of nodes:" << *numNodes + << ", number of edges:" << *numEdges << "\n"; } /** * @brief Construct a buffered graph from existing arrays constructed * by constructNodeArrays() and constructEdgeArrays(). * - * @param numGlobalNodes The number of global nodes + * @param numGlobalNodes The number of global nodes * @param numGlobalEdges The number of global edges * @param nodeBegin Global node ID of the first local node * @param nodeEnd (Global node ID of the last local node) + 1 @@ -113,32 +113,31 @@ class ShadGraphConverter { * @param bufferedGraph Buffered graph for CuSP */ void constructBufferedGraph( - uint64_t numGlobalNodes, uint64_t numGlobalEdges, - uint32_t nodeBegin, uint32_t nodeEnd, - uint64_t edgeBegin, uint64_t edgeEnd, - [[maybe_unused]]galois::graphs::BufferedGraph* bufferedGraph) { + uint64_t numGlobalNodes, uint64_t numGlobalEdges, uint32_t nodeBegin, + uint32_t nodeEnd, uint64_t edgeBegin, uint64_t edgeEnd, + [[maybe_unused]] galois::graphs::BufferedGraph* + bufferedGraph) { // TODO(hc): Each of these functions first construct graphs in the SHAD - // format as this file is written in not binary, but string, and also + // format as this file is written in not binary, but string, and also // nodes or edges are not sorted. 
So, until we preprocess the input graph // file, we should first read it in memory, and reconstruct this to Galois - // compatible + // compatible uint32_t numLocalNodes = nodeEnd - nodeBegin; uint64_t numLocalEdges = edgeEnd - edgeBegin; - bufferedGraph->constructFrom( - outIndexBuffer, edgeDestBuffer, edgeDataBuffer, - numGlobalNodes, numGlobalEdges, numLocalNodes, numLocalEdges, - nodeBegin, edgeBegin); + bufferedGraph->constructFrom(outIndexBuffer, edgeDestBuffer, edgeDataBuffer, + numGlobalNodes, numGlobalEdges, numLocalNodes, + numLocalEdges, nodeBegin, edgeBegin); #if 0 - TODO(hc): This verification should be fixed since it tests + TODO(hc): This verification should be fixed since it tests a shared-memory execution that one host loads the whole graph. It should not work on distributed-memory machine since a CSR graph should be partitioned but tepmorary maps reading and holding SHAD graphs are for global graph. #ifndef NDEBUG std::cout << "CSR verification starts.." << std::endl << std::flush; - this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, + this->VerifyCSRConstruction(outIndexBuffer, nodeDataBuffer, edgeDestBuffer, edgeDataBuffer); std::cout << "CSR verification starts.. [done]" << std::endl << std::flush; #endif @@ -153,20 +152,20 @@ class ShadGraphConverter { */ // TODO(hc): We can assign a disjointed range of file for each host. // For now, let all hosts read the whole file. - void readSHADFile( - const std::string& filename, uint64_t* numGlobalNodes, - uint64_t *numGlobalEdges) { + void readSHADFile(const std::string& filename, uint64_t* numGlobalNodes, + uint64_t* numGlobalEdges) { std::ifstream graphFile(filename.c_str()); uint64_t vertexId{0}; std::string line; uint64_t numNodes{0}, numEdges{0}; // TODO(hc): We can parallelize it by assigning disjointed // ranges with some inspection. - // But this would be the future work as + // But this would be the future work as while (!graphFile.eof()) { getline(graphFile, line); // Skip comments. - if (line[0] == '#') continue; + if (line[0] == '#') + continue; // Delimiter and # tokens set for WMD data file. std::vector tokens = splitTokens(line, ',', 10); @@ -242,7 +241,7 @@ class ShadGraphConverter { } } - // After the above loop, vertices and edges are complete. + // After the above loop, vertices and edges are complete. this->CountNumEdgesForEachVertex(numNodes, numEdges); *numGlobalNodes = numNodes; *numGlobalEdges = numEdges; @@ -253,20 +252,16 @@ class ShadGraphConverter { } /** - * @brief Return node data array. + * @brief Return node data array. * Note that this can be either of global graph or local graph. */ - ShadNodeTy* getNodeDataBuffer() { - return nodeDataBuffer; - } + ShadNodeTy* getNodeDataBuffer() { return nodeDataBuffer; } /** * @brief Return node outgoing edge index array * Note that this can be either of global graph or local graph. */ - uint64_t* getOutIndexBuffer() { - return outIndexBuffer; - } + uint64_t* getOutIndexBuffer() { return outIndexBuffer; } /** * @brief Construct vertex outgoing edge range buffer and @@ -282,36 +277,33 @@ class ShadGraphConverter { * @param numLocalNodes The number of local nodes * */ - void constructNodeArrays( - uint32_t nodeBegin, uint32_t nodeEnd, uint32_t numLocalNodes) { + void constructNodeArrays(uint32_t nodeBegin, uint32_t nodeEnd, + uint32_t numLocalNodes) { // 1) Construct an edge index array (size == number of nodes). 
this->outIndexBuffer = new uint64_t[numLocalNodes]; this->nodeDataBuffer = new ShadNodeTy[numLocalNodes]; - // TODO(hc): for now, only consider a single host, but need to add offset later. - galois::do_all(galois::iterate(this->vertices), - [&](auto element) { - Vertex& vertex = element.second; - uint64_t vertexId = vertex.id; - if (vertexId >= nodeBegin && vertexId < nodeEnd) { - this->outIndexBuffer[vertexId - nodeBegin] = - vertex.getNumEdges(); - // Fill vertex data too; This assumes that a SHAD graph - // has a type, which is considered as a vertex data. - this->nodeDataBuffer[vertexId - nodeBegin].type = - this->to_underlying(vertex.type); - this->nodeDataBuffer[vertexId - nodeBegin].key = - vertex.shadKey; - //std::cout << vertexId - nodeBegin << " is set to " - //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << - //this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; - } - }); + // TODO(hc): for now, only consider a single host, but need to add offset + // later. + galois::do_all(galois::iterate(this->vertices), [&](auto element) { + Vertex& vertex = element.second; + uint64_t vertexId = vertex.id; + if (vertexId >= nodeBegin && vertexId < nodeEnd) { + this->outIndexBuffer[vertexId - nodeBegin] = vertex.getNumEdges(); + // Fill vertex data too; This assumes that a SHAD graph + // has a type, which is considered as a vertex data. + this->nodeDataBuffer[vertexId - nodeBegin].type = + this->to_underlying(vertex.type); + this->nodeDataBuffer[vertexId - nodeBegin].key = vertex.shadKey; + // std::cout << vertexId - nodeBegin << " is set to " + //<< this->nodeDataBuffer[vertexId - nodeBegin].type << " and " << + // this->nodeDataBuffer[vertexId - nodeBegin].key << "\n"; + } + }); // 2) Perform parallel prefix sum to finalize outgoing edge index // array construction. galois::ParallelSTL::partial_sum( - outIndexBuffer, &(outIndexBuffer[numLocalNodes]), - outIndexBuffer); + outIndexBuffer, &(outIndexBuffer[numLocalNodes]), outIndexBuffer); } /** @@ -331,12 +323,10 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> - void constructEdgeArrays( - uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, - uint64_t numLocalEdges) { + template >* = nullptr> + void constructEdgeArrays(uint32_t nodeBegin, uint64_t edgeBegin, + uint32_t numLocalNodes, uint64_t numLocalEdges) { this->edgeDestBuffer = new uint32_t[numLocalEdges]; this->edgeDataBuffer = new ShadEdgeTy[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); @@ -346,8 +336,8 @@ class ShadGraphConverter { galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); // 2) Each thread iterates the whole edges. for (auto edgeElem : this->edges) { - uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; uint64_t srcVertexId = vertex.id; // 3) Each thread fills edge destination for the assigned nodes. if (srcVertexId >= thread_work_range.first + nodeBegin && @@ -356,8 +346,9 @@ class ShadGraphConverter { // OutIndexBuffer now contains global edge range. // So we need to subtract edge offset to get the local edge id. uint64_t nodeBaseOffset = - ((srcVertexId - nodeBegin) == 0)? - 0 : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; + ((srcVertexId - nodeBegin) == 0) + ? 
0 + : outIndexBuffer[srcVertexId - nodeBegin - 1] - edgeBegin; edgeDestBuffer[edgeIdx + nodeBaseOffset] = this->vertices[edgeElem.second.dst].id; edgeDataBuffer[edgeIdx + nodeBaseOffset] = @@ -386,12 +377,10 @@ class ShadGraphConverter { * @param numLocalEdges The number of local edges * */ - template >* = nullptr> - void constructEdgeArrays( - uint32_t nodeBegin, uint64_t edgeBegin, uint32_t numLocalNodes, - uint64_t numLocalEdges) { + template >* = nullptr> + void constructEdgeArrays(uint32_t nodeBegin, uint64_t edgeBegin, + uint32_t numLocalNodes, uint64_t numLocalEdges) { edgeDestBuffer = new uint32_t[numLocalEdges]; std::vector edgeIndexPointers(numLocalNodes, 0); galois::on_each([&](uint32_t tid, uint32_t numThreads) { @@ -400,16 +389,17 @@ class ShadGraphConverter { galois::block_range(uint32_t{0}, numLocalNodes, tid, numThreads); // 2) Each thread iterates the whole edges. for (auto edgeElem : this->edges) { - uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + uint64_t srcVertex = edgeElem.first; + Vertex& vertex = this->vertices[srcVertex]; uint64_t srcVertexId = vertex.id; // 3) Each thread fills edge destination for the assigned nodes. if (srcVertexId >= thread_work_range.first + nodeBegin && srcVertexId < thread_work_range.second + nodeBegin) { uint64_t edgeIdx = edgeIndexPointers[srcVertexId - nodeBegin]++; uint64_t nodeBaseOffset = - ((srcVertexId - nodeBegin)== 0)? - 0 : outIndexBuffer[srcVertexId - 1] - edgeBegin; + ((srcVertexId - nodeBegin) == 0) + ? 0 + : outIndexBuffer[srcVertexId - 1] - edgeBegin; edgeDestBuffer[edgeIdx + nodeBaseOffset] = this->vertices[edgeElem.second.dst].id; } @@ -421,7 +411,7 @@ class ShadGraphConverter { /** * @brief Extract outgoing edge index ranges for local vertices - * from the global outgoing edge index range array. + * from the global outgoing edge index range array. 
* * @param nodeBegin Node global id of the first local node * @param nodeEnd (Node global id for the last local node + 1) @@ -429,10 +419,9 @@ class ShadGraphConverter { void extractLocalOutIndexArray(uint32_t nodeBegin, uint32_t nodeEnd) { uint64_t* newOutIndexBuffer = new uint64_t[nodeEnd - nodeBegin]; - galois::do_all(galois::iterate(nodeBegin, nodeEnd), - [&](uint32_t n) { - newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; - } ); + galois::do_all(galois::iterate(nodeBegin, nodeEnd), [&](uint32_t n) { + newOutIndexBuffer[n - nodeBegin] = this->outIndexBuffer[n]; + }); delete[] this->outIndexBuffer; this->outIndexBuffer = newOutIndexBuffer; } @@ -449,7 +438,7 @@ class ShadGraphConverter { * a temporary vertex map */ bool checkNode(uint64_t id, int type) { - uint64_t key = this->verticeIdKeyMapping[id]; + uint64_t key = this->verticeIdKeyMapping[id]; Vertex& vertex = this->vertices[key]; return (this->to_underlying(vertex.type) == type); } @@ -467,20 +456,19 @@ class ShadGraphConverter { * @return True if passed information matches to the one in * a temporary edge map */ - bool checkEdge(uint64_t snid, uint64_t dnid, - uint64_t /*eid*/, int type) { - uint64_t skey = this->verticeIdKeyMapping[snid]; + bool checkEdge(uint64_t snid, uint64_t dnid, uint64_t /*eid*/, int type) { + uint64_t skey = this->verticeIdKeyMapping[snid]; auto edgeRange = this->edges.equal_range(skey); uint64_t eidx{0}; Edge edge; bool found{false}; - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei, ++eidx) { + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei, ++eidx) { edge = ei->second; // Multiple edges having the same source and destination could // exist. So we repeat until find the one that has the same type to // the passed one. if (this->vertices[edge.dst].id == dnid && - this->to_underlying(edge.type) == type) { + this->to_underlying(edge.type) == type) { found = true; break; } @@ -517,16 +505,16 @@ class ShadGraphConverter { } } - std::vector splitTokens( - std::string& line, char delim, uint64_t size = 0) { + std::vector splitTokens(std::string& line, char delim, + uint64_t size = 0) { uint64_t ndx = 0, start = 0, end = 0; - std::vector tokens(size); + std::vector tokens(size); - for ( ; end < line.length(); end ++) { + for (; end < line.length(); end++) { if ((line[end] == delim) || (line[end] == '\n')) { - tokens[ndx] = line.substr(start, end - start); - start = end + 1; - ndx ++; + tokens[ndx] = line.substr(start, end - start); + start = end + 1; + ndx++; } } @@ -536,9 +524,8 @@ class ShadGraphConverter { } void CountNumEdgesForEachVertex(uint64_t numNodes, uint64_t /*numEdges*/) { - //galois::on_each([this, numNodes, numEdges]( - galois::on_each([&]( - uint32_t tid, uint32_t numThreads) { + // galois::on_each([this, numNodes, numEdges]( + galois::on_each([&](uint32_t tid, uint32_t numThreads) { // Each thread is assigned disjointed range of nodes. // Each thread iterates edges and accumulates edges for only // the nodes assigned to that. 
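Taken together, CountNumEdgesForEachVertex(), constructNodeArrays(), and constructEdgeArrays() implement the usual three-phase CSR build: count each vertex's out-degree (every thread scans all edges under galois::on_each but only updates the vertices in its galois::block_range slice), prefix-sum the counts into the out-index array, then scatter edge destinations into the slots the prefix sum reserved, with a per-vertex cursor (edgeIndexPointers) tracking how many slots are already filled. The sketch below is a simplified, single-threaded illustration of that flow using a toy edge list and plain std:: calls instead of the galois:: parallel helpers; it is not the library's code.

#include <cstdint>
#include <numeric>
#include <utility>
#include <vector>

int main() {
  // Toy input: 4 vertices, edges as (src, dst) pairs in arbitrary order.
  std::vector<std::pair<uint32_t, uint32_t>> edges = {
      {0, 1}, {0, 2}, {2, 3}, {1, 3}, {2, 0}};
  const uint32_t numNodes = 4;

  // Phase 1: count out-degrees (CountNumEdgesForEachVertex).
  std::vector<uint64_t> outIndex(numNodes, 0);
  for (auto& e : edges)
    ++outIndex[e.first];

  // Phase 2: inclusive prefix sum, so outIndex[v] is the end offset of v's
  // edge slots (constructNodeArrays uses galois::ParallelSTL::partial_sum).
  std::partial_sum(outIndex.begin(), outIndex.end(), outIndex.begin());

  // Phase 3: scatter destinations; cursor[v] counts how many of v's slots
  // are already filled (the edgeIndexPointers role in constructEdgeArrays).
  std::vector<uint32_t> edgeDest(edges.size());
  std::vector<uint64_t> cursor(numNodes, 0);
  for (auto& e : edges) {
    uint64_t base = (e.first == 0) ? 0 : outIndex[e.first - 1];
    edgeDest[base + cursor[e.first]++] = e.second;
  }
  // Result: outIndex = {2, 3, 5, 5}, and vertex 2's edges sit in edgeDest[3..4].
}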
@@ -546,7 +533,7 @@ class ShadGraphConverter { galois::block_range(uint64_t{0}, numNodes, tid, numThreads); for (auto edgeElem : this->edges) { uint64_t srcVertex = edgeElem.first; - Vertex& vertex = this->vertices[srcVertex]; + Vertex& vertex = this->vertices[srcVertex]; if (vertex.id >= thread_work_range.first && vertex.id < thread_work_range.second) { vertex.incrNumEdges(); @@ -569,7 +556,7 @@ class ShadGraphConverter { void insertSHADVertex(const uint64_t& key, const TYPES& type, uint64_t& id) { auto found = this->vertices.find(key); if (found == this->vertices.end()) { - this->vertices[key] = Vertex(id, type, key); + this->vertices[key] = Vertex(id, type, key); this->verticeIdKeyMapping[id] = key; id++; } else { @@ -579,7 +566,7 @@ class ShadGraphConverter { /** * @brief Insert SHAD edge to a edge map. - * @detail Edges + * @detail Edges * * @param vertexKey Source vertex's SHAD token key * @param edge Adjacent edge of the vertex @@ -616,9 +603,9 @@ class ShadGraphConverter { // 3) Check if vertex information in the edges map is equal to the one // in the vertex map. assert(element.second.src_type == - this->vertices[element.second.src].type); + this->vertices[element.second.src].type); assert(element.second.dst_type == - this->vertices[element.second.dst].type); + this->vertices[element.second.dst].type); } } @@ -627,62 +614,63 @@ class ShadGraphConverter { // the number of total edges counted during inspection. uint64_t numAccumulatedEdges{0}; for (auto& element : this->vertices) { - numAccumulatedEdges += element.second.getNumEdges(); + numAccumulatedEdges += element.second.getNumEdges(); } assert(numAccumulatedEdges == numEdges); } - void VerifyCSRConstruction( - [[maybe_unused]] uint64_t* outIndexBuffer, - [[maybe_unused]] ShadNodeTy* nodeDataBuffer, - [[maybe_unused]] uint32_t* edgeDestBuffer, - [[maybe_unused]] void* edgeDataBuffer) {} + void VerifyCSRConstruction([[maybe_unused]] uint64_t* outIndexBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + [[maybe_unused]] uint32_t* edgeDestBuffer, + [[maybe_unused]] void* edgeDataBuffer) {} template >* = nullptr> - void VerifyCSRConstruction( - uint64_t* outIndexBuffer, [[maybe_unused]] ShadNodeTy* nodeDataBuffer, - uint32_t* edgeDestBuffer, ShadEdgeTy* edgeDataBuffer) { + typename std::enable_if_t>* = nullptr> + void VerifyCSRConstruction(uint64_t* outIndexBuffer, + [[maybe_unused]] ShadNodeTy* nodeDataBuffer, + uint32_t* edgeDestBuffer, + ShadEdgeTy* edgeDataBuffer) { // 1) Iterate edge index array. - // 2) Compare each verteices' edge range with SHAD vertex + // 2) Compare each verteices' edge range with SHAD vertex for (size_t i = 0; i < this->vertices.size(); ++i) { - Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; + Vertex& srcV = this->vertices[this->verticeIdKeyMapping[i]]; uint64_t srcShadKey = srcV.shadKey; assert(this->verticeIdKeyMapping[i] == srcV.shadKey); - uint64_t edgeBegin = (i == 0)? 0 : outIndexBuffer[i - 1]; - uint64_t edgeEnd = outIndexBuffer[i]; + uint64_t edgeBegin = (i == 0) ? 
0 : outIndexBuffer[i - 1]; + uint64_t edgeEnd = outIndexBuffer[i]; assert(srcV.numEdges == edgeEnd - edgeBegin); assert(this->to_underlying(srcV.type) == int(nodeDataBuffer[i].type)); assert(srcV.id == i); - galois::do_all(galois::iterate(edgeBegin, edgeEnd), + galois::do_all( + galois::iterate(edgeBegin, edgeEnd), [&](size_t j) { - uint32_t dstV = edgeDestBuffer[j]; - [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; - - [[maybe_unused]] bool found{false}; - auto edgeRange = this->edges.equal_range(srcShadKey); - size_t cnt{0}; - for (auto ei = edgeRange.first ; ei != edgeRange.second; ++ei) { - Edge& edge = ei->second; - if (this->vertices[edge.dst].id == dstV) { - // Multiple edges between vertices are possible. - if (this->to_underlying(edge.type) == int(edgeData)) { - assert(this->vertices[edge.src].id == i); - assert(this->vertices[edge.src].id == srcV.id); - found = true; + uint32_t dstV = edgeDestBuffer[j]; + [[maybe_unused]] uint64_t edgeData = edgeDataBuffer[j]; + + [[maybe_unused]] bool found{false}; + auto edgeRange = this->edges.equal_range(srcShadKey); + size_t cnt{0}; + for (auto ei = edgeRange.first; ei != edgeRange.second; ++ei) { + Edge& edge = ei->second; + if (this->vertices[edge.dst].id == dstV) { + // Multiple edges between vertices are possible. + if (this->to_underlying(edge.type) == int(edgeData)) { + assert(this->vertices[edge.src].id == i); + assert(this->vertices[edge.src].id == srcV.id); + found = true; + } + } + cnt++; } - } - cnt++; - } - assert((edgeEnd - edgeBegin) == cnt); - /* - for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { - std::cout << srcId << " vs " << i->first << "\n"; - } - */ - assert(found); - }, galois::steal()); + assert((edgeEnd - edgeBegin) == cnt); + /* + for (auto i = this->edges.begin(); i != this->edges.end(); ++i) { + std::cout << srcId << " vs " << i->first << "\n"; + } + */ + assert(found); + }, + galois::steal()); } } #endif @@ -691,14 +679,14 @@ class ShadGraphConverter { * @brief Cast a type to an underlying type; in case of scoped enum, * this should be an integral type. * - * @param e + * @param e */ template constexpr typename std::underlying_type::type to_underlying(E e) noexcept { - return static_cast::type>(e); + return static_cast::type>(e); } - // This holds the whole global vertices and their + // This holds the whole global vertices and their // information such as its type. A key is globla node ID, and its value // is the information. std::unordered_map vertices; @@ -715,6 +703,6 @@ class ShadGraphConverter { ShadEdgeTy* edgeDataBuffer; }; -}; // shad namespace +}; // namespace shad #endif diff --git a/libgalois/src/FileGraph.cpp b/libgalois/src/FileGraph.cpp index 420854378b..97db8c7aac 100644 --- a/libgalois/src/FileGraph.cpp +++ b/libgalois/src/FileGraph.cpp @@ -709,7 +709,7 @@ void FileGraphWriter::phase1() { graphVersion = numNodes <= std::numeric_limits::max() ? 1 : 2; size_t bytes = galois::graphs::rawBlockSize(numNodes, numEdges, sizeofEdge, - graphVersion); + graphVersion); char* mmap_base = reinterpret_cast(mmap( nullptr, bytes, PROT_READ | PROT_WRITE, _MAP_ANON | MAP_PRIVATE, -1, 0)); if (mmap_base == MAP_FAILED) diff --git a/libgalois/test/bandwidth.cpp b/libgalois/test/bandwidth.cpp index e30d8cf061..0550c20000 100644 --- a/libgalois/test/bandwidth.cpp +++ b/libgalois/test/bandwidth.cpp @@ -79,7 +79,7 @@ void run_interleaved(size_t seed, size_t mega, bool full) { auto ptr = galois::substrate::largeMallocInterleaved( size * sizeof(int), full ? 
galois::substrate::getThreadPool().getMaxThreads() - : galois::runtime::activeThreads); + : galois::runtime::activeThreads); int* block = (int*)ptr.get(); run_interleaved_helper r(block, seed, size); diff --git a/libgalois/test/move.cpp b/libgalois/test/move.cpp index 608fc4651b..5f04b7fa8e 100644 --- a/libgalois/test/move.cpp +++ b/libgalois/test/move.cpp @@ -26,17 +26,17 @@ #include "galois/substrate/PerThreadStorage.h" struct MoveOnly { - MoveOnly() = default; - MoveOnly(MoveOnly&&) = default; - MoveOnly& operator=(MoveOnly&&) = default; - MoveOnly(const MoveOnly&) = delete; + MoveOnly() = default; + MoveOnly(MoveOnly&&) = default; + MoveOnly& operator=(MoveOnly&&) = default; + MoveOnly(const MoveOnly&) = delete; MoveOnly& operator=(const MoveOnly&) = delete; }; struct MoveOnlyA { int* x; MoveOnlyA() {} - MoveOnlyA(const MoveOnlyA&) = delete; + MoveOnlyA(const MoveOnlyA&) = delete; MoveOnly& operator=(const MoveOnlyA&) = delete; ~MoveOnlyA() {} }; diff --git a/libgalois/test/reduction.cpp b/libgalois/test/reduction.cpp index ef5fc3be99..3285fcf9e8 100644 --- a/libgalois/test/reduction.cpp +++ b/libgalois/test/reduction.cpp @@ -12,11 +12,11 @@ struct Move { Move(const Move&) = delete; Move(Move&&) noexcept {} Move& operator=(const Move&) = delete; - Move& operator =(Move&&) noexcept { return *this; } + Move& operator=(Move&&) noexcept { return *this; } }; void test_move() { - auto merge_fn = [](Move& a, Move &&) -> Move& { return a; }; + auto merge_fn = [](Move& a, Move&&) -> Move& { return a; }; auto identity_fn = []() { return Move(); }; diff --git a/libgluon/include/galois/graphs/GluonSubstrate.h b/libgluon/include/galois/graphs/GluonSubstrate.h index ec24bf2ce6..9d53b080ba 100644 --- a/libgluon/include/galois/graphs/GluonSubstrate.h +++ b/libgluon/include/galois/graphs/GluonSubstrate.h @@ -473,14 +473,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { void RevertHandshakeToRealGraph() { // XXX make sure I dont need anything else - masterNodes = &master_nodes_concrete_; - mirrorNodes = &(userGraph.getMirrorNodes()); + masterNodes = &master_nodes_concrete_; + mirrorNodes = &(userGraph.getMirrorNodes()); maxSharedSize = original_max_shared_size_; } - void - SetupSubgraphMirrors(std::vector>& subgraph_mirrors, - bool use_timer) { + void SetupSubgraphMirrors(std::vector>& subgraph_mirrors, + bool use_timer) { galois::StatTimer t("SubgraphMirrorSetup"); if (use_timer) { t.start(); @@ -4751,13 +4750,13 @@ class GluonSubstrate : public galois::runtime::GlobalObject { ///* // * Headers for boost serialization // */ -//#include -//#include -//#include -//#include -//#include -//#include -//#include +// #include +// #include +// #include +// #include +// #include +// #include +// #include // // public: // /** diff --git a/libgluon/include/galois/runtime/SyncStructures.h b/libgluon/include/galois/runtime/SyncStructures.h index 56cf8dd311..588403ad83 100644 --- a/libgluon/include/galois/runtime/SyncStructures.h +++ b/libgluon/include/galois/runtime/SyncStructures.h @@ -1988,7 +1988,7 @@ class FieldFlags { template \ struct GNNSumAggregate_##fieldname { \ using NodeTy = NTy; \ - using ValTy = GNNFloat; \ + using ValTy = GNNFloat; \ \ static ValTy extract(uint32_t, NodeTy&) { return 0.f; } \ \ diff --git a/libgnn/README.md b/libgnn/README.md index dbca774922..2f3bf1a3aa 100644 --- a/libgnn/README.md +++ b/libgnn/README.md @@ -91,7 +91,7 @@ code has to occur before backward is called). 
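The README hunk that follows restates a standard identity worth spelling out: with softmax outputs p over the classes and a one-hot label for class c, the gradient of the cross-entropy loss with respect to the pre-softmax logits is p_i for every incorrect class i and p_c - 1 for the correct class. A minimal sketch of that backward rule, illustrative only and not the libgnn implementation:

#include <cstddef>
#include <vector>

// Gradient of cross-entropy loss with respect to the logits, given the
// softmax outputs `p` for one node and the ground-truth class `label`.
std::vector<float> SoftmaxCrossEntropyGrad(const std::vector<float>& p,
                                           std::size_t label) {
  std::vector<float> grad(p); // wrong classes: the gradient is p_i itself
  grad[label] -= 1.0f;        // correct class: p_label - 1
  return grad;
}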
Regarding the backward step: it turns out that for single class classification, the gradient if the answer is wrong is simply -the softmax value itself, and if the answer is right, then its +the softmax value itself, and if the answer is right, then its the softmax value - 1. This has the advantage of being very numerically stable as well. @@ -106,7 +106,7 @@ ReLU activation is used by the compute layers: if the value is greater than 0, it is kept, else it is discarded. Because the forward output matrix gets overwritten during -the backward step and because the derivative of the +the backward step and because the derivative of the ReLU operation requires knowledge of what elements were affected by the ReLU, the system must *track* which elements were not set to 0 using a bitmask. This @@ -151,7 +151,7 @@ the length of the vector. Actually doing this in the feature matrix is not great as it would mean that the original weight matrix needs to double in size, and additional space would have to be allocated on top of the existing input features -with the aggregated copied over to it. +with the aggregated copied over to it. Instead of doing this, you can allocate a separate weight matrix of the same size as the original, multiply the original input @@ -187,7 +187,7 @@ after aggregation, the rows of the matrix go from IR to OR. Therefore, after linear xform, IC turns to OC. After both operations, the output matrix to the next layer is the -expected OR by OC. Depending on which one occurs first, +expected OR by OC. Depending on which one occurs first, the code generates an intermediate of OR by IC *or* IC by OC. (more than one may be needed if dropout is used as that generates a new dropout matrix). @@ -370,7 +370,7 @@ The way this works is relatively simple: the code loops through each layer and calls the forward or backward pass function on it. -Depending on how the test interval is set, between each epoch +Depending on how the test interval is set, between each epoch a test subgraph may be used to check test accuracy. The flaw with the current design is that the graph object is only aware of one 'graph' at any one point, meaning the code @@ -382,7 +382,7 @@ a status that is set on nodes based on the minibatch and only includes *local seed nodes*, so keep this in mind when using it (there have been unintentional problems where I assumed `kBatch` meant more than just local seed nodes). The main reason for this is -that it helps to distinguish local and global seed nodes to avoid +that it helps to distinguish local and global seed nodes to avoid over-calculating gradients. # GNN Graph @@ -450,7 +450,7 @@ to keep things correct. In addition, the degree of a node for each sampled phase locally is kept track of. At the end of all sampling, the degrees of the nodes at each layer are synchronized among all hosts. -This is required because normalization in aggregation uses +This is required because normalization in aggregation uses the subgraph degrees (this is actually quite annoying runtime wise as it adds this extra degree sync step). @@ -465,7 +465,7 @@ the CSR; this includes edges that may not always be active. 4) Create the local subgraph features matrix by copying them over from the original feature matrix. -In order to make row elimination easier, +In order to make row elimination easier, the SID of the vertices are ordered such that seed nodes are first, the 1-hop samples next, then 2-hops, 3-hops, etc. 
This makes it easy to eliminate vertices that aren't used after @@ -559,4 +559,4 @@ Some updates will need to be made in order to do dynamic resizing of the data depending on the size of the minibatch. The best way to avoid this in general, though, is to just allocate space for the test subgraph's k-hops since that is likely to be more expensive than whatever -the minibatch size for the train nodes are (unless it's all nodes). \ No newline at end of file +the minibatch size for the train nodes are (unless it's all nodes). diff --git a/libgnn/include/galois/graphs/DegreeSyncStructures.h b/libgnn/include/galois/graphs/DegreeSyncStructures.h index a104f18bff..d08913caf0 100644 --- a/libgnn/include/galois/graphs/DegreeSyncStructures.h +++ b/libgnn/include/galois/graphs/DegreeSyncStructures.h @@ -1,5 +1,5 @@ #include "galois/GNNTypes.h" -//#include "galois/Logging.h" +// #include "galois/Logging.h" namespace galois { namespace graphs { diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index 146daf24b3..db5df02223 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -55,8 +55,7 @@ class GNNGraph { GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, bool useWMD = false) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, - partition_scheme, has_single_class_label, - useWMD) {} + partition_scheme, has_single_class_label, useWMD) {} //! Loads a graph and all relevant metadata (labels, features, masks, etc.) GNNGraph(const std::string& input_directory, const std::string& dataset_name, @@ -72,8 +71,8 @@ class GNNGraph { std::to_string(galois::runtime::getSystemNetworkInterface().ID) + std::string("] "); // load partition - partitioned_graph_ = LoadPartition(input_directory_, dataset_name, - partition_scheme, useWMD); + partitioned_graph_ = + LoadPartition(input_directory_, dataset_name, partition_scheme, useWMD); galois::gInfo(host_prefix_, "Loading partition is completed"); // reverse edges partitioned_graph_->ConstructIncomingEdges(); @@ -918,9 +917,9 @@ class GNNGraph { } } - template < - typename T = VTy, - typename std::enable_if_t>* = nullptr> + template >* = + nullptr> void ConstructFeatureBy2HopAggregation() {} void ConstructFeatureBy2HopAggregationGPU() { @@ -934,11 +933,11 @@ class GNNGraph { void ConstructFeatureBy2HopAggregationCPU() { galois::gInfo("Construct an initial feature on CPU by " "aggregating and concatenating neighbors' features."); - //this->PrintFeatures("0hop"); - // this->FillTestNodeType(); - //this->PrintGraphTopo("before"); + // this->PrintFeatures("0hop"); + // this->FillTestNodeType(); + // this->PrintGraphTopo("before"); this->Construct1HopFeatureCPU(); - //this->PrintFeatures("1hop"); + // this->PrintFeatures("1hop"); this->Construct2HopFeatureCPU(); this->PrintFeatures("2hop"); } @@ -1009,7 +1008,6 @@ class GNNGraph { "GraphAggregateSync"); } - /// Construct feature from 2-hop neighbors. /// After `Construct1HopFeatureCPU()`, each vertex aggregates types of /// the outgoing edges and neighbors, and constructs a histogram for @@ -1089,9 +1087,9 @@ class GNNGraph { * For now, I stopped this analysis and * just enabled this method for only GCN without graph * sampling. With graph sampling, I used SAGE's graph normalization. 
- */ + */ GNNFloat GetGCNNormFactor(GraphNode lid - /*, size_t graph_user_layer_num*/) const { + /*, size_t graph_user_layer_num*/) const { #if 0 if (use_subgraph_ || use_subgraph_view_) { size_t degree; @@ -1283,7 +1281,7 @@ class GNNGraph { //! that follows SHAD GNN feature construction. This aggregates features of //! the neighbor vertices that are from (vertex's feature offset + //! 1/2 * feature length) to (vertex's feature offset + feature length), - //! to (vertex's feature offset) of the current vertex, from its proxies. + //! to (vertex's feature offset) of the current vertex, from its proxies. //! //! @param matrix_to_sync Float pointer pointing to features of the target //! vertex @@ -1296,13 +1294,13 @@ class GNNGraph { // set globals for the sync substrate if (use_timer_) { - sync_substrate_->template sync< - writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( - "SHADGraphAggregateSync"); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("SHADGraphAggregateSync"); } else { - sync_substrate_->template sync< - writeSource, readAny, SHADGNNSumAggregate, Bitset_graph_aggregate>( - "Ignore"); + sync_substrate_ + ->template sync, + Bitset_graph_aggregate>("Ignore"); } } @@ -1682,18 +1680,16 @@ class GNNGraph { // is better std::mutex label_class_set_mtx; std::unordered_set label_class_set; - galois::do_all( - galois::iterate(size_t{0}, graph.size()), - [&](size_t lid) { - local_ground_truth_labels_[lid] = graph.getData(lid).type; - label_class_set_mtx.lock(); - auto found = label_class_set.find(local_ground_truth_labels_[lid]); - if (found == label_class_set.end()) { - label_class_set.emplace(local_ground_truth_labels_[lid]); - ++num_label_classes_; - } - label_class_set_mtx.unlock(); - }); + galois::do_all(galois::iterate(size_t{0}, graph.size()), [&](size_t lid) { + local_ground_truth_labels_[lid] = graph.getData(lid).type; + label_class_set_mtx.lock(); + auto found = label_class_set.find(local_ground_truth_labels_[lid]); + if (found == label_class_set.end()) { + label_class_set.emplace(local_ground_truth_labels_[lid]); + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); // Exchange found local vertex classes with other hosts to // calculate the total number of the classes. @@ -1703,9 +1699,11 @@ class GNNGraph { // support std::set and std::unordered_set de/serialization. // TODO(hc): support this type of serialization. std::vector label_vec(label_class_set.begin(), label_class_set.end()); - auto &net = galois::runtime::getSystemNetworkInterface(); + auto& net = galois::runtime::getSystemNetworkInterface(); for (uint32_t h = 0; h < net.Num; ++h) { - if (h == net.ID) { continue; } + if (h == net.ID) { + continue; + } galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, label_vec); net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); @@ -1719,25 +1717,24 @@ class GNNGraph { std::vector h_label_vec; galois::runtime::gDeserialize(p->second, h_label_vec); - galois::do_all(galois::iterate(h_label_vec), - [&](int i) { - label_class_set_mtx.lock(); - auto found = label_class_set.find(i); - if (found == label_class_set.end()) { - label_class_set.emplace(i); - // Increaes the number of classes only if - // it was not found in the local host. 
- ++num_label_classes_; - } - label_class_set_mtx.unlock(); - } ); + galois::do_all(galois::iterate(h_label_vec), [&](int i) { + label_class_set_mtx.lock(); + auto found = label_class_set.find(i); + if (found == label_class_set.end()) { + label_class_set.emplace(i); + // Increaes the number of classes only if + // it was not found in the local host. + ++num_label_classes_; + } + label_class_set_mtx.unlock(); + }); } increment_evilPhase(); } - template < - typename T = VTy, - typename std::enable_if_t>* = nullptr> + template >* = + nullptr> void ConstructLocalLabels() {} void ReadLocalLabelsBin(const std::string& dataset_name) { @@ -2024,7 +2021,7 @@ class GNNGraph { return other_accum.reduce(); } - //! @brief Choose and set local training/validation/testing vertices + //! @brief Choose and set local training/validation/testing vertices //! consecutively. void SetLocalMasksConsecutively() { // allocate the memory for the local masks @@ -2033,20 +2030,19 @@ class GNNGraph { local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); - global_training_count_ = partitioned_graph_->globalSize() / 4; - size_t global_testing_count = global_training_count_ / 2; - global_training_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_testing_mask_range_ = { - .begin = global_training_count_, - .end = global_training_count_ + global_testing_count, - .size = global_testing_count - }; + global_training_count_ = partitioned_graph_->globalSize() / 4; + size_t global_testing_count = global_training_count_ / 2; + global_training_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_testing_mask_range_ = {.begin = global_training_count_, + .end = global_training_count_ + + global_testing_count, + .size = global_testing_count}; global_validation_mask_range_ = { .begin = global_training_count_ + global_testing_count, - .end = global_training_count_ + 2 * global_testing_count, - .size = global_testing_count - }; + .end = global_training_count_ + 2 * global_testing_count, + .size = global_testing_count}; // training for (size_t i = global_training_mask_range_.begin; i < global_training_mask_range_.end; i++) { @@ -2076,8 +2072,8 @@ class GNNGraph { //! @brief Randomly choose and set local training/validation/testing //! vertices. This mimics what AGILE GNN does through Pytorch //! `DistributedRandomSampler`. - void DistributedRandomSampling( - size_t local_sample_size, std::vector* masks) { + void DistributedRandomSampling(size_t local_sample_size, + std::vector* masks) { // Pytorch's DistributedRandomSampler, // first materializes an array populated with // 0 to (num_local_vertices - 1), shuffles this array, and @@ -2088,15 +2084,16 @@ class GNNGraph { // the current host, but also others, and mark vertices to // the corresponding mask array if they are locals. auto& net = galois::runtime::getSystemNetworkInterface(); - std::vector< - std::pair> num_masters_per_hosts(net.Num); - std::pair master_ranges = - { partitioned_graph_->getGID(0), - partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1) }; + std::vector> num_masters_per_hosts(net.Num); + std::pair master_ranges = { + partitioned_graph_->getGID(0), + partitioned_graph_->getGID(partitioned_graph_->numMasters() - 1)}; // 1) Exchange node master ranges, and so, each host knows // the range of vertex sampling. 
for (uint32_t h = 0; h < net.Num; ++h) { - if (h == net.ID) { continue; } + if (h == net.ID) { + continue; + } galois::runtime::SendBuffer b; galois::runtime::gSerialize(b, master_ranges); net.sendTagged(h, galois::runtime::evilPhase, std::move(b)); @@ -2108,34 +2105,32 @@ class GNNGraph { p = net.recieveTagged(galois::runtime::evilPhase); } while (!p); - galois::runtime::gDeserialize(p->second, - num_masters_per_hosts[p->first]); + galois::runtime::gDeserialize(p->second, num_masters_per_hosts[p->first]); } increment_evilPhase(); // 2) Sample vertices and mark them to the `masks` array // if a vertex is local. for (uint32_t h = 0; h < net.Num; ++h) { - size_t h_begin = (h == net.ID)? master_ranges.first : num_masters_per_hosts[h].first; - size_t h_end = (h == net.ID)? master_ranges.second : num_masters_per_hosts[h].second; + size_t h_begin = + (h == net.ID) ? master_ranges.first : num_masters_per_hosts[h].first; + size_t h_end = (h == net.ID) ? master_ranges.second + : num_masters_per_hosts[h].second; std::vector h_all_indices(h_end - h_begin); // Fill global vertex ids to h_global_ids. galois::do_all(galois::iterate(h_begin, h_end), - [&](size_t i) { - h_all_indices[i - h_begin] = i; - } ); + [&](size_t i) { h_all_indices[i - h_begin] = i; }); std::mt19937 rand(0); std::shuffle(h_all_indices.begin(), h_all_indices.end(), rand); galois::do_all( - galois::iterate(size_t{0}, local_sample_size), - [&](size_t i) { + galois::iterate(size_t{0}, local_sample_size), [&](size_t i) { // First, it doens't have duplications. // Second, only mark `masks` if the checking vertex is a local // vertex. if (partitioned_graph_->isLocal(h_all_indices[i])) { (*masks)[partitioned_graph_->getLID(h_all_indices[i])] = 1; } - } ); + }); } } @@ -2146,26 +2141,28 @@ class GNNGraph { local_validation_mask_.resize(partitioned_graph_->size()); local_testing_mask_.resize(partitioned_graph_->size()); - auto& net = galois::runtime::getSystemNetworkInterface(); - global_training_count_ = partitioned_graph_->globalSize() / 4; + auto& net = galois::runtime::getSystemNetworkInterface(); + global_training_count_ = partitioned_graph_->globalSize() / 4; size_t global_testing_count = global_training_count_ / 2; - size_t num_local_training_samples = global_training_count_ / net.Num; - size_t num_local_testing_samples = global_testing_count / net.Num; + size_t num_local_training_samples = global_training_count_ / net.Num; + size_t num_local_testing_samples = global_testing_count / net.Num; size_t num_local_validating_samples = num_local_testing_samples; - global_training_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_testing_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; - global_validation_mask_range_ = { - .begin = 0, .end = global_training_count_, .size = global_training_count_}; + global_training_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_testing_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; + global_validation_mask_range_ = {.begin = 0, + .end = global_training_count_, + .size = global_training_count_}; incomplete_masks_ = true; - DistributedRandomSampling( - num_local_training_samples, &local_training_mask_); - DistributedRandomSampling( - num_local_testing_samples, &local_testing_mask_); - DistributedRandomSampling( - num_local_validating_samples, &local_validation_mask_); + 
DistributedRandomSampling(num_local_training_samples, + &local_training_mask_); + DistributedRandomSampling(num_local_testing_samples, &local_testing_mask_); + DistributedRandomSampling(num_local_validating_samples, + &local_validation_mask_); } //! Read masks of local nodes only for training, validation, and testing @@ -2533,7 +2530,7 @@ class GNNGraph { ++galois::runtime::evilPhase; if (galois::runtime::evilPhase >= static_cast(std::numeric_limits::max())) { - galois::runtime::evilPhase = 1; + galois::runtime::evilPhase = 1; } } diff --git a/libgnn/include/galois/layers/GNNLayer.h b/libgnn/include/galois/layers/GNNLayer.h index 6929eb70a2..73153a44de 100644 --- a/libgnn/include/galois/layers/GNNLayer.h +++ b/libgnn/include/galois/layers/GNNLayer.h @@ -10,8 +10,8 @@ #include "galois/layers/GNNLayer.cuh" #endif -//#define PRINT_VEC_LOG_ -//#define PRINT_GPU_VEC_ +// #define PRINT_VEC_LOG_ +// #define PRINT_GPU_VEC_ namespace galois { diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index be882647a1..3931ed06e1 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -268,8 +268,7 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows * this->layer_dimensions_.output_columns); // pintemp1 contains (AF)' - UpdateEmbeddingsDerivative( - input_gradient->data(), p_in_temp_1_.data()); + UpdateEmbeddingsDerivative(input_gradient->data(), p_in_temp_1_.data()); // pback contains F' // derivative of aggregate is the same due to symmetric graph AggregateAll(this->layer_dimensions_.input_columns, p_in_temp_1_.data(), @@ -383,9 +382,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage>*, bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = (is_backward)? - this->layer_dimensions_.input_rows : - this->layer_dimensions_.output_rows; + size_t num_nodes = (is_backward) ? this->layer_dimensions_.input_rows + : this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); assert(0 == *(this->graph_.begin_owned())); @@ -414,9 +412,8 @@ class GraphConvolutionalLayer : public GNNLayer { if (!this->config_.disable_normalization) { if (this->graph_.IsSubgraphOn() || this->graph_.IsSubgraphViewOn()) { - source_norm = - this->graph_.GetDegreeNorm( - src, this->graph_user_layer_number_); + source_norm = this->graph_.GetDegreeNorm( + src, this->graph_user_layer_number_); } else { source_norm = this->graph_.GetGCNNormFactor(src); } @@ -424,8 +421,7 @@ class GraphConvolutionalLayer : public GNNLayer { // init to self if (!this->config_.disable_self_aggregate) { - graphs::bitset_graph_aggregate.set( - this->graph_.ConvertToLID(src)); + graphs::bitset_graph_aggregate.set(this->graph_.ConvertToLID(src)); // only aggregate self once on master if (src < last_master) { for (size_t i = 0; i < column_length; i++) { @@ -437,42 +433,40 @@ class GraphConvolutionalLayer : public GNNLayer { } // loop through all destinations to grab the feature to aggregate - auto e_beg = (is_backward)? - this->graph_.in_edge_begin(src) : this->graph_.edge_begin(src); - auto e_end = (is_backward)? - this->graph_.in_edge_end(src) : this->graph_.edge_end(src); + auto e_beg = (is_backward) ? this->graph_.in_edge_begin(src) + : this->graph_.edge_begin(src); + auto e_end = (is_backward) ? 
this->graph_.in_edge_end(src) + : this->graph_.edge_end(src); for (auto e = e_beg; e != e_end; e++) { if (this->layer_phase_ == GNNPhase::kTrain || this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { - bool is_sampled = (is_backward)? - this->graph_.IsInEdgeSampled( - e, this->graph_user_layer_number_) : - this->graph_.IsEdgeSampled( - e, this->graph_user_layer_number_); + bool is_sampled = (is_backward) + ? this->graph_.IsInEdgeSampled( + e, this->graph_user_layer_number_) + : this->graph_.IsEdgeSampled( + e, this->graph_user_layer_number_); // ignore non-sampled nodes and edges if (!is_sampled) { continue; } } } - size_t dst = (is_backward)? - this->graph_.GetInEdgeDest(e) : this->graph_.GetEdgeDest(e); - graphs::bitset_graph_aggregate.set( - this->graph_.ConvertToLID(src)); + size_t dst = (is_backward) ? this->graph_.GetInEdgeDest(e) + : this->graph_.GetEdgeDest(e); + graphs::bitset_graph_aggregate.set(this->graph_.ConvertToLID(src)); size_t index_to_dst_feature = dst * column_length; if (!this->config_.disable_normalization) { GNNFloat norm_scale; if (this->graph_.IsSubgraphOn() || this->graph_.IsSubgraphViewOn()) { - norm_scale = (is_backward)? - this->graph_.GetDegreeNorm( - dst, this->graph_user_layer_number_) - : source_norm; + norm_scale = (is_backward) + ? this->graph_.GetDegreeNorm( + dst, this->graph_user_layer_number_) + : source_norm; } else { - norm_scale = - source_norm * this->graph_.GetGCNNormFactor(dst); + norm_scale = source_norm * this->graph_.GetGCNNormFactor(dst); } galois::VectorMulAdd( @@ -492,8 +486,8 @@ class GraphConvolutionalLayer : public GNNLayer { galois::loopname("ConvolutionalAggregateAll")); // aggregate sync aggregate_all_sync_timer.start(); - this->graph_.AggregateSync(aggregate_output, column_length, - is_backward, num_nodes); + this->graph_.AggregateSync(aggregate_output, column_length, is_backward, + num_nodes); aggregate_all_sync_timer.stop(); } @@ -534,7 +528,7 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif AggregateAllCPU(column_length, node_embeddings, aggregate_output, pts, - is_backward); + is_backward); #ifdef GALOIS_ENABLE_GPU } #endif @@ -560,8 +554,7 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows, this->layer_dimensions_.input_columns, this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), - output); + node_embeddings, this->layer_weights_.data(), output); #ifdef GALOIS_ENABLE_GPU } #endif @@ -569,8 +562,7 @@ class GraphConvolutionalLayer : public GNNLayer { } //! Calculate graident via mxm with last layer's gradients (backward) - void UpdateEmbeddingsDerivative( - const GNNFloat* gradients, GNNFloat* output) { + void UpdateEmbeddingsDerivative(const GNNFloat* gradients, GNNFloat* output) { galois::StatTimer timer("BackwardXform", kRegionName); timer.start(); diff --git a/libgnn/include/galois/layers/ReLULayer.h b/libgnn/include/galois/layers/ReLULayer.h index 879c462330..c35704a28e 100644 --- a/libgnn/include/galois/layers/ReLULayer.h +++ b/libgnn/include/galois/layers/ReLULayer.h @@ -10,25 +10,25 @@ namespace galois { -//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in a row. -//! Currently this only works with **single class* labels and is coded as such. +//! ReLU layer: takes each row of the input matrix and sets 0 to elements < 0 in +//! a row. Currently this only works with **single class* labels and is coded as +//! such. 
template class ReLULayer : public GNNLayer { public: - ReLULayer(size_t layer_num, - const galois::graphs::GNNGraph& graph, + ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, const GNNLayerDimensions& dimensions) : ReLULayer( layer_num, graph, backward_output_matrix, dimensions, - GNNLayerConfig{.allocate_weights = false, .disable_output = true}) - {} + GNNLayerConfig{.allocate_weights = false, .disable_output = true}) { + } ReLULayer(size_t layer_num, const galois::graphs::GNNGraph& graph, PointerWithSize* backward_output_matrix, - const GNNLayerDimensions& dimensions, - const GNNLayerConfig& config) : GNNLayer(layer_num, graph, - backward_output_matrix, dimensions, config) { + const GNNLayerDimensions& dimensions, const GNNLayerConfig& config) + : GNNLayer(layer_num, graph, backward_output_matrix, dimensions, + config) { this->layer_type_ = galois::GNNLayerType::kReLU; GALOIS_LOG_ASSERT(dimensions.input_columns == dimensions.output_columns); GALOIS_LOG_VERBOSE("ReLU initialized"); @@ -79,9 +79,9 @@ class ReLULayer : public GNNLayer { return this->forward_output_matrix_; } - PointerWithSize BackwardPhaseCPU( - PointerWithSize prev_layer_input, - PointerWithSize* input_gradients) { + PointerWithSize + BackwardPhaseCPU(PointerWithSize prev_layer_input, + PointerWithSize* input_gradients) { galois::StatTimer Timer("ReLUBackward", "ReLULayer"); this->TimerStart(&Timer); @@ -102,8 +102,8 @@ class ReLULayer : public GNNLayer { for (size_t row_index = row_offset; row_index < (row_offset + feature_length); row_index++) { this->p_backward_output_matrix_[row_index] = - (prev_layer_input[row_index] > 0? 1 : 0) * - (*input_gradients)[row_index]; + (prev_layer_input[row_index] > 0 ? 1 : 0) * + (*input_gradients)[row_index]; } } }, diff --git a/libgnn/include/galois/layers/SAGELayer.h b/libgnn/include/galois/layers/SAGELayer.h index 19d5a75815..5bcaf66589 100644 --- a/libgnn/include/galois/layers/SAGELayer.h +++ b/libgnn/include/galois/layers/SAGELayer.h @@ -585,11 +585,11 @@ class SAGELayer : public GNNLayer { static const constexpr char* kRegionName = "SAGELayer"; //! 
CPU aggregation - void AggregateAllCPU( - size_t column_length, const GNNFloat* node_embeddings, - GNNFloat* aggregate_output, - galois::substrate::PerThreadStorage>*, - bool is_backward) { + void + AggregateAllCPU(size_t column_length, const GNNFloat* node_embeddings, + GNNFloat* aggregate_output, + galois::substrate::PerThreadStorage>*, + bool is_backward) { // aggregation causes a row count change size_t num_rows_to_handle; if (!is_backward) { diff --git a/libgnn/src/GNNMath.cpp b/libgnn/src/GNNMath.cpp index c25f3ae7ec..582fba95f6 100644 --- a/libgnn/src/GNNMath.cpp +++ b/libgnn/src/GNNMath.cpp @@ -57,8 +57,8 @@ void galois::VectorMulAdd(size_t length, const GNNFloat* a, const GNNFloat* b, constexpr size_t vectorization_length = 16; const size_t aligned_end = length - length % vectorization_length; __m512 scale_vec_main = _mm512_set_ps( - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, - b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, + b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale, b_scale); for (size_t i = 0; i < aligned_end; i += vectorization_length) { _mm512_storeu_ps( &output[i], diff --git a/libgnn/src/layers/DenseLayer.cpp b/libgnn/src/layers/DenseLayer.cpp index 8b13789179..e69de29bb2 100644 --- a/libgnn/src/layers/DenseLayer.cpp +++ b/libgnn/src/layers/DenseLayer.cpp @@ -1 +0,0 @@ - diff --git a/libgnn/src/layers/GNNLayer.cpp b/libgnn/src/layers/GNNLayer.cpp index 8b13789179..e69de29bb2 100644 --- a/libgnn/src/layers/GNNLayer.cpp +++ b/libgnn/src/layers/GNNLayer.cpp @@ -1 +0,0 @@ - diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index d005ddd6bc..75cb516b7a 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -94,7 +94,7 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(softmaxlayer-test softmaxlayer-test.cpp) target_link_libraries(softmaxlayer-test galois_gnn) add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - + add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) target_link_libraries(sigmoidlayer-test galois_gnn) add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) @@ -102,19 +102,19 @@ if (NOT GALOIS_ENABLE_GPU) add_executable(gnnconstruct-test gnnconstruct-test.cpp) target_link_libraries(gnnconstruct-test galois_gnn) add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - + add_executable(gnnfb-test gnnfb-test.cpp) target_link_libraries(gnnfb-test galois_gnn) add_test(NAME gnnfb-test COMMAND gnnfb-test) - + add_executable(adam-test adam-test.cpp) target_link_libraries(adam-test galois_gnn) add_test(NAME adam-test COMMAND adam-test) - + add_executable(accuracy-test accuracy-test.cpp) target_link_libraries(accuracy-test galois_gnn) add_test(NAME accuracy-test COMMAND accuracy-test) - + add_executable(epoch-test epoch-test.cpp) target_link_libraries(epoch-test galois_gnn) add_test(NAME epoch-test COMMAND epoch-test) diff --git a/libgnn/test/back-conv-test.cpp b/libgnn/test/back-conv-test.cpp index 6229c9288c..df3dfe915e 100644 --- a/libgnn/test/back-conv-test.cpp +++ b/libgnn/test/back-conv-test.cpp @@ -71,8 +71,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = 
layer_1->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/convlayer-test.cpp b/libgnn/test/convlayer-test.cpp index 1bec3b4b31..6170e87d50 100644 --- a/libgnn/test/convlayer-test.cpp +++ b/libgnn/test/convlayer-test.cpp @@ -61,8 +61,8 @@ int main() { // create the layer, no norm factor std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, - dimension_0, dcon); + std::make_unique>( + 0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner const galois::PointerWithSize layer_0_forward_output = @@ -126,8 +126,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); galois::PointerWithSize layer_1_forward_output = layer_1->ForwardPhase(test_graph.GetLocalFeatures()); @@ -203,8 +203,8 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr> layer_2 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, config); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, config); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/gcn-sample-edge-test.cpp b/libgnn/test/gcn-sample-edge-test.cpp index 8bb4e74f9a..c612639d10 100644 --- a/libgnn/test/gcn-sample-edge-test.cpp +++ b/libgnn/test/gcn-sample-edge-test.cpp @@ -37,7 +37,7 @@ int main() { galois::GNNLayerConfig dcon; dcon.disable_aggregate_after_update = false; - dcon.disable_normalization = false; + dcon.disable_normalization = false; dcon.DebugConfig(); // Choose a few sample nodes test_graph.SetSampledNode(0); @@ -52,7 +52,7 @@ int main() { test_graph.SampleAllEdges(0, false, 1); // After the above lines, nodes 0, 1, 3, 4, 5 and - // edges 0, 7, 8 should be sampled. + // edges 0, 7, 8 should be sampled. 
// So, // 0 -> 1, 2 <- 3 -> 4 GALOIS_LOG_ASSERT(test_graph.IsInSampledGraph(0)); @@ -64,9 +64,7 @@ int main() { GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(7)); GALOIS_LOG_ASSERT(test_graph.IsEdgeSampledAny(8)); - - galois::DynamicBitSet& bset = - test_graph.GetDefinitelySampledNodesBset(); + galois::DynamicBitSet& bset = test_graph.GetDefinitelySampledNodesBset(); bset.ParallelReset(); bset.set(0); bset.set(1); @@ -77,8 +75,8 @@ int main() { test_graph.EnableSubgraph(); galois::GNNLayerDimensions dimension_0; - dimension_0.input_rows = 5; - dimension_0.input_columns = 3; + dimension_0.input_rows = 5; + dimension_0.input_columns = 3; dimension_0.output_columns = 2; // Layer declaration @@ -115,8 +113,7 @@ int main() { dummy_ones_v[5] = 0; galois::PointerWithSize layer_1_backward_output = - layer_1->BackwardPhase( - test_graph.GetLocalFeatures(), &dummy_ones); + layer_1->BackwardPhase(test_graph.GetLocalFeatures(), &dummy_ones); GALOIS_LOG_ASSERT(layer_1_backward_output[0] == 0); GALOIS_LOG_ASSERT(layer_1_backward_output[1] == 0); @@ -136,7 +133,7 @@ int main() { galois::PointerWithSize layer_1_weight_gradients = layer_1->GetLayerWeightGradients(); - + GALOIS_LOG_ASSERT(layer_1_weight_gradients[0] == 6); GALOIS_LOG_ASSERT(layer_1_weight_gradients[1] == 6); GALOIS_LOG_ASSERT(layer_1_weight_gradients[2] == 6); diff --git a/libgnn/test/gnnconstruct-test.cpp b/libgnn/test/gnnconstruct-test.cpp index da0e6bd3f9..aa1513ca91 100644 --- a/libgnn/test/gnnconstruct-test.cpp +++ b/libgnn/test/gnnconstruct-test.cpp @@ -28,8 +28,8 @@ int main() { std::vector adam_sizes = {12, 28}; auto adam = std::make_unique(adam_sizes, 2); - galois::GraphNeuralNetwork - gnn(std::move(test_graph), std::move(adam), std::move(gnn_config)); + galois::GraphNeuralNetwork gnn( + std::move(test_graph), std::move(adam), std::move(gnn_config)); // note this does not include output layer GALOIS_LOG_ASSERT(gnn.num_intermediate_layers() == 2); diff --git a/libgnn/test/gnngraph-test.cpp b/libgnn/test/gnngraph-test.cpp index e4451a4900..b8a05fc8cc 100644 --- a/libgnn/test/gnngraph-test.cpp +++ b/libgnn/test/gnngraph-test.cpp @@ -16,11 +16,11 @@ int main() { // note multi level reading tested in another test GALOIS_LOG_VERBOSE("reddit with single label, oec"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kOEC, - true, false); + galois::graphs::GNNGraph( + "cora", galois::graphs::GNNPartitionScheme::kOEC, true, false); GALOIS_LOG_VERBOSE("reddit with single label, cvc"); - galois::graphs::GNNGraph("cora", galois::graphs::GNNPartitionScheme::kCVC, - true, false); + galois::graphs::GNNGraph( + "cora", galois::graphs::GNNPartitionScheme::kCVC, true, false); // below for when I want to check the remapper // galois::graphs::GNNGraph remapper("ogbn-papers100M", diff --git a/libgnn/test/gpu-back-conv-test.cpp b/libgnn/test/gpu-back-conv-test.cpp index 2df78d694d..7fedffeda6 100644 --- a/libgnn/test/gpu-back-conv-test.cpp +++ b/libgnn/test/gpu-back-conv-test.cpp @@ -54,8 +54,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); galois::PointerWithSize dummy_ones = layer_1->AllocateGPU(dummy_ones_v); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); diff --git a/libgnn/test/gpu-convlayer-test.cpp b/libgnn/test/gpu-convlayer-test.cpp index a36740b5e3..dc5a4ad917 100644 --- 
a/libgnn/test/gpu-convlayer-test.cpp +++ b/libgnn/test/gpu-convlayer-test.cpp @@ -53,8 +53,8 @@ int main() { // create the layer, no norm factor std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, - dimension_0, dcon); + std::make_unique>( + 0, test_graph, &p_null, dimension_0, dcon); layer_0->InitAllWeightsTo1(); // make sure it runs in a sane manner layer_0->ForwardPhase(test_graph.GetLocalFeatures()); @@ -113,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& layer_1_forward_output = @@ -195,8 +195,8 @@ int main() { // don't have time for at the moment // TODO in future maybe add better unit test for this std::unique_ptr> layer_2 = - std::make_unique>(2, test_graph, &p_back, - dimension_0, config); + std::make_unique>( + 2, test_graph, &p_back, dimension_0, config); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); // pointer is to GPU memory: copy it over to a CPU source for verification const std::vector& l2_fo = diff --git a/libgnn/test/gpu-sage-layer-test.cpp b/libgnn/test/gpu-sage-layer-test.cpp index 7af3808c85..bbe5cc97cb 100644 --- a/libgnn/test/gpu-sage-layer-test.cpp +++ b/libgnn/test/gpu-sage-layer-test.cpp @@ -47,8 +47,8 @@ int main() { scon.disable_concat = false; std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, dimension_0, - dcon, scon); + std::make_unique>(0, test_graph, &p_null, + dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -121,8 +121,8 @@ int main() { //////////////////////////////////////////////////////////////////////////////// // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon, scon); + auto layer_1 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -217,8 +217,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique>(2, test_graph, &p_back, - dimension_0, config, scon); + auto layer_2 = std::make_unique>( + 2, test_graph, &p_back, dimension_0, config, scon); layer_2->ForwardPhase(test_graph.GetLocalFeatures()); const std::vector& l2_fo = layer_2->CopyForwardOutputFromGPU(); diff --git a/libgnn/test/l2norm-layer-test.cpp b/libgnn/test/l2norm-layer-test.cpp index 6d6b30942e..d2b659f238 100644 --- a/libgnn/test/l2norm-layer-test.cpp +++ b/libgnn/test/l2norm-layer-test.cpp @@ -38,8 +38,8 @@ int main() { std::vector back_matrix(14); galois::PointerWithSize p_back(back_matrix); - auto l2_layer = std::make_unique>(2, test_graph, &p_back, - dimension_0); + auto l2_layer = std::make_unique>( + 2, test_graph, &p_back, dimension_0); galois::PointerWithSize normed = l2_layer->ForwardPhase(l2_input); diff --git a/libgnn/test/mkl_micro.cpp b/libgnn/test/mkl_micro.cpp index 73b3a08893..a2e68fa9df 100644 --- a/libgnn/test/mkl_micro.cpp +++ b/libgnn/test/mkl_micro.cpp @@ -82,7 +82,7 @@ int main(int argc, char* argv[]) { // dimensions from test case size_t a_dim = 12000000; - //size_t a_dim = 120000; + // size_t a_dim = 
120000; size_t b_dim = 128; size_t c_dim = 16; @@ -90,7 +90,7 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - //std::vector matrix_3(a_dim * c_dim); + // std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; @@ -126,16 +126,19 @@ int main(int argc, char* argv[]) { auto start = std::chrono::high_resolution_clock::now(); // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version - //CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), + // CBlasSGEMM(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + // matrix_1.data(), // matrix_2.data(), matrix_3.data()); CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), matrix_2.data(), matrix_3.data()); - //CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); + // CBlasSGEMM(CblasNoTrans, CblasTrans, b_dim, a_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::time_point_cast(stop) - - std::chrono::time_point_cast(start); + auto duration = + std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); printf("Run duration is %lf ms\n", duration.count() / 1000.0); } diff --git a/libgnn/test/sage-layer-test.cpp b/libgnn/test/sage-layer-test.cpp index 8551126d37..3f53921795 100644 --- a/libgnn/test/sage-layer-test.cpp +++ b/libgnn/test/sage-layer-test.cpp @@ -33,8 +33,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); std::unique_ptr> layer_0 = - std::make_unique>(0, test_graph, &p_null, dimension_0, - dcon, scon); + std::make_unique>(0, test_graph, &p_null, + dimension_0, dcon, scon); layer_0->InitAllWeightsTo1(); // sage weights for self layer_0->InitSelfWeightsTo1(); @@ -113,8 +113,8 @@ int main() { // create layer 1 for testing backward prop actually giving weights back - auto layer_1 = std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon, scon); + auto layer_1 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon, scon); layer_1->InitAllWeightsTo1(); layer_1->InitSelfWeightsTo1(); @@ -205,8 +205,8 @@ int main() { // (verification requires floating point accuracy or setting a seed which I // don't have time for at the moment // TODO in future maybe add better unit test for this - auto layer_2 = std::make_unique>(1, test_graph, &p_back, - dimension_0, config, scon); + auto layer_2 = std::make_unique>( + 1, test_graph, &p_back, dimension_0, config, scon); galois::PointerWithSize l2_fo = layer_2->ForwardPhase(test_graph.GetLocalFeatures()); GALOIS_LOG_ASSERT(l2_fo.size() == 14); diff --git a/libgnn/test/sample-test.cpp b/libgnn/test/sample-test.cpp index 0bda9d81a8..d875a72ee4 100644 --- a/libgnn/test/sample-test.cpp +++ b/libgnn/test/sample-test.cpp @@ -60,8 +60,8 @@ int main() { galois::PointerWithSize p_back(back_matrix); std::unique_ptr> layer_1 = - std::make_unique>(1, test_graph, &p_back, - dimension_0, dcon); + std::make_unique>( + 1, test_graph, &p_back, dimension_0, dcon); layer_1->InitAllWeightsTo1(); layer_1->EnableSampling(); diff --git a/libgnn/test/single_mkl_micro.cpp b/libgnn/test/single_mkl_micro.cpp index 7111b1b057..97035bdfba 100644 --- a/libgnn/test/single_mkl_micro.cpp +++ b/libgnn/test/single_mkl_micro.cpp @@ -20,25 +20,26 @@ // MKL wrapper #ifdef USE_OMP void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, 
const CBLAS_TRANSPOSE trans_b, - size_t input_rows, size_t input_columns, size_t output_columns, - const float* a, const float* b, float* output) { + size_t input_rows, size_t input_columns, + size_t output_columns, const float* a, const float* b, + float* output) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = (trans_b == CblasNoTrans) ? output_columns : input_columns; - #pragma omp parallel for +#pragma omp parallel for for (int i = 0; i < omp_get_num_threads(); i++) { unsigned chunk_size = input_rows / omp_get_num_threads(); - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); if (omp_get_num_threads() - 1 == i) { my_end = input_rows; } unsigned rows_to_use = my_end - my_start; const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); + float* my_output = output + (my_start * output_columns); // do the MM cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, @@ -49,9 +50,10 @@ void CBlasSGEMMOMP(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) -void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans_b, - size_t input_rows, size_t input_columns, size_t output_columns, - const float* a, const float* b, float* output) { +void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, + const CBLAS_TRANSPOSE trans_b, size_t input_rows, + size_t input_columns, size_t output_columns, + const float* a, const float* b, float* output) { // set lead dimension based on cblas spec w.r.t. transpose setting size_t lead_dim_a = (trans_a == CblasNoTrans) ? input_columns : input_rows; size_t lead_dim_b = @@ -62,46 +64,44 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans temps.resize(galois::getActiveThreads()); } - galois::on_each( - [&] (size_t i, size_t num_threads) { - if (trans_a != CblasTrans) { - unsigned chunk_size = input_rows / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_rows; - } - unsigned rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_columns); - float* my_output = output + (my_start * output_columns); - - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, - input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, - false ? 1.0 : 0.0, my_output, output_columns); - } else { - galois::PODResizeableArray& my_pod = temps[i]; - my_pod.resize(input_rows * output_columns); - - unsigned chunk_size = input_columns / num_threads; - unsigned my_start = chunk_size * i; - unsigned my_end = chunk_size * (i + 1); - if (num_threads - 1 == i) { - my_end = input_columns; - } - unsigned b_rows_to_use = my_end - my_start; - - const float* my_a = a + (my_start * input_rows); - const float* my_b = b + (my_start * output_columns); - - // do the MM - cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, - b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, - false ? 
1.0 : 0.0, my_pod.data(), output_columns); + galois::on_each([&](size_t i, size_t num_threads) { + if (trans_a != CblasTrans) { + unsigned chunk_size = input_rows / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_rows; } + unsigned rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_columns); + float* my_output = output + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, rows_to_use, output_columns, + input_columns, 1.0, my_a, lead_dim_a, b, lead_dim_b, + false ? 1.0 : 0.0, my_output, output_columns); + } else { + galois::PODResizeableArray& my_pod = temps[i]; + my_pod.resize(input_rows * output_columns); + + unsigned chunk_size = input_columns / num_threads; + unsigned my_start = chunk_size * i; + unsigned my_end = chunk_size * (i + 1); + if (num_threads - 1 == i) { + my_end = input_columns; + } + unsigned b_rows_to_use = my_end - my_start; + + const float* my_a = a + (my_start * input_rows); + const float* my_b = b + (my_start * output_columns); + + // do the MM + cblas_sgemm(CblasRowMajor, trans_a, trans_b, input_rows, output_columns, + b_rows_to_use, 1.0, my_a, lead_dim_a, my_b, lead_dim_b, + false ? 1.0 : 0.0, my_pod.data(), output_columns); } - ); + }); if (trans_a == CblasTrans) { printf("Manual summation\n"); @@ -114,7 +114,6 @@ void CBlasSGEMMGalois(const CBLAS_TRANSPOSE trans_a, const CBLAS_TRANSPOSE trans } #endif - void CacheFlush(std::vector* matrix) { for (size_t i = 0; i < matrix->size(); i++) { (*matrix)[i] = i; @@ -155,7 +154,7 @@ int main(int argc, char* argv[]) { std::vector matrix_1(a_dim * b_dim); std::vector matrix_2(a_dim * c_dim); // output - //std::vector matrix_3(a_dim * c_dim); + // std::vector matrix_3(a_dim * c_dim); std::vector matrix_3(b_dim * c_dim); size_t kBigSize = 1000000000; @@ -184,21 +183,24 @@ int main(int argc, char* argv[]) { // transpose because it's the same as the problematic call in GNN // TODO(loc) non transpose version #ifdef USE_OMP - CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), - matrix_2.data(), matrix_3.data()); + CBlasSGEMMOMP(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif #if defined(USE_SHARED_GALOIS) || defined(USE_DIST_GALOIS) - //CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); - CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), - matrix_2.data(), matrix_3.data()); + // CBlasSGEMMGalois(CblasNoTrans, CblasNoTrans, a_dim, b_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); + CBlasSGEMMGalois(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, + matrix_1.data(), matrix_2.data(), matrix_3.data()); #endif - //CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, matrix_1.data(), - // matrix_2.data(), matrix_3.data()); + // CBlasSGEMM(CblasTrans, CblasNoTrans, b_dim, a_dim, c_dim, + // matrix_1.data(), + // matrix_2.data(), matrix_3.data()); auto stop = std::chrono::high_resolution_clock::now(); - auto duration = std::chrono::time_point_cast(stop) - - std::chrono::time_point_cast(start); + auto duration = + std::chrono::time_point_cast(stop) - + std::chrono::time_point_cast(start); printf("Run duration is %lf ms\n", duration.count() / 1000.0); } From ce1a079890e8182946d2914a3e453864b93db709 Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" 
<133701794+hochanlee-amd@users.noreply.github.com> Date: Mon, 23 Oct 2023 22:05:52 -0500 Subject: [PATCH 610/660] Make linking MKL libraries in a portable way This commit removes the absolute path to link the MKL library from CMakeLists and source codes, but uses Intel's MKL CMake configuration file and achieves portable linking. --- CMakeLists.txt | 4 +- libgnn/CMakeLists.txt | 33 +++-- libgnn/README.md | 18 +++ libgnn/test/CMakeLists.txt | 249 ++++++++++++++----------------------- 4 files changed, 126 insertions(+), 178 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 88eaa64d74..4731b8b99d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -141,9 +141,7 @@ endif() # TODO (loc) prefix with GALOIS, move elsewhere more fitting in this file ################################################################################ if(USE_MKL_BLAS) - SET(MKL_ROOT /home/hochan/intel/oneapi/mkl/2023.1.0) - find_package(MKL REQUIRED) - message(STATUS "MKL: ${MKL_INCLUDE_DIRS}") + find_package(MKL CONFIG REQUIRED PATH $ENV{MKL_ROOT}) if (MKL_FOUND) else() message(WARNING "MKL not found") diff --git a/libgnn/CMakeLists.txt b/libgnn/CMakeLists.txt index ca799c34b4..030e5bb516 100644 --- a/libgnn/CMakeLists.txt +++ b/libgnn/CMakeLists.txt @@ -6,34 +6,29 @@ set(sources src/graphs/GNNGraph.cpp ) -## TODO(hc): Note that these libraries should be hard-coded -## based on your own system. -## These should be automatic library linking. -set(MKL_LIBRARIES ${MKL_ROOT}/lib/intel64) -set(INTEL_COMPILER_LIBRARIES /home/hochan/intel/oneapi/compiler/2023.1.0/linux/compiler/lib/intel64_lin) -set(INTEL_LIBS "-lmkl_intel_lp64 -lmkl_intel_thread -lmkl_core -liomp5") -set(SINGLE_INTEL_LIBS "-lmkl_intel_lp64 -lmkl_sequential -lmkl_core") - add_library(galois_gnn STATIC ${sources}) -target_link_directories(galois_gnn PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn ${INTEL_LIBS}) -target_link_directories(galois_gnn PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_link_libraries(galois_gnn galois_shmem) -target_link_libraries(galois_gnn galois_dist_async galois_cusp galois_gluon galois_support) +target_compile_options(galois_gnn PUBLIC + $) +target_include_directories(galois_gnn PUBLIC + $) +target_link_libraries(galois_gnn PUBLIC $) +target_link_libraries(galois_gnn PUBLIC galois_shmem) +target_link_libraries(galois_gnn PUBLIC galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include - ${MKL_INCLUDE_DIRS} ) add_library(galois_gnn_single STATIC ${sources}) -target_link_directories(galois_gnn_single PUBLIC ${MKL_LIBRARIES}) -target_link_libraries(galois_gnn_single galois_shmem) -target_link_libraries(galois_gnn_single ${SINGLE_INTEL_LIBS}) -target_link_libraries(galois_gnn_single galois_dist_async galois_cusp galois_gluon galois_support) +target_compile_options(galois_gnn_single PUBLIC + $) +target_include_directories(galois_gnn_single PUBLIC + $) +target_link_libraries(galois_gnn_single PUBLIC $) +target_link_libraries(galois_gnn_single PUBLIC galois_shmem) +target_link_libraries(galois_gnn_single PUBLIC galois_dist_async galois_cusp galois_gluon galois_support) target_include_directories(galois_gnn_single PUBLIC ${CMAKE_CURRENT_SOURCE_DIR}/include - ${MKL_INCLUDE_DIRS} ) set_target_properties(galois_gnn PROPERTIES EXPORT_NAME galois_gnn) diff --git a/libgnn/README.md b/libgnn/README.md index 2f3bf1a3aa..ded103c9b9 100644 --- a/libgnn/README.md +++ b/libgnn/README.md @@ -560,3 +560,21 @@ data depending on the size of the 
minibatch. The best way to avoid this in general, though, is to just allocate space for the test subgraph's k-hops since that is likely to be more expensive than whatever the minibatch size for the train nodes are (unless it's all nodes). + +Author: Hochan Lee, + +# Intel Open API MKL + +Galois-GNN requires Intel Math Kernel Library (MKL), and so, you are required to +install Intel oneAPI. This toolkit contains all the necessary tools and libraries including +the MKL library. We recommend to get Intel oneAPI >= 2023.1.0. from the Intel official website +(https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html) +as this is what we have used and tested. + +Once you followed their instruction and installed Intel oneAPI, +you should export the MKL path in the installation path to your environment +before you cmake and install Galois-GNN. CMakeLists.txt will look for the MKL root path. + +```Shell +export MKL_ROOT=[THE PARENT PATH OF THE INSTALLATION PATH]/intel/openapi/mkl/2023.1.0 +``` diff --git a/libgnn/test/CMakeLists.txt b/libgnn/test/CMakeLists.txt index 75cb516b7a..40efcfa0e3 100644 --- a/libgnn/test/CMakeLists.txt +++ b/libgnn/test/CMakeLists.txt @@ -1,75 +1,5 @@ find_package(OpenMP) -add_executable(mkl_micro mkl_micro.cpp) -target_link_directories(mkl_micro PUBLIC ${MKL_LIBRARIES}) -target_link_directories(mkl_micro PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_include_directories(mkl_micro PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(mkl_micro ${INTEL_LIBS}) - -add_executable(mkl_micro_omp mkl_micro.cpp) -target_link_directories(mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) -target_link_directories(mkl_micro_omp PUBLIC ${INTEL_COMPILER_LIBRARIES}) -target_include_directories(mkl_micro_omp PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) -target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) - -add_executable(mkl_micro_sgalois mkl_micro.cpp) -target_link_libraries(mkl_micro_sgalois galois_gnn) -target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) - -add_executable(mkl_micro_dgalois mkl_micro.cpp) -target_link_libraries(mkl_micro_dgalois galois_gnn) -target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) - -add_executable(remapverify remapverify.cpp) -target_link_libraries(remapverify galois_gnn) -target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) - -add_executable(mkl_micro_delete_galois mkl_micro.cpp) -target_link_libraries(mkl_micro_delete_galois galois_gnn) -target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) - -################################################################################ - -#add_executable(single_mkl_micro single_mkl_micro.cpp) -#target_link_directories(single_mkl_micro PUBLIC ${MKL_LIBRARIES}) -#target_include_directories(single_mkl_micro PUBLIC -# ${MKL_INCLUDE_DIRS} -#) -#target_link_libraries(single_mkl_micro ${SINGLE_INTEL_LIBS}) - -add_executable(single_mkl_micro_omp single_mkl_micro.cpp) -target_link_directories(single_mkl_micro_omp PUBLIC ${MKL_LIBRARIES}) -target_include_directories(single_mkl_micro_omp PUBLIC - ${MKL_INCLUDE_DIRS} -) -target_link_libraries(single_mkl_micro_omp ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) -target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) - -add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) -target_link_libraries(single_mkl_micro_sgalois galois_gnn_single) -target_compile_definitions(single_mkl_micro_sgalois 
PUBLIC USE_SHARED_GALOIS=1) - -add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) -target_link_libraries(single_mkl_micro_dgalois galois_gnn_single) -target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) - -################################################################################ - -add_executable(gstl_test gstl_test.cpp) -target_link_libraries(gstl_test galois_shmem) - -################################################################################ - -add_executable(gnngraph-test gnngraph-test.cpp) -target_link_libraries(gnngraph-test galois_gnn) -add_test(NAME gnngraph-test COMMAND gnngraph-test) - -# multihost testing things set(hosts) set(host 12) while (${host} GREATER 1) @@ -78,105 +8,50 @@ while (${host} GREATER 1) endwhile() list(APPEND hosts "1") -if (NOT GALOIS_ENABLE_GPU) - add_executable(convlayer-test convlayer-test.cpp) - target_link_libraries(convlayer-test galois_gnn) - add_test(NAME convlayer-test COMMAND convlayer-test) - - add_executable(sage-layer-test sage-layer-test.cpp) - target_link_libraries(sage-layer-test galois_gnn) - add_test(NAME sage-layer-test COMMAND sage-layer-test) - - add_executable(l2norm-layer-test l2norm-layer-test.cpp) - target_link_libraries(l2norm-layer-test galois_gnn) - add_test(NAME l2norm-layer-test COMMAND l2norm-layer-test) - - add_executable(softmaxlayer-test softmaxlayer-test.cpp) - target_link_libraries(softmaxlayer-test galois_gnn) - add_test(NAME softmaxlayer-test COMMAND softmaxlayer-test) - - add_executable(sigmoidlayer-test sigmoidlayer-test.cpp) - target_link_libraries(sigmoidlayer-test galois_gnn) - add_test(NAME sigmoidlayer-test COMMAND sigmoidlayer-test) - - add_executable(gnnconstruct-test gnnconstruct-test.cpp) - target_link_libraries(gnnconstruct-test galois_gnn) - add_test(NAME gnnconstruct-test COMMAND gnnconstruct-test) - - add_executable(gnnfb-test gnnfb-test.cpp) - target_link_libraries(gnnfb-test galois_gnn) - add_test(NAME gnnfb-test COMMAND gnnfb-test) - - add_executable(adam-test adam-test.cpp) - target_link_libraries(adam-test galois_gnn) - add_test(NAME adam-test COMMAND adam-test) - - add_executable(accuracy-test accuracy-test.cpp) - target_link_libraries(accuracy-test galois_gnn) - add_test(NAME accuracy-test COMMAND accuracy-test) - - add_executable(epoch-test epoch-test.cpp) - target_link_libraries(epoch-test galois_gnn) - add_test(NAME epoch-test COMMAND epoch-test) - - add_executable(multilabel-epoch-test multilabel-epoch-test.cpp) - target_link_libraries(multilabel-epoch-test galois_gnn) - add_test(NAME multilabel-epoch-test COMMAND multilabel-epoch-test) +add_executable(gnngraph-test gnngraph-test.cpp) +target_link_libraries(gnngraph-test galois_gnn) +add_test(NAME gnngraph-test COMMAND gnngraph-test) +if (NOT GALOIS_ENABLE_GPU) + set(GALOIS_TESTS + ${GALOIS_TESTS} + convlayer-test + sage-layer-test + l2norm-layer-test + softmaxlayer-test + sigmoidlayer-test + gnnconstruct-test + gnnfb-test + adam-test + accuracy-test + epoch-test + multilabel-epoch-test + multilabel-read + f1-test + sample-bit-test + gcn-sample-edge-test + ) add_executable(aggregate-sync-test aggregate-sync-test.cpp) target_link_libraries(aggregate-sync-test galois_gnn) foreach(host_count ${hosts}) add_test(NAME run-aggsync-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./aggregate-sync-test) set_tests_properties(run-aggsync-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - add_executable(back-conv-test back-conv-test.cpp) 
target_link_libraries(back-conv-test galois_gnn) foreach(host_count ${hosts}) add_test(NAME run-back-conv-${host_count} COMMAND mpiexec --bind-to none -n ${host_count} ./back-conv-test) set_tests_properties(run-back-conv-${host_count} PROPERTIES ENVIRONMENT "GALOIS_DO_NOT_BIND_THREADS=1") endforeach() - - add_executable(multilabel-read multilabel-read.cpp) - target_link_libraries(multilabel-read galois_gnn) - add_test(NAME multilabel-read COMMAND multilabel-read) - - add_executable(f1-test f1-test.cpp) - target_link_libraries(f1-test galois_gnn) - add_test(NAME f1-test COMMAND f1-test) - - #add_executable(sample-test sample-test.cpp) - #target_link_libraries(sample-test galois_gnn) - #add_test(NAME sample-test COMMAND sample-test) - - add_executable(sample-bit-test sample-bit-test.cpp) - target_link_libraries(sample-bit-test galois_gnn) - add_test(NAME sample-bit-test COMMAND sample-bit-test) - - add_executable(gcn-sample-edge-test gcn-sample-edge-test.cpp) - target_link_libraries(gcn-sample-edge-test galois_gnn) - add_test(NAME gcn-sample-edge-test COMMAND gcn-sample-edge-test) else() - add_executable(gpu-sage-layer-test gpu-sage-layer-test.cpp) - target_link_libraries(gpu-sage-layer-test galois_gnn) - add_test(NAME gpu-sage-layer-test COMMAND gpu-sage-layer-test) - - add_executable(gpu-convlayer-test gpu-convlayer-test.cpp) - target_link_libraries(gpu-convlayer-test galois_gnn) - add_test(NAME gpu-convlayer-test COMMAND gpu-convlayer-test) - - add_executable(gpu-softmaxlayer-test gpu-softmaxlayer-test.cpp) - target_link_libraries(gpu-softmaxlayer-test galois_gnn) - add_test(NAME gpu-softmaxlayer-test COMMAND gpu-softmaxlayer-test) - - add_executable(gpu-adam-test gpu-adam-test.cpp) - target_link_libraries(gpu-adam-test galois_gnn) - add_test(NAME gpu-adam-test COMMAND gpu-adam-test) - - add_executable(gpu-epoch-test gpu-epoch-test.cpp) - target_link_libraries(gpu-epoch-test galois_gnn) - #add_test(NAME gpu-epoch-test COMMAND gpu-epoch-test) - + set(GALOIS_TESTS + ${GALOIS_TESTS} + gpu-sage-layer-test + gpu-convlayer-test + gpu-softmaxlayer-test + gpu-adam-test + gpu-epoch-test + ) add_executable(gpu-aggregate-sync-test gpu-aggregate-sync-test.cpp) target_link_libraries(gpu-aggregate-sync-test galois_gnn) @@ -202,4 +77,66 @@ else() endforeach() endif() -# TODO multi host tests? 
+message("Galois Tests..") +foreach(galois_test ${GALOIS_TESTS}) + add_executable(${galois_test} ${galois_test}.cpp) + target_link_libraries(${galois_test} galois_gnn) + add_test(NAME ${galois_test} COMMAND ${galois_test}) +endforeach() + +add_executable(remapverify remapverify.cpp) +target_link_libraries(remapverify galois_gnn) +target_compile_definitions(remapverify PUBLIC USE_DIST_GALOIS=1) + +# MKL Test +set(MKL_TESTS + mkl_micro_sgalois + mkl_micro_dgalois + mkl_micro_delete_galois + single_mkl_micro_sgalois + single_mkl_micro_dgalois + mkl_micro + mkl_micro_omp + single_mkl_micro_omp +) + +add_executable(mkl_micro_sgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_sgalois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(mkl_micro_dgalois mkl_micro.cpp) +target_link_libraries(mkl_micro_dgalois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +add_executable(mkl_micro_delete_galois mkl_micro.cpp) +target_link_libraries(mkl_micro_delete_galois PUBLIC galois_gnn) +target_compile_definitions(mkl_micro_delete_galois PUBLIC USE_SHARED_GALOIS_DELETE=1) + +add_executable(single_mkl_micro_sgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_sgalois PUBLIC galois_gnn_single) +target_compile_definitions(single_mkl_micro_sgalois PUBLIC USE_SHARED_GALOIS=1) + +add_executable(single_mkl_micro_dgalois single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_dgalois PUBLIC galois_gnn_single) +target_compile_definitions(single_mkl_micro_dgalois PUBLIC USE_DIST_GALOIS=1) + +add_executable(mkl_micro mkl_micro.cpp) + +add_executable(mkl_micro_omp mkl_micro.cpp) +target_link_libraries(mkl_micro_omp PUBLIC ${INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(mkl_micro_omp PUBLIC USE_OMP=1) + +add_executable(single_mkl_micro_omp single_mkl_micro.cpp) +target_link_libraries(single_mkl_micro_omp PUBLIC ${SINGLE_INTEL_LIBS} OpenMP::OpenMP_CXX) +target_compile_definitions(single_mkl_micro_omp PUBLIC USE_OMP=1) + +foreach(mkl_test ${MKL_TESTS}) + target_compile_options(${mkl_test} PUBLIC + $) + target_include_directories(${mkl_test} PUBLIC + $) + target_link_libraries(${mkl_test} PUBLIC $) +endforeach() + +add_executable(gstl_test gstl_test.cpp) +target_link_libraries(gstl_test galois_shmem) From 80cf95269c34be52146784627e7287ac779f7c59 Mon Sep 17 00:00:00 2001 From: patrickkenney9801 Date: Tue, 24 Oct 2023 18:29:29 -0500 Subject: [PATCH 611/660] feat: Add Dockerfile to repo --- .gitignore | 1 + CONTRIBUTING.md | 6 +++++ Dockerfile | 66 +++++++++++++++++++++++++++++++++++++++++++++++++ Makefile | 21 ++++++++++++++++ 4 files changed, 94 insertions(+) create mode 100644 Dockerfile diff --git a/.gitignore b/.gitignore index 94fc673c6e..8f0aff5b96 100644 --- a/.gitignore +++ b/.gitignore @@ -24,6 +24,7 @@ tags # no build files /build* +/dockerbuild* # no python build artifacts *.pyc diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 2297468d67..007227dc70 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,6 +6,12 @@ tools like `clang-format` manually. Code should be clear and documented where needed. +## Setup + +Users can run `make docker-image` to setup all dependecies needed for +`pando-galois`. After creating the image it can be run via `make docker`. +And for first time cmake users can run `make run-cmake`. 
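For example, a first-time setup could look like the sequence below (a sketch based on the Makefile targets added in this patch; `cmake --build` is used as a generic build step, and `BUILD_DIR` is exported inside the container by the Dockerfile):

```Shell
# Build the development image with your user baked in, then enter it.
make docker-image
make docker

# Inside the container: configure the build tree, then compile.
make run-cmake
cmake --build ${BUILD_DIR}
```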
+ ## Tools ### [asdf](https://asdf-vm.com) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..d49b9d3211 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,66 @@ +ARG BUILD_IMAGE=ubuntu:22.04 +FROM --platform=linux/amd64 ${BUILD_IMAGE} AS build + +WORKDIR /tmp + +ENV DEBIAN_FRONTEND=noninteractive +RUN apt update && \ + apt install -y \ + cmake \ + gcc \ + g++ \ + build-essential \ + make \ + libboost-all-dev \ + libfmt-dev \ + libzstd-dev \ + lsb-release \ + wget \ + software-properties-common \ + gnupg \ + gdb \ + vim \ + git \ + python3 \ + python3-pip \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +# setup intel repo for intel-basekit +RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | \ + gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null +RUN echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | \ + tee /etc/apt/sources.list.d/oneAPI.list +RUN apt update && \ + apt install -y \ + intel-basekit \ + && apt-get clean && rm -rf /var/lib/apt/lists/* + +RUN bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" + +ARG SRC_DIR=/pando-galois +ARG BUILD_DIR=/pando-galois/dockerbuild +ARG UNAME +ARG UID +ARG GID + +RUN if [ "${UNAME}" != "root" ] ; then groupadd -g ${GID} ${UNAME} \ + && useradd -ms /bin/bash -u "${UID}" -g "${GID}" ${UNAME} ; fi + +RUN mkdir -p /home/${UNAME} \ + && chown ${UNAME}:${UNAME} /home/${UNAME} + +USER ${UNAME} +WORKDIR /home/${UNAME} +ENV BUILD_DIR=${BUILD_DIR} + +RUN pip3 install compdb pre-commit cpplint "clang-format>=12.0.1" + +RUN echo "PATH=/home/${UNAME}/.local/bin/:\$PATH" >> /home/${UNAME}/.zshenv + +RUN echo "export SRC_DIR=${SRC_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export BUILD_DIR=${BUILD_DIR}" >> /home/${UNAME}/.bashrc +RUN echo "export OMPI_ALLOW_RUN_AS_ROOT=1" >> /home/${UNAME}/.bashrc +RUN echo "export OMPI_ALLOW_RUN_AS_ROOT_CONFIRM=1" >> /home/${UNAME}/.bashrc +RUN echo "export MKL_ROOT=/opt/intel/oneapi/mkl/2023.2.0" >> /home/${UNAME}/.bashrc + +WORKDIR ${SRC_DIR} diff --git a/Makefile b/Makefile index 2457b3c0a1..df77923812 100644 --- a/Makefile +++ b/Makefile @@ -1,3 +1,9 @@ +SHELL := /bin/bash + +IMAGE_NAME := pando-galois +VERSION := 0.0.1 +CONTAINER_SRC_DIR := /pando-galois + dependencies: dependencies-asdf dependencies-asdf: @@ -18,3 +24,18 @@ hooks: pre-commit: @pre-commit run -a + +docker-image: + @docker --context default build --build-arg VERSION=${VERSION} \ + --build-arg UNAME=$(shell whoami) \ + --build-arg UID=$(shell id -u) \ + --build-arg GID=$(shell id -g) \ + -t ${IMAGE_NAME}:${VERSION} \ + --file Dockerfile \ + --target build . + +docker: + @docker --context default run --rm -v $(shell pwd)/:${CONTAINER_SRC_DIR} --privileged --workdir=${CONTAINER_SRC_DIR} -it ${IMAGE_NAME}:${VERSION} bash -l + +run-cmake: + @cmake -S . -B ${BUILD_DIR} -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_MKL_BLAS=ON -DGALOIS_ENABLE_DIST=ON From 5fadc59352102f606737ecc69795e42812f61c5b Mon Sep 17 00:00:00 2001 From: "Lee, Hochan" <133701794+hochanlee-amd@users.noreply.github.com> Date: Wed, 1 Nov 2023 11:30:50 -0500 Subject: [PATCH 612/660] Fix for GCN minibatching (#19) GCN didn't correctly check minibatch subgraphs, and this commit fixes that. 
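Part of the fix is how epoch accuracy is reported when training with minibatches: instead of keeping only the last batch's accuracy, per-batch (correct, checked) counts are accumulated and divided once at the end of the epoch. A minimal standalone sketch of that accumulation pattern (illustrative names only, not the library API):

```cpp
#include <utility>
#include <vector>

// Each minibatch reports (number of correct predictions, number of nodes checked),
// mirroring what GetGlobalAccuracyCheckResult() returns in the diff below.
using BatchResult = std::pair<float, float>;

// Epoch accuracy is the ratio of the accumulated counts, not the mean of per-batch ratios.
float EpochAccuracy(const std::vector<BatchResult>& batches) {
  float correct = 0.0f, total_checked = 0.0f;
  for (const auto& b : batches) {
    correct += b.first;
    total_checked += b.second;
  }
  return total_checked > 0.0f ? correct / total_checked : 0.0f;
}
```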
--- libgnn/include/galois/GraphNeuralNetwork.h | 29 +++++-- libgnn/include/galois/graphs/GNNGraph.h | 83 ++++++++++++++----- .../galois/layers/GraphConvolutionalLayer.h | 48 +++++++---- libgnn/include/galois/layers/SoftmaxLayer.h | 5 +- 4 files changed, 120 insertions(+), 45 deletions(-) diff --git a/libgnn/include/galois/GraphNeuralNetwork.h b/libgnn/include/galois/GraphNeuralNetwork.h index 88a48f961c..1d9108fbe5 100644 --- a/libgnn/include/galois/GraphNeuralNetwork.h +++ b/libgnn/include/galois/GraphNeuralNetwork.h @@ -500,8 +500,10 @@ class GraphNeuralNetwork { galois::StatTimer epoch_timer("TrainingTime", kRegionName); galois::StatTimer validation_timer("ValidationTime", kRegionName); galois::StatTimer epoch_test_timer("TestTime", kRegionName); - + float total_checked{0}, correct{0}; for (size_t epoch = 0; epoch < num_epochs; epoch++) { + total_checked = 0; + correct = 0; epoch_timer.start(); // swap to train subgraph if (config_.use_train_subgraph_ && !config_.train_minibatch_size()) { @@ -684,16 +686,30 @@ class GraphNeuralNetwork { mb_timer.stop(); const PointerWithSize batch_pred = DoInference(); - train_accuracy = GetGlobalAccuracy(batch_pred); + + if (graph_->is_using_wmd()) { + std::pair accuracy_results = + this->graph_->GetGlobalAccuracyCheckResult( + batch_pred, phase_, config_.do_sampling()); + train_accuracy = accuracy_results.first / accuracy_results.second; + correct += accuracy_results.first; + total_checked += accuracy_results.second; + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": The number of correct answers is ", correct, "/", + total_checked, "\n"); + } else { + train_accuracy = GetGlobalAccuracy(batch_pred); + galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, + ": Train accuracy/F1 micro is ", train_accuracy, + " time ", batch_timer.get(), "\n"); + } + GradientPropagation(); work_left_ += graph_->MoreTrainMinibatches(); char global_work_left = work_left_.reduce(); batch_timer.stop(); epoch_timer.stop(); - galois::gPrint("Epoch ", epoch, " Batch ", batch_num - 1, - ": Train accuracy/F1 micro is ", train_accuracy, - " time ", batch_timer.get(), "\n"); bool test_eval = config_.minibatch_test_interval_ @@ -760,6 +776,9 @@ class GraphNeuralNetwork { if (this_host == 0) { const std::string t_name_acc = "TrainEpoch" + std::to_string(epoch) + "Accuracy"; + if (config_.train_minibatch_size() && this->graph_->is_using_wmd()) { + train_accuracy = correct / total_checked; + } galois::gPrint("Epoch ", epoch, ": Train accuracy/F1 micro is ", train_accuracy, "\n"); galois::runtime::reportStat_Single(kRegionName, t_name_acc, diff --git a/libgnn/include/galois/graphs/GNNGraph.h b/libgnn/include/galois/graphs/GNNGraph.h index db5df02223..31b1fbf120 100644 --- a/libgnn/include/galois/graphs/GNNGraph.h +++ b/libgnn/include/galois/graphs/GNNGraph.h @@ -53,15 +53,15 @@ class GNNGraph { // galois::LargeArray>>; GNNGraph(const std::string& dataset_name, GNNPartitionScheme partition_scheme, - bool has_single_class_label, bool useWMD = false) + bool has_single_class_label, bool use_wmd = false) : GNNGraph(galois::default_gnn_dataset_path, dataset_name, - partition_scheme, has_single_class_label, useWMD) {} + partition_scheme, has_single_class_label, use_wmd) {} //! Loads a graph and all relevant metadata (labels, features, masks, etc.) 
GNNGraph(const std::string& input_directory, const std::string& dataset_name, GNNPartitionScheme partition_scheme, bool has_single_class_label, - bool useWMD = false) - : input_directory_(input_directory) { + bool use_wmd = false) + : input_directory_(input_directory), use_wmd_(use_wmd) { GALOIS_LOG_VERBOSE("[{}] Constructing partitioning for {}", host_id_, dataset_name); // save host id @@ -72,7 +72,7 @@ class GNNGraph { std::string("] "); // load partition partitioned_graph_ = - LoadPartition(input_directory_, dataset_name, partition_scheme, useWMD); + LoadPartition(input_directory_, dataset_name, partition_scheme); galois::gInfo(host_prefix_, "Loading partition is completed"); // reverse edges partitioned_graph_->ConstructIncomingEdges(); @@ -93,7 +93,7 @@ class GNNGraph { bitset_graph_aggregate.resize(partitioned_graph_->size()); // Construct/read additional graph data - if (useWMD) { + if (use_wmd) { galois::gInfo("Feature is constructed by aggregating 2-hop features, " "instead from feature files"); this->ConstructFeatureBy2HopAggregation(); @@ -939,7 +939,7 @@ class GNNGraph { this->Construct1HopFeatureCPU(); // this->PrintFeatures("1hop"); this->Construct2HopFeatureCPU(); - this->PrintFeatures("2hop"); + //this->PrintFeatures("2hop"); } void PrintFeatures(std::string postfix) { @@ -1178,6 +1178,16 @@ class GNNGraph { return GetGlobalAccuracyCPU(predictions, phase, sampling); } + /** + * @brief Compare predictions from a model and ground truths, and return the + * results. + */ + std::pair + GetGlobalAccuracyCheckResult(PointerWithSize predictions, + GNNPhase phase, bool sampling) { + return GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + } + std::pair GetBatchAccuracy(PointerWithSize predictions) { // check owned nodes' accuracy @@ -1622,6 +1632,9 @@ class GNNGraph { return definitely_sampled_nodes_; } + /* @brief Return true if this is constructed from a WMD graph otherwise false. */ + bool is_using_wmd() { return this->use_wmd_; } + private: // included like this to avoid cyclic dependency issues + not used anywhere but // in this class anyways @@ -1632,12 +1645,13 @@ class GNNGraph { ////////////////////////////////////////////////////////////////////////////// //! 
Partitions a particular dataset given some partitioning scheme - std::unique_ptr LoadPartition( - const std::string& input_directory, const std::string& dataset_name, - galois::graphs::GNNPartitionScheme partition_scheme, bool useWMD) { + std::unique_ptr + LoadPartition(const std::string& input_directory, + const std::string& dataset_name, + galois::graphs::GNNPartitionScheme partition_scheme) { // XXX input path std::string input_file = input_directory + dataset_name + ".csgr"; - if (useWMD) { + if (this->use_wmd_) { input_file = dataset_name; } GALOIS_LOG_VERBOSE("Partition loading: File to read is {}", input_file); @@ -1646,16 +1660,16 @@ class GNNGraph { switch (partition_scheme) { case galois::graphs::GNNPartitionScheme::kOEC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); case galois::graphs::GNNPartitionScheme::kCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); case galois::graphs::GNNPartitionScheme::kOCVC: return galois::cuspPartitionGraph( - input_file, galois::CUSP_CSR, galois::CUSP_CSR, useWMD, true, "", "", - false, 1); + input_file, galois::CUSP_CSR, galois::CUSP_CSR, this->use_wmd_, true, + "", "", false, 1); default: GALOIS_LOG_FATAL("Error: partition scheme specified is invalid"); return nullptr; @@ -1680,6 +1694,7 @@ class GNNGraph { // is better std::mutex label_class_set_mtx; std::unordered_set label_class_set; + num_label_classes_ = 0; galois::do_all(galois::iterate(size_t{0}, graph.size()), [&](size_t lid) { local_ground_truth_labels_[lid] = graph.getData(lid).type; label_class_set_mtx.lock(); @@ -2362,7 +2377,9 @@ class GNNGraph { float accuracy{0}; if (is_single_class_label()) { global_accuracy_for_singleclass_timer.start(); - accuracy = GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + auto accuracy_result = + GetGlobalAccuracyCPUSingle(predictions, phase, sampling); + accuracy = accuracy_result.first / accuracy_result.second; global_accuracy_for_singleclass_timer.stop(); } else { global_accuracy_for_multiclass_timer.start(); @@ -2373,12 +2390,30 @@ class GNNGraph { return accuracy; } - float GetGlobalAccuracyCPUSingle(PointerWithSize predictions, - GNNPhase phase, bool) { + std::pair + GetGlobalAccuracyCPUSingle(PointerWithSize predictions, + GNNPhase phase, bool) { // check owned nodes' accuracy num_correct_.reset(); total_checked_.reset(); +#if 0 + std::cout << "single accuracy print:\n"; + for (int i = *begin_owned(); i < *end_owned(); ++i) { + if (!IsValidForPhase(i, GNNPhase::kBatch)) { + continue; + } + //std::cout << subgraph_->SIDToLID(i) << ", " << galois::MaxIndex(num_label_classes_, &predictions[i * num_label_classes_]) << + std::cout << "accuracy:" << subgraph_->SIDToLID(i) << ", " << + predictions[i * num_label_classes_] << ", " << + predictions[i * num_label_classes_ + 1] << ", " << + predictions[i * num_label_classes_ + 2] << ", " << + predictions[i * num_label_classes_ + 3] << ", " << + predictions[i * num_label_classes_ + 4] << "-> " << + galois::MaxIndex(num_label_classes_, &predictions[i * num_label_classes_]) << + " vs " << GetSingleClassLabel(i) << "\n"; + } +#endif galois::do_all( // will only loop over sampled nodes if sampling is on galois::iterate(begin_owned(), end_owned()), @@ -2408,9 +2443,8 @@ 
class GNNGraph { GALOIS_LOG_DEBUG("Sub: {}, Accuracy: {} / {}", use_subgraph_, global_correct, global_checked); - - return static_cast(global_correct) / - static_cast(global_checked); + return std::make_pair(static_cast(global_correct), + static_cast(global_checked)); } float GetGlobalAccuracyCPUMulti(PointerWithSize predictions, @@ -2646,6 +2680,9 @@ class GNNGraph { std::vector node_remapping_; + // True if a WMD graph is being used otherwise false + bool use_wmd_{false}; + ////////////////////////////////////////////////////////////////////////////// // GPU things ////////////////////////////////////////////////////////////////////////////// diff --git a/libgnn/include/galois/layers/GraphConvolutionalLayer.h b/libgnn/include/galois/layers/GraphConvolutionalLayer.h index 3931ed06e1..d5259a7af9 100644 --- a/libgnn/include/galois/layers/GraphConvolutionalLayer.h +++ b/libgnn/include/galois/layers/GraphConvolutionalLayer.h @@ -139,7 +139,8 @@ class GraphConvolutionalLayer : public GNNLayer { GNNFloat* agg_data; // first, dropout if (!this->config_.disable_dropout && - (this->layer_phase_ == GNNPhase::kTrain)) { + (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch)) { this->DoDropout(input_embeddings, &p_in_temp_1_); input_data = p_in_temp_1_.data(); agg_data = p_in_temp_2_.data(); @@ -187,7 +188,8 @@ class GraphConvolutionalLayer : public GNNLayer { kRegionName); timer.start(); - assert(this->layer_phase_ == GNNPhase::kTrain); + assert(this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch); // derivative of activation if (!this->config_.disable_activation) { @@ -285,12 +287,11 @@ class GraphConvolutionalLayer : public GNNLayer { input_gradient->data(), p_out_temp_.data(), &output_column_intermediates_, true); - // done after above because input_data = p_backward_output_matrix in some - // cases; use first before overwriting here if layer # doesn't = 0, it - // means I can mess with the input data itself instad of masking the - // gradients I can mask the input if (this->layer_number_ != 0) { if (this->graph_.IsSubgraphOn()) { + // Gradients for mirror nodes should be updated by their owner + // hosts. In case of graph sampling, we should let this know whether + // a node is a sampled master or not. this->MaskInputNonMasters(&input_data, this->layer_dimensions_.input_rows, this->graph_.GetNonLayerZeroMasters()); @@ -299,6 +300,8 @@ class GraphConvolutionalLayer : public GNNLayer { this->layer_dimensions_.input_rows); } } else { + // The first layer can zerofy non-master nodes' gradients since + // it is the last gradient aggregation. // if 0 then no input to mask: mask the gradient // this is fine because gradient won't be used to get feature gradients if (this->graph_.IsSubgraphOn()) { @@ -320,6 +323,12 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif weight_gradient_timer.start(); + // p_out_temp aggregated gradients from the next layer. + // The weight gradients for this layer is calculated by + // (The current vertex embedding x p_out_temp). + // Vertex embedding dimension is (input row x input column), + // p_out_temp dimension is (input row x output column), + // and weight is (input column x output column). 
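// Illustration with hypothetical sizes: with 1024 sampled input rows, 128 input
// columns, and 16 output columns, the call below forms the (128 x 16) weight
// gradient as X^T (128 x 1024) times p_out_temp (1024 x 16).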
galois::CBlasSGEMM( CblasTrans, CblasNoTrans, this->layer_dimensions_.input_columns, this->layer_dimensions_.input_rows, @@ -382,13 +391,18 @@ class GraphConvolutionalLayer : public GNNLayer { galois::substrate::PerThreadStorage>*, bool is_backward) { galois::StatTimer aggregate_all_sync_timer("AggregateSync", kRegionName); - size_t num_nodes = (is_backward) ? this->layer_dimensions_.input_rows - : this->layer_dimensions_.output_rows; + size_t num_nodes = (is_backward) + ? this->layer_dimensions_.input_rows + // In case of minibatching or graph sampling, + // the outut row must be the samped graph's number of + // nodes of that layer. + : this->layer_dimensions_.output_rows; size_t last_master = *(this->graph_.end_owned()); assert(0 == *(this->graph_.begin_owned())); galois::do_all( + /* Either an original or a sampled graph iterator is used */ galois::iterate(*(this->graph_.begin()), num_nodes), [&](size_t src) { size_t index_to_src_feature = src * column_length; @@ -397,12 +411,13 @@ class GraphConvolutionalLayer : public GNNLayer { aggregate_output[index_to_src_feature + i] = 0; } - if (this->layer_phase_ == GNNPhase::kTrain) { + if (this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) { if (this->IsSampledLayer()) { // Check if node is part of sampled graph; ignore after // 0'ing if it is not sampled. // TODO(hc): check if SAGE also checks this - if (!this->graph_.IsInSampledGraph(src)) { + if (!this->graph_.IsInSampledGraphSubgraph(src)) { return; } } @@ -550,11 +565,14 @@ class GraphConvolutionalLayer : public GNNLayer { } else { #endif // CPU version is just a call into CBlas - galois::CBlasSGEMM(CblasNoTrans, CblasNoTrans, - this->layer_dimensions_.input_rows, - this->layer_dimensions_.input_columns, - this->layer_dimensions_.output_columns, - node_embeddings, this->layer_weights_.data(), output); + galois::CBlasSGEMM( + CblasNoTrans, CblasNoTrans, + this->layer_dimensions_.input_rows /* Graph or sampled graph nodes */, + this->layer_dimensions_.input_columns, + this->layer_dimensions_.output_columns, + node_embeddings /* input row x input columns */, + this->layer_weights_.data() /* input column x output column */, + output); #ifdef GALOIS_ENABLE_GPU } #endif diff --git a/libgnn/include/galois/layers/SoftmaxLayer.h b/libgnn/include/galois/layers/SoftmaxLayer.h index b55e37f05d..0fe3d66284 100644 --- a/libgnn/include/galois/layers/SoftmaxLayer.h +++ b/libgnn/include/galois/layers/SoftmaxLayer.h @@ -69,6 +69,7 @@ class SoftmaxLayer : public GNNLayer { // do softmax GNNSoftmax(feature_length, &input_embeddings[feature_length * i], &this->p_backward_output_matrix_[feature_length * i]); + // create ground truth vector for this LID std::vector* ground_truth_vec = ground_truth_vectors_.getLocal(); @@ -97,7 +98,6 @@ class SoftmaxLayer : public GNNLayer { galois::gPrint("Loss is ", reduced_loss / t, " ", reduced_loss, " ", t, "\n"); #endif - this->TimerStop(&Timer); return this->p_backward_output_matrix_; } @@ -127,7 +127,8 @@ class SoftmaxLayer : public GNNLayer { galois::iterate(size_t{0}, this->layer_dimensions_.input_rows), [&](const unsigned node) { if (this->IsSampledLayer()) { - if (this->layer_phase_ == GNNPhase::kTrain && + if ((this->layer_phase_ == GNNPhase::kTrain || + this->layer_phase_ == GNNPhase::kBatch) && !this->graph_.IsInSampledGraphSubgraph(node)) return; } From 5f7f4d28ada758e5919b332810b4e37e3736b8fb Mon Sep 17 00:00:00 2001 From: Ian Henriksen Date: Fri, 21 Aug 2020 15:17:23 -0500 Subject: [PATCH 613/660] Use relaxed consistency in 
atomic helpers. Remove unused Galois atomic header. Fix broken barrier in DistStats Shamelessly stolen from KatanaGraph. Fixes an issue where host 0 can stay in termination detection because it detects work after other hosts have decided that they are done with termination detection. kway fixed comment out stdout coarsening fix time sep output imbalance Fix 0-initialization of elements in a multiple_sum structure on GPU Update Refine.cpp self edge check multi edge support for tc Update README.md Small change to help compilation Added an LC_CSR graph with 64 bit node indexes Changes to support a common interface Added partial log-structure for LS_CSR with out of line objects Not ready for consumption Changes to dev Small fix so pangolin compiles Fix Catch so it compiles easily on Linux Parallel stuff for LS_LC_CSR_64_Graph Some changes for different insertions types Experimental changes for Distributed Graph Made a passable transpose function that should work for our purposes Added parallel constructor Update LS_LC_CSR_64 to be usable with distributed traingle counting feat: add setter for OffilineGraph size feat: a flag to set size on addEdgesUnSort Initial commit of instrumentation for Yineng Added a print method fix: typo feat: profile each host seperatly feat: write profile to seperate file fix: typo fix: initalize EdgeEnd on construction Added sane defaults for initilization feat: add OEC policy and make OfflineGraph virtual chore: Port over wmd graph from graph-log-sketch revert: Disable transferring node data due to serialization issues chore: clean up code Stack Sizes Made BFS graph500 compliant and added stack_capture Added StackTracer to CMakeLists Added documentation for Stack Capture Fixed CMakeLists.txt for Stack_Capture Prefix sums for thread ranges Added documentation to PrefixSum and WaterFallLock feat: make wmd a lib and add test Another small compilation bug, needed to return the right object A few changes in order to make liblonestar compile with PrefixSum Old bugfix got borked feat: support loop ranges for LS_LC_CSR_64 chore: remove unused profile code fix: resolve compile error and warning fix: resolve unused arg fix: resolve minor compile warning feat: add active threads arg to wmd graph test fix: add steal to do all in WMDGraph feat: remove unnecessary computation feat: support file striping in read graph file feat: support multithreading file reading fix: remove slow memory op fix: a bug about getline and '\n' fix: incorrect loop condition fix: remove slow reserve() Adding new Graph Part policy with less metadata memory error fix: change int typing and restyle fixed bug + improved perf feat: add testing script feat: add glbid field to vertex struct of wmd feat: print full graph in test script bug fix style: address some style issues and add comments Small fixes for compilation issues Another small compilation bug, needed to return the right object A few changes in order to make liblonestar compile with PrefixSum Old fix that got borked somehow Remove constants that didn't seemed to creep back up during merging fix: typo and TODO chores: add TODO for numa ds restyle: change the way to deal with unused var chores: add comment for perthread storage and test changes for PR bug fix chore: Run clang-format on the repo and add git hooks from gnn branch chore: Add instrument.h header file to libwmd feat: Add extensible import for multiple files and projection chore: Add instrumentation for memory accesses to import/projection bugfix: Add Divija's fix for setting edge 
destinations to local ids trying to commit fix: remove uncessary code feat: add instrument to wmd graph importer fixed all bugs (hopefully) test: update instrument feat: add a scrip to process instrument result --- .gitignore | 5 + CMakeLists.txt | 41 +- CONTRIBUTING.md | 45 + Makefile | 6 + README.md | 43 +- libcusp/include/galois/graphs/BasePolicies.h | 25 +- .../include/galois/graphs/DistributedGraph.h | 22 +- .../galois/graphs/GenericPartitioners.h | 13 +- libcusp/include/galois/graphs/NewGeneric.h | 12 + libdist/include/galois/runtime/Serialize.h | 33 + libdist/src/DistStats.cpp | 14 +- libgalois/include/galois/Atomic.h | 284 --- libgalois/include/galois/AtomicHelpers.h | 26 +- libgalois/include/galois/PrefixSum.h | 196 ++ libgalois/include/galois/WaterFallLock.h | 77 + .../include/galois/graphs/GraphHelpers.h | 38 +- .../include/galois/graphs/LC_CSR_64_Graph.h | 1027 ++++++++++ .../include/galois/graphs/LC_CSR_Graph.h | 4 +- .../galois/graphs/LS_LC_CSR_64_Graph.h | 1647 ++++++++++++++++ .../include/galois/graphs/LS_LC_CSR_Graph.h | 1113 +++++++++++ libgalois/include/galois/graphs/MorphGraph.h | 49 +- .../include/galois/graphs/OfflineGraph.h | 12 +- .../include/galois/runtime/StackTracer.h | 219 +++ .../galois/substrate/PerThreadStorage.h | 4 + libgalois/src/HWTopoLinux.cpp | 1 + libgalois/test/CMakeLists.txt | 2 + libgalois/test/prefixsum.cpp | 101 + libgalois/test/wfl.cpp | 106 + libgpu/include/internal.h | 4 +- libwmd/CMakeLists.txt | 31 + libwmd/include/galois/wmd/WMDGraph.h | 1720 +++++++++++++++++ libwmd/include/galois/wmd/WMDPartitioner.h | 931 +++++++++ libwmd/include/galois/wmd/data_types.h | 741 +++++++ libwmd/include/galois/wmd/graph.h | 190 ++ libwmd/include/galois/wmd/graphTypes.h | 75 + libwmd/include/galois/wmd/instrument.h | 190 ++ libwmd/include/galois/wmd/schema.h | 177 ++ libwmd/test/CMakeLists.txt | 12 + libwmd/test/wmd-graph-build.cpp | 125 ++ lonestar/analytics/cpu/bipart/Coarsening.cpp | 248 ++- lonestar/analytics/cpu/bipart/Refine.cpp | 50 +- lonestar/analytics/cpu/bipart/bipart.cpp | 316 +-- lonestar/analytics/cpu/bipart/bipart.h | 22 +- .../cpu/triangle-counting/Triangles.cpp | 14 +- .../analytics/distributed/bfs/bfs_push.cpp | 116 +- .../scientific/cpu/longestedge/test/catch.hpp | 1 + scripts/generate_wmdpartitioner_statstics.py | 56 + tools/graph-convert/graph-convert.cpp | 1 + 48 files changed, 9570 insertions(+), 615 deletions(-) delete mode 100644 libgalois/include/galois/Atomic.h create mode 100644 libgalois/include/galois/PrefixSum.h create mode 100644 libgalois/include/galois/WaterFallLock.h create mode 100644 libgalois/include/galois/graphs/LC_CSR_64_Graph.h create mode 100644 libgalois/include/galois/graphs/LS_LC_CSR_64_Graph.h create mode 100644 libgalois/include/galois/graphs/LS_LC_CSR_Graph.h create mode 100644 libgalois/include/galois/runtime/StackTracer.h create mode 100644 libgalois/test/prefixsum.cpp create mode 100644 libgalois/test/wfl.cpp create mode 100644 libwmd/CMakeLists.txt create mode 100644 libwmd/include/galois/wmd/WMDGraph.h create mode 100644 libwmd/include/galois/wmd/WMDPartitioner.h create mode 100644 libwmd/include/galois/wmd/data_types.h create mode 100644 libwmd/include/galois/wmd/graph.h create mode 100644 libwmd/include/galois/wmd/graphTypes.h create mode 100644 libwmd/include/galois/wmd/instrument.h create mode 100644 libwmd/include/galois/wmd/schema.h create mode 100644 libwmd/test/CMakeLists.txt create mode 100644 libwmd/test/wmd-graph-build.cpp create mode 100755 scripts/generate_wmdpartitioner_statstics.py diff --git 
a/.gitignore b/.gitignore index 8f0aff5b96..3a054d27a7 100644 --- a/.gitignore +++ b/.gitignore @@ -23,11 +23,16 @@ tags .ycm_extra_conf.py # no build files +<<<<<<< HEAD /build* /dockerbuild* +======= +/*build* +>>>>>>> 8e396b028 (Fixed CMakeLists.txt for Stack_Capture) # no python build artifacts *.pyc /python/galois.egg-info /python/galois/*.so /_skbuild + diff --git a/CMakeLists.txt b/CMakeLists.txt index 4731b8b99d..146f4adb25 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,19 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules") include(GNUInstallDirs) - +<<<<<<< HEAD +<<<<<<< HEAD +======= +if(STACK_CAPTURE) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}") +endif(STACK_CAPTURE) +>>>>>>> deb11e279 (Added StackTracer to CMakeLists) + +======= +>>>>>>> 8e396b028 (Fixed CMakeLists.txt for Stack_Capture) file(STRINGS config/version.txt GALOIS_VERSION) string(REGEX REPLACE "[ \t\n]" "" GALOIS_VERSION ${GALOIS_VERSION}) string(REGEX REPLACE "([0-9]+)\\.([0-9]+)\\.([0-9]+)" "\\1" GALOIS_VERSION_MAJOR ${GALOIS_VERSION}) @@ -23,9 +35,10 @@ endif() ###### Options (alternatively pass as options to cmake -DName=Value) ###### ###### Distributed-heterogeneous features ###### -set(GALOIS_ENABLE_DIST OFF CACHE BOOL "Enable distributed features") +set(GALOIS_ENABLE_DIST ON CACHE BOOL "Enable distributed features") set(GALOIS_CUDA_CAPABILITY "" CACHE STRING "Semi-colon list of CUDA compute capability version numbers to enable GPU features") # e.g., "3.7;6.1" set(GALOIS_COMM_STATS OFF CACHE BOOL "Report more detailed statistics of communication") +set(GALOIS_ENABLE_WMD ON CACHE BOOL "Enable WMD dataset support") ###### General features ###### set(GALOIS_ENABLE_PAPI OFF CACHE BOOL "Use PAPI counters for profiling") set(GALOIS_ENABLE_VTUNE OFF CACHE BOOL "Use VTune for profiling") @@ -51,6 +64,7 @@ set(GALOIS_NUM_TEST_GPUS "0" CACHE STRING "Number of test GPUs to use (on a sing set(GALOIS_USE_LCI OFF CACHE BOOL "Use LCI network runtime instead of MPI") set(GALOIS_USE_BARE_MPI OFF CACHE BOOL "Use MPI directly (no dedicated network-runtime thread)") set(GALOIS_NUM_TEST_THREADS "" CACHE STRING "Maximum number of threads to use when running tests (default: number of physical cores)") +set(GALOIS_ENABLE_INSTRUMENT OFF CACHE BOOL "Enable generating instrument in the runtime") if(NOT GALOIS_NUM_TEST_THREADS) cmake_host_system_information(RESULT GALOIS_NUM_TEST_THREADS QUERY NUMBER_OF_PHYSICAL_CORES) @@ -175,6 +189,10 @@ if(GALOIS_ENABLE_PAPI) add_definitions(-DGALOIS_ENABLE_PAPI) endif() +if (GALOIS_ENABLE_INSTRUMENT) + add_definitions(-DGALOIS_INSTRUMENT) +endif() + find_package(Threads REQUIRED) include(CheckMmap) @@ -233,6 +251,10 @@ if (GALOIS_ENABLE_DIST) add_subdirectory(libdist) add_subdirectory(libcusp) add_subdirectory(libgluon) + if (GALOIS_ENABLE_WMD) + find_package(MPI REQUIRED) + add_subdirectory(libwmd) + endif() endif() # TODO(loc) prefix with GALOIS @@ -248,6 +270,13 @@ if (GALOIS_ENABLE_GPU) string(REPLACE "." 
"" GENCODE ${GENCODE}) add_compile_options("$<$:-gencode=arch=compute_${GENCODE},code=sm_${GENCODE}>") endforeach() +<<<<<<< HEAD +======= + + # This is necessary to allow building for CUDA 11.x (where CUB is bundled) and earlier versions (where CUB is not included) + add_definitions(-DTHRUST_IGNORE_CUB_VERSION_CHECK) + +>>>>>>> 191e9ff91 (Fix 0-initialization of elements in a multiple_sum structure on GPU) add_subdirectory(libgpu) if (USE_DEEPGALOIS) @@ -343,3 +372,11 @@ set(CPACK_PACKAGE_VERSION_MAJOR ${GALOIS_VERSION_MAJOR}) set(CPACK_PACKAGE_VERSION_MINOR ${GALOIS_VERSION_MINOR}) set(CPACK_PACKAGE_VERSION_PATCH ${GALOIS_VERSION_PATCH}) include(CPack) + +if(STACK_CAPTURE) + message("Writing CMAKE_CXX_FLAGS") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSTACK_TRACE -finstrument-functions") + set(INSTRUMENT_EXCLUDE_FILE "${CMAKE_CURRENT_SOURCE_DIR}/libgalois/include/galois/runtime/StackTracer.h") + set(INSTRUMENT_EXCLUDE_FILE "${INSTRUMENT_EXCLUDE_FILE},/usr/include/c++/11/sstream") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -finstrument-functions-exclude-file-list=${INSTRUMENT_EXCLUDE_FILE}" CACHE STRING "CMAKE Flags" FORCE) +endif(STACK_CAPTURE) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 007227dc70..36e317c15b 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -6,12 +6,57 @@ tools like `clang-format` manually. Code should be clear and documented where needed. +<<<<<<< HEAD +<<<<<<< HEAD ## Setup Users can run `make docker-image` to setup all dependecies needed for `pando-galois`. After creating the image it can be run via `make docker`. And for first time cmake users can run `make run-cmake`. +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) +======= +# Instrumentation + +This section pertains to enabling and instrumenting memory accesses for +performance projections on the theoretical PANDO hardware. + +In order for the instrumentation code in `libwmd/include/galois/wmd/instrument.h`, +the following should be added to your top level source directory: + +```cmake +set(GALOIS_ENABLE_INSTRUMENT ON) +if (GALOIS_ENABLE_INSTRUMENT) + add_definitions(-DGALOIS_INSTRUMENT) +endif() +``` + +Here is a description of the control-flow macros used by the instrumentation +and when they should be used. 
+ +```cpp +// Should be called once at the start of the program to initialize the instrumentation +// For example specifying `GRAPH_NAME=example-graph` will result in instrumentation +// files starting with `example-graph` +I_INIT(GRAPH_NAME, HOST, NUM_HOSTS, NUM_EDGES) +// Should be called once at the end of the program to cleanup the instrumentation +I_DEINIT() +// Should be called after the first kernel measured if multiple kernels are being measured +// For example if you specified `GRAPH_NAME=example-graph` above then specifying here that +// `NAME_SUFFIX=-kernel2` will result in instrumentation files starting `example-graph-kernel2` +I_NEW_FILE(NAME_SUFFIX, NUM_EDGES) +// I_ROUND should be called at the end of a communication round to log all memory accesses +// and communication recorded into instrumentation files +// I_CLEAR should be called after I_ROUND +I_ROUND(ROUND_NUM) +I_CLEAR() +// Should be called when sending custom communication to a remote host, recommended practice +// is to just pass in the size of the SendBuffer you are using +I_LC(REMOTE_HOST, BYTES) +``` + +>>>>>>> 43672aff5 (chore: Add instrument.h header file to libwmd) ## Tools ### [asdf](https://asdf-vm.com) diff --git a/Makefile b/Makefile index df77923812..d9fc8742ba 100644 --- a/Makefile +++ b/Makefile @@ -1,9 +1,12 @@ +<<<<<<< HEAD SHELL := /bin/bash IMAGE_NAME := pando-galois VERSION := 0.0.1 CONTAINER_SRC_DIR := /pando-galois +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) dependencies: dependencies-asdf dependencies-asdf: @@ -24,6 +27,7 @@ hooks: pre-commit: @pre-commit run -a +<<<<<<< HEAD docker-image: @docker --context default build --build-arg VERSION=${VERSION} \ @@ -39,3 +43,5 @@ docker: run-cmake: @cmake -S . -B ${BUILD_DIR} -DCMAKE_BUILD_TYPE=RelWithDebInfo -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DUSE_MKL_BLAS=ON -DGALOIS_ENABLE_DIST=ON +======= +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) diff --git a/README.md b/README.md index 3375e800ee..ffda74f765 100644 --- a/README.md +++ b/README.md @@ -11,17 +11,17 @@ an implicitly parallel programming model, where the programmer replaces serial l constructs (e.g. for and while) and serial data structures in their algorithms with parallel loop constructs and concurrent data structures provided by Galois to express their algorithms. Galois is designed so that the programmer does not have to deal with low-level parallel programming constructs such as -threads, locks, barriers, condition variables, etc. +threads, locks, barriers, condition variables, etc. Highlights include: - Parallel *for_each* loop that handles dependencies between iterations, as well as dynamic work creation, and a *do_all* loop for simple parallelism. Both provide load balancing and excellent scalability on multi-socket systems - A concurrent graph library designed for graph analytics algorithms as well as - other domains such as irregular meshes. -- Scalable concurrent containers such as bag, vector, list, etc. + other domains such as irregular meshes. +- Scalable concurrent containers such as bag, vector, list, etc. -Galois is released under the BSD-3-Clause license. +Galois is released under the BSD-3-Clause license. Building Galois @@ -45,7 +45,7 @@ Dependencies Galois builds, runs, and has been tested on GNU/Linux. Even though Galois may build on systems similar to Linux, we have not tested correctness or performance, so please -beware. +beware. 
At the minimum, Galois depends on the following software: @@ -55,7 +55,7 @@ At the minimum, Galois depends on the following software: - libllvm (>= 7.0 with RTTI support) - libfmt (>= 4.0) -Here are the dependencies for the optional features: +Here are the dependencies for the optional features: - Linux HUGE_PAGES support (please see [www.kernel.org/doc/Documentation/vm/hugetlbpage.txt](https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt)). Performance will most likely degrade without HUGE_PAGES enabled. Galois uses 2MB huge page size and relies on the kernel configuration to set aside a large amount of 2MB pages. For example, our performance testing machine (4x14 cores, 192GB RAM) is configured to support up to 65536 2MB pages: @@ -70,13 +70,14 @@ Here are the dependencies for the optional features: ``` - libnuma support. Performance may degrade without it. Please install - libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. -- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files + libnuma-dev on Debian like systems, and numactl-dev on Red Hat like systems. +- Doxygen (>= 1.8.5) for compiling documentation as webpages or latex files - PAPI (>= 5.2.0.0 ) for profiling sections of code - Vtune (>= 2017 ) for profiling sections of code - MPICH2 (>= 3.2) if you are interested in building and running distributed system applications in Galois -- CUDA (>= 8.0) if you want to build GPU or distributed heterogeneous applications +- CUDA (>= 8.0 and < 11.0) if you want to build GPU or distributed heterogeneous applications. + Note that versions >= 11.0 use an incompatible CUB module and will fail to execute. - Eigen (3.3.1 works for us) for some matrix-completion app variants @@ -148,6 +149,12 @@ ctest in the build directory. +Capturing Stack Information +--------------------------- +Currently if you add `-DSTACK_CAPTURE` to your `cmake` line then you will configure stack capturing. +Please view `libgalois/include/runtime/StackTracer.h` for documentation on functions for printing and reseting. +Do not attempt to modify the capture process otherwise. + Running Galois Applications =========================== @@ -156,9 +163,9 @@ Graph Format ------------ Many Galois/Lonestar applications work with graphs. We store graphs in a binary format -called *galois graph file* +called *galois graph file* (`.gr` file extension). Other formats such as edge-list or Matrix-Market can be -converted to `.gr` format with `graph-convert` tool provided in galois. +converted to `.gr` format with `graph-convert` tool provided in galois. You can build graph-convert as follows: ```Shell @@ -168,20 +175,20 @@ make graph-convert ``` Other applications, such as Delaunay Mesh Refinement may read special file formats -or some may even generate random inputs on the fly. +or some may even generate random inputs on the fly. Running ------- All Lonestar applications take a `-t` command-line option to specify the number of threads to use. All applications run a basic sanity check (often insufficient for -correctness) on the program output, which can be turned off with the `-noverify` option. You -can specify `-help` command-line option to print all available options. +correctness) on the program output, which can be turned off with the `-noverify` option. You +can specify `-help` command-line option to print all available options. 
Upon successful completion, each application will produce some stats regarding running time of various sections, parallel loop iterations and memory usage, etc. These stats are in CSV format and can be redirected to a file using `-statFile` option. -Please refer to the manual for details on stats. +Please refer to the manual for details on stats. Running LonestarGPU applications -------------------------- @@ -199,7 +206,7 @@ Documentation ============= Galois documentation is produced using doxygen, included in this repository, which includes a tutorial, a user's -manual and API documentation for the Galois library. +manual and API documentation for the Galois library. Users can build doxygen documentation in the build directory using: @@ -215,12 +222,12 @@ See online documentation at: Source-Tree Organization ======================== -- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. +- `libgalois` contains the source code for the shared-memory Galois library, e.g., runtime, graphs, worklists, etc. - `lonestar` contains the Lonestar benchmark applications and tutorial examples for Galois - `libdist` contains the source code for the distributed-memory and heterogeneous Galois library - `lonestardist` contains the source code for the distributed-memory and heterogeneous benchmark applications. Please refer to `lonestardist/README.md` for instructions on - building and running these apps. + building and running these apps. - `tools` contains various helper programs such as graph-converter to convert between graph file formats and graph-stats to print graph properties diff --git a/libcusp/include/galois/graphs/BasePolicies.h b/libcusp/include/galois/graphs/BasePolicies.h index 446e9c7dae..d0cc16c354 100644 --- a/libcusp/include/galois/graphs/BasePolicies.h +++ b/libcusp/include/galois/graphs/BasePolicies.h @@ -42,6 +42,8 @@ class PartitioningScaffold { uint64_t _numEdges; //!< number of edges in graph //! maps from host id to nodes that host as read from disk std::vector> _gid2host; + std::vector _virtualToPhyMapping; //saving Virtual hosts to Phy hosts map + bool hash; //switch between using gid2host and VtoP maps public: /** @@ -64,6 +66,11 @@ class PartitioningScaffold { */ void saveGIDToHost(std::vector>& gid2host) { _gid2host = gid2host; + hash = false; + } + void saveGIDToHost(std::vector& virtualToPhyMapping) { + _virtualToPhyMapping = virtualToPhyMapping; + hash = true; } bool predeterminedMapping(std::vector&) { return false; } @@ -90,15 +97,19 @@ class ReadMasterAssignment : public PartitioningScaffold { * @returns Host ID of host that read the node specified by the GID. 
*/ uint32_t retrieveMaster(uint32_t gid) const { - for (auto h = 0U; h < _numHosts; ++h) { - uint64_t start, end; - std::tie(start, end) = _gid2host[h]; - if (gid >= start && gid < end) { - return h; + if(hash == false) { + for (auto h = 0U; h < _numHosts; ++h) { + uint64_t start, end; + std::tie(start, end) = _gid2host[h]; + if (gid >= start && gid < end) { + return h; + } } + assert(false); + return _numHosts; + } else { + return _virtualToPhyMapping[gid%(_virtualToPhyMapping.size())]; } - assert(false); - return _numHosts; } // below all unused if not assigning masters in default manner, but must be diff --git a/libcusp/include/galois/graphs/DistributedGraph.h b/libcusp/include/galois/graphs/DistributedGraph.h index 540b25e120..e4f38d80ea 100644 --- a/libcusp/include/galois/graphs/DistributedGraph.h +++ b/libcusp/include/galois/graphs/DistributedGraph.h @@ -30,8 +30,12 @@ #include #include +<<<<<<< HEAD #include "galois/graphs/LC_CSR_Graph.h" #include "galois/graphs/LC_CSR_CSC_Graph.h" +======= +#include "galois/graphs/LS_LC_CSR_64_Graph.h" +>>>>>>> 3945b1acc (Experimental changes for Distributed Graph) #include "galois/graphs/BufferedGraph.h" #include "galois/runtime/DistStats.h" #include "galois/graphs/OfflineGraph.h" @@ -68,9 +72,13 @@ class DistGraph { //! Graph name used for printing things constexpr static const char* const GRNAME = "dGraph"; +<<<<<<< HEAD using GraphTy = galois::graphs::LC_CSR_CSC_Graph; +======= + using GraphTy = galois::graphs::LS_LC_CSR_64_Graph; +>>>>>>> 3945b1acc (Experimental changes for Distributed Graph) // vector for determining range objects for master nodes + nodes // with edges (which includes masters) @@ -896,6 +904,11 @@ class DistGraph { return graph.edge_end(N, galois::MethodFlag::UNPROTECTED); } + /** + * Return the degree of the edge in the local graph + **/ + inline uint64_t localDegree(GraphNode N) { return graph.getDegree(N); } + /** * Returns an iterable object over the edges of a particular node in the * graph. @@ -1081,7 +1094,8 @@ class DistGraph { } else { masterRanges = galois::graphs::determineUnitRangesFromGraph( graph, galois::runtime::activeThreads, beginMaster, - beginMaster + numOwned, 0); + beginMaster + numOwned, 0, + (galois::graphs::is_LS_LC_CSR_64_Graph::value == 1)); } } @@ -1149,6 +1163,12 @@ class DistGraph { */ void edgesEqualMasters() { specificRanges[2] = specificRanges[1]; } + void recalculateG2LMap() { + for (uint64_t i = 0; i < localToGlobalVector.size(); i++) { + globalToLocalMap[localToGlobalVector[i]] = i; + } + } + public: /** * Write the local LC_CSR graph to the file on a disk. 
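The hashed master assignment added to BasePolicies.h above reduces to a modulo lookup in the virtual-to-physical host table. A small sketch of that lookup in isolation (an illustrative helper, not the actual class method):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// Illustrative stand-in for retrieveMaster() when the hash-based mapping is active:
// a global node id is mapped to a virtual host by modulo, then to its physical host.
uint32_t MasterHostOf(uint64_t gid, const std::vector<uint32_t>& virtualToPhy) {
  assert(!virtualToPhy.empty());
  return virtualToPhy[gid % virtualToPhy.size()];
}
```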
diff --git a/libcusp/include/galois/graphs/GenericPartitioners.h b/libcusp/include/galois/graphs/GenericPartitioners.h index b02d2c9594..3794d9eef1 100644 --- a/libcusp/include/galois/graphs/GenericPartitioners.h +++ b/libcusp/include/galois/graphs/GenericPartitioners.h @@ -25,8 +25,6 @@ class NoCommunication : public galois::graphs::ReadMasterAssignment { } }; -/** - */ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { public: MiningPolicyNaive(uint32_t, uint32_t numHosts, uint64_t, uint64_t, @@ -38,6 +36,17 @@ class MiningPolicyNaive : public galois::graphs::ReadMasterAssignment { bool keepEdge(uint32_t src, uint32_t dst) const { return src < dst; } }; +class OECPolicy : public galois::graphs::ReadMasterAssignment { +public: + OECPolicy(uint32_t, uint32_t numHosts, uint64_t, uint64_t, + std::vector&) + : galois::graphs::ReadMasterAssignment(0, numHosts, 0, 0) {} + + static bool needNodeDegrees() { return false; } + + bool keepEdge(uint32_t, uint32_t) const { return true; } +}; + class MiningPolicyDegrees : public galois::graphs::ReadMasterAssignment { std::vector& ndegrees; diff --git a/libcusp/include/galois/graphs/NewGeneric.h b/libcusp/include/galois/graphs/NewGeneric.h index e8d7e15d8e..d1ad172080 100644 --- a/libcusp/include/galois/graphs/NewGeneric.h +++ b/libcusp/include/galois/graphs/NewGeneric.h @@ -1654,7 +1654,19 @@ class NewDistGraphGeneric : public DistGraph { waitTime.start(); while (hostFinished.count() != base_DistGraph::numHosts || loadsClear.count() != base_DistGraph::numHosts) { +<<<<<<< HEAD // make sure all assignments are done and all loads are done +======= + // #ifndef NDEBUG + // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts to + // finish, ", + // hostFinished.count()); + // galois::gDebug("[", base_DistGraph::id, "] waiting for all hosts + // loads " + // "syncs to finish, ", loadsClear.count()); + // #endif + // make sure all assignments are done and all loads are done +>>>>>>> 5901b24b6 (chore: Run clang-format on the repo and add git hooks from gnn branch) syncAssignmentReceivesAsync(localNodeToMaster, gid2offsets, hostFinished); asyncRecvLoad(nodeLoads, edgeLoads, loadsClear); diff --git a/libdist/include/galois/runtime/Serialize.h b/libdist/include/galois/runtime/Serialize.h index a7b83174b7..6832a1afc4 100644 --- a/libdist/include/galois/runtime/Serialize.h +++ b/libdist/include/galois/runtime/Serialize.h @@ -28,7 +28,9 @@ #define GALOIS_RUNTIME_SERIALIZE_H #include +#include #include +#include #include #include #include @@ -276,12 +278,22 @@ gSizedObj(const T&, return sizeof(uintptr_t); } +<<<<<<< HEAD //! Size of BufferWrapper is size + number of things in it template inline size_t gSizedObj(const galois::BufferWrapper& data) { return sizeof(size_t) + data.size() * sizeof(T); } +======= +template +inline size_t gSizedObj(const std::unordered_map& data) { + size_t sz = 0; + for (auto i : data) + sz += gSizedObj(i.first) + gSizedObj(i.second); + return sz; +} +>>>>>>> b1a39cdd7 (bug fix) /** * Returns the size necessary for storing 2 elements of a pair into a * serialize buffer. 
@@ -447,6 +459,16 @@ inline void gSerializeObj( * @param [in,out] buf Serialize buffer to serialize into * @param [in] data Data to serialize */ +template +inline void gSerializeObj(SerializeBuffer& buf, + const std::unordered_map& data) { + uint64_t cnt = 0; + for (auto i : data) { + cnt++; + gSerialize(buf, i.first, i.second); + } +} + template inline void gSerializeObj(SerializeBuffer& buf, const T& data, @@ -794,6 +816,17 @@ void gDeserializeObj( data.deserialize(buf); } +template +void gDeserializeObj(DeSerializeBuffer& buf, std::unordered_map& data) { + while (!buf.empty()) { + std::pair i; + gDeserialize(buf, i.first, i.second); + if (buf.getOffset() > buf.size()) { + break; + } + data[i.first] = i.second; + } +} /** * Deserialize a pair from a buffer. * diff --git a/libdist/src/DistStats.cpp b/libdist/src/DistStats.cpp index e8399451f3..1fe46bc514 100644 --- a/libdist/src/DistStats.cpp +++ b/libdist/src/DistStats.cpp @@ -286,13 +286,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper(); getSystemNetworkInterface().flush(); + // work done before check + td += 1; + // barrier while (td.reduce()) { + td.reset(); if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); @@ -302,13 +307,18 @@ void DistStatManager::combineAtHost_0(void) { combineAtHost_0_helper2(); getSystemNetworkInterface().flush(); + td += 1; + // barrier while (td.reduce()) { + td.reset(); + if (getHostID() == 0) { // receive from other hosts receiveAtHost_0_helper2(); } - }; + } + // explicit barrier after logical barrier is required // as next async phase begins immediately getHostBarrier().wait(); diff --git a/libgalois/include/galois/Atomic.h b/libgalois/include/galois/Atomic.h deleted file mode 100644 index e073bf5aa7..0000000000 --- a/libgalois/include/galois/Atomic.h +++ /dev/null @@ -1,284 +0,0 @@ -/* - * This file belongs to the Galois project, a C++ library for exploiting - * parallelism. The code is being released under the terms of the 3-Clause BSD - * License (a copy is located in LICENSE.txt at the top-level directory). - * - * Copyright (C) 2018, The University of Texas at Austin. All rights reserved. - * UNIVERSITY EXPRESSLY DISCLAIMS ANY AND ALL WARRANTIES CONCERNING THIS - * SOFTWARE AND DOCUMENTATION, INCLUDING ANY WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR ANY PARTICULAR PURPOSE, NON-INFRINGEMENT AND WARRANTIES OF - * PERFORMANCE, AND ANY WARRANTY THAT MIGHT OTHERWISE ARISE FROM COURSE OF - * DEALING OR USAGE OF TRADE. NO WARRANTY IS EITHER EXPRESS OR IMPLIED WITH - * RESPECT TO THE USE OF THE SOFTWARE OR DOCUMENTATION. Under no circumstances - * shall University be liable for incidental, special, indirect, direct or - * consequential damages or loss of profits, interruption of business, or - * related expenses which may arise from use of Software or Documentation, - * including but not limited to those resulting from defects in Software and/or - * Documentation, or loss or inaccuracy of data of any kind. - */ - -#ifndef GALOIS_ATOMIC_H -#define GALOIS_ATOMIC_H - -#include - -#include "galois/config.h" -#include "galois/substrate/CacheLineStorage.h" - -namespace galois { - -namespace internal { -/** - * Common implementation. - */ -template class W, bool CONCURRENT> -class GAtomicImpl { - // galois::runtime::LL::CacheLineStorage val; - W val; - -public: - //! 
Initialize with a value - explicit GAtomicImpl(const T& i) : val(i) {} - //! default constructor - GAtomicImpl() {} - - //! atomic add and fetch - T operator+=(const T& rhs) { return __sync_add_and_fetch(&val.data, rhs); } - //! atomic sub and fetch - T operator-=(const T& rhs) { return __sync_sub_and_fetch(&(val.data), rhs); } - //! atomic increment and fetch - T operator++() { return __sync_add_and_fetch(&(val.data), 1); } - //! atomic fetch and increment - T operator++(int) { return __sync_fetch_and_add(&(val.data), 1); } - //! atomic decrement and fetch - T operator--() { return __sync_sub_and_fetch(&(val.data), 1); } - //! atomic fetch and decrement - T operator--(int) { return __sync_fetch_and_sub(&(val.data), 1); } - //! conversion operator to base data type - operator T() const { return val.data; } - //! assign from underlying type - T& operator=(const T& i) { return val.data = i; } - //! assignment operator - T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; } - //! direct compare and swap - bool cas(const T& expected, const T& updated) { - if (val.data != expected) { - return false; - } -#if defined(__INTEL_COMPILER) - return __sync_bool_compare_and_swap( - &val.data, *reinterpret_cast(&expected), - *reinterpret_cast(&updated)); -#else - return __sync_bool_compare_and_swap(&val.data, expected, updated); -#endif - } -}; - -// non-current version -template class W> -class GAtomicImpl { - // galois::runtime::LL::CacheLineStorage val; - W val; - -public: - //! Initialize with a value - explicit GAtomicImpl(const T& i) : val(i) {} - //! default constructor - GAtomicImpl() {} - - //! atomic add and fetch - T operator+=(const T& rhs) { return (val.data += rhs); } - //! atomic sub and fetch - T operator-=(const T& rhs) { return (val.data -= rhs); } - //! atomic increment and fetch - T operator++() { return ++(val.data); } - //! atomic fetch and increment - T operator++(int) { return (val.data)++; } - //! atomic decrement and fetch - T operator--() { return --(val.data); } - //! atomic fetch and decrement - T operator--(int) { return (val.data)--; } - //! conversion operator to base data type - operator T() const { return val.data; } - //! assign from underlying type - T& operator=(const T& i) { return val.data = i; } - //! assignment operator - T& operator=(const GAtomicImpl& i) { return val.data = i.val.data; } - //! direct compare and swap - bool cas(const T& expected, const T& updated) { - if (val.data != expected) { - return false; - } else { - val.data = updated; - return true; - } - } -}; - -//! Basic atomic -template class W, bool CONCURRENT> -class GAtomicBase : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - //! Initialize with a value - explicit GAtomicBase(const T& i) : Super_ty(i) {} - - //! default constructor - GAtomicBase() : Super_ty() {} - - T& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); } - - T& operator=(const T& that) { return Super_ty::operator=(that); } -}; - -//! 
Specialization for pointers -template class W, bool CONCURRENT> -class GAtomicBase : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - typedef typename std::iterator_traits::difference_type difference_type; - - GAtomicBase() : Super_ty() {} - - GAtomicBase(T* i) : Super_ty(i) {} - - T*& operator=(const GAtomicBase& that) { return Super_ty::operator=(that); } - - T*& operator=(T* that) { return Super_ty::operator=(that); } - - T* operator+=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_add_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data += rhs); - } - } - - T* operator-=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_sub_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data -= rhs); - } - } -}; - -//! Specialization for const pointers -template class W, bool CONCURRENT> -class GAtomicBase - : public GAtomicImpl { - typedef GAtomicImpl Super_ty; - -public: - typedef - typename std::iterator_traits::difference_type difference_type; - - GAtomicBase() : Super_ty() {} - - GAtomicBase(const T* i) : Super_ty(i) {} - - const T*& operator=(const GAtomicBase& that) { - return Super_ty::operator=(that); - } - - const T*& operator=(const T* that) { return Super_ty::operator=(that); } - - const T* operator+=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_add_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data += rhs); - } - } - - const T* operator-=(const difference_type& rhs) { - if (CONCURRENT) { - return __sync_sub_and_fetch(&Super_ty::val.data, rhs); - } else { - return (Super_ty::val.data -= rhs); - } - } -}; - -//! Specialization for bools -template